py/gadict_freq.py
author Oleksandr Gavenko <gavenkoa@gmail.com>
Tue, 08 Nov 2016 19:01:27 +0200
changeset 646 2d488cfc4c0c
parent 645 6d4a074cea27
child 723 53095b480a73
permissions -rw-r--r--
Add frequency markers to dictd dictionary and Anki cards.
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
643
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     1
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     2
import sys
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     3
import codecs
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     4
import io
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     5
import regex
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     6
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     7
class WordlistParser:
    """Read a plain word list with one headword per line.

    ``stream`` must provide a ``readline()`` method that returns an empty
    value at end of input (for example a text file opened with
    ``io.open``).
    """

    def __init__(self, stream):
        self.stream = stream

    def parse(self):
        """Return the headwords found in the stream, in input order.

        Every line is stripped of leading and trailing whitespace.  Lines
        that are empty after stripping are skipped, so blank lines never
        produce an empty-string headword.  The case of each word is kept
        as it appears in the input.
        """
        wlist = []
        while True:
            line = self.stream.readline()
            if not line:
                # An empty read (not even a newline) means end of input.
                break
            word = line.strip()
            if word:
                wlist.append(word)
        return wlist
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    21
645
6d4a074cea27 Small improvements.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 643
diff changeset
    22
class HeadVarParser:
    """Parse a word list in which some lines are prefixed with a TAB.

    Lines without a leading TAB are counted against ``limit``; lines that
    start with a TAB are collected but not counted.
    NOTE(review): judging by the names, un-indented lines are base words
    and TAB-indented lines are their variants -- confirm against the data
    files.
    """

    # group(1): optional leading TAB marker; group(2): the rest of the line.
    BASEVAR_RE = regex.compile(u"^(\t)?(.*)$")

    def __init__(self, stream, limit=None):
        self.stream = stream
        # Maximum number of non-TAB lines to accept; None or 0 disables it.
        self.limit = limit
        # Number of lines read so far (used in error messages).
        self.lineno = 0
        # Number of non-TAB, non-blank lines seen so far.
        self.cnt = 0

    def parse(self):
        """Return the lower-cased headwords read from the stream.

        Reading stops at end of input, or as soon as the count of non-TAB
        lines exceeds ``limit`` (only when ``limit`` is truthy).  The line
        that trips the limit is not included in the result.

        Lines that carry no headword text (blank, or only whitespace) are
        skipped: they are neither counted against ``limit`` nor returned
        as empty-string headwords.

        ``self.lineno`` and ``self.cnt`` are not reset here, so they keep
        accumulating if ``parse()`` is called more than once.

        Raises:
            Exception: if a line does not match ``BASEVAR_RE``.
        """
        wlist = []
        while True:
            line = self.stream.readline()
            if not line:
                # An empty read (not even a newline) means end of input.
                break
            self.lineno += 1
            m = self.BASEVAR_RE.match(line)
            if not m:
                raise Exception("Line {:d}: '{:s}' wrong format\n".format(self.lineno, line.rstrip("\r\n")))
            headword = m.group(2).strip().lower()
            if not headword:
                # Nothing but whitespace: do not let it use up the limit.
                continue
            if not m.group(1):
                self.cnt += 1
            if self.limit and self.cnt > self.limit:
                break
            wlist.append(headword)
        return wlist
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    50
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    51
class FreqlistParser:
    """Parse a frequency list whose lines have the form ``NUM WORD``."""

    # group(1): decimal number; group(2): everything after the single space.
    FREQ_RE = regex.compile(u"^([0-9]+) (.*)$")

    def __init__(self, stream, limit=None):
        self.stream = stream
        # Maximum number of lines to read; None or 0 disables the limit.
        self.limit = limit
        # Number of lines read so far (used for the limit and in errors).
        self.lineno = 0

    def parse(self):
        """Return the lower-cased words read from the stream, in input order.

        Reading stops at end of input, or once ``limit`` lines have been
        read (only when ``limit`` is truthy).  The number in front of each
        word is matched but otherwise ignored.

        ``self.lineno`` is not reset here, so it keeps accumulating if
        ``parse()`` is called more than once.

        Raises:
            Exception: if a line is not in ``NUM WORD`` format.
        """
        wlist = []
        while True:
            if self.limit and self.lineno >= self.limit:
                break
            line = self.stream.readline()
            if not line:
                # An empty read (not even a newline) means end of input.
                break
            self.lineno += 1
            m = self.FREQ_RE.match(line)
            if not m:
                # Drop the line ending so the message stays on one line.
                raise Exception("Line '{:s}' #{:d} is not in NUM WORD format\n".format(line.rstrip("\r\n"), self.lineno))
            wlist.append(m.group(2).strip().lower())
        return wlist
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    75
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    76
if __name__ == '__main__':
    # Command-line entry point.
    #
    # argv[1] is a word list (one headword per line).  Every following
    # argument is a spec "[+-][NUM](b|f):FILE":
    #   +/-  add the words of FILE to the include / exclude set;
    #   NUM  optional limit handed to the parser;
    #   b/f  parse FILE with HeadVarParser / FreqlistParser.
    # A headword is printed when it contains none of " '.", is in the
    # include set (or the include set is empty) and is not in the exclude
    # set.  Comparison is done on the lower-cased headword.
    USAGE = "Usage: PROG  $WORDLIST  [+/-][NUM][b/f]:FREQLIST..."

    if len(sys.argv) < 3:
        raise Exception(USAGE)
    FINAME = sys.argv[1]

    with io.open(FINAME, mode='r', buffering=1, encoding="utf-8") as stream:
        parser = WordlistParser(stream)
        HEADWORDS = parser.parse()

    # Groups: sign, optional numeric limit, parser mode, file name.
    COMMAND_RE = regex.compile("([-+])([0-9]+)?([bf]):([^:]+)")

    IN_SET = set()
    EX_SET = set()

    for spec in sys.argv[2:]:
        m = COMMAND_RE.match(spec)
        if not m:
            raise Exception("Wrong FREQLIST spec: '{:s}'\n".format(spec) + USAGE)
        sign, limit, mode, fname = m.groups()
        if limit:
            limit = int(limit)
        with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream:
            if mode == "b":
                parser = HeadVarParser(stream, limit)
            elif mode == "f":
                parser = FreqlistParser(stream, limit)
            else:
                raise Exception("Unknown mode in specification...")
            try:
                wlist = set(parser.parse())
            except Exception:
                # Name the offending file on stderr so the diagnostic does
                # not mix with the word list written to stdout, then let
                # the original error propagate.
                sys.stderr.write("Error during processing: {:s}\n".format(fname))
                raise
        if sign == "+":
            IN_SET |= wlist
        else:
            EX_SET |= wlist

    for headword in HEADWORDS:
        if any(c in headword for c in " '."):
            continue
        normalized = headword.lower()
        if (not IN_SET or normalized in IN_SET) and normalized not in EX_SET:
            print(headword)
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   127