py/gadict_freq.py
author Oleksandr Gavenko <gavenkoa@gmail.com>
Sun, 19 Mar 2017 10:16:43 +0200
changeset 804 7aa283584ee6
parent 757 5417f2102dc5
child 1004 181301cc2c0c
permissions -rw-r--r--
Skip tabs for proper statistic calculation.
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
643
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     1
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     2
import sys
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     3
import codecs
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     4
import io
757
5417f2102dc5 Switch to built-in `re` Python module over `regex`.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 723
diff changeset
     5
import re
643
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     6
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     7
class WordlistParser:
    """Reader for a plain word list stored one entry per line."""

    def __init__(self, stream):
        # Any object offering ``readline()``; parse() consumes it to the end.
        self.stream = stream

    def parse(self):
        """Return every remaining line of the stream, stripped, in order.

        Reading stops when ``readline()`` returns an empty result (end of
        stream).  A blank line still carries its newline, so it does not
        end the loop; it is kept as an empty string in the result.
        """
        entries = []
        raw = self.stream.readline()
        while len(raw) != 0:
            entries.append(raw.strip())
            raw = self.stream.readline()
        return entries
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    21
723
53095b480a73 Treat first argument in format like the rest arguments. Targets to print
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 646
diff changeset
    22
class WordformParser:
    """Collect the untabbed lines of a stream as lower-cased headwords.

    Each line is matched against ``BASEVAR_RE``.  Lines that start with a
    TAB character are skipped; every other line contributes its text,
    stripped and lower-cased.
    """

    # Group 1: optional leading TAB.  Group 2: the rest of the line.
    BASEVAR_RE = re.compile(u"^(\t)?(.*)$", re.UNICODE)

    def __init__(self, stream, limit = None):
        self.stream = stream
        # Maximum number of untabbed lines to collect; a falsy value
        # (None or 0) disables the cap.
        self.limit = limit
        # Lines read so far / untabbed lines seen so far.  Both keep
        # growing across repeated parse() calls on the same instance.
        self.lineno = 0
        self.cnt = 0

    def parse(self):
        """Read the stream and return the list of headwords.

        Stops at end of stream, or as soon as the count of untabbed lines
        exceeds ``limit``; the line that trips the limit is consumed but
        not returned.

        Raises:
            Exception: if a line does not match ``BASEVAR_RE``.
        """
        headwords = []
        raw = self.stream.readline()
        while len(raw) != 0:
            self.lineno += 1
            found = self.BASEVAR_RE.match(raw)
            if found is None:
                raise Exception("Line {:d}: '{:s}' wrong format\n".format(self.lineno, raw))
            if not found.group(1):
                # No leading TAB: this line counts towards the limit.
                self.cnt += 1
                if self.limit and self.cnt > self.limit:
                    break
                headwords.append(found.group(2).strip().lower())
            raw = self.stream.readline()
        return headwords
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    51
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    52
class FreqlistParser:
    """Extract the words from a frequency list of ``NUM WORD`` lines."""

    # Group 1: the leading number.  Group 2: everything after the single
    # separating space.
    FREQ_RE = re.compile(u"^([0-9]+) (.*)$", re.UNICODE)

    def __init__(self, stream, limit = None):
        self.stream = stream
        # Maximum number of lines to read; a falsy value (None or 0)
        # disables the cap.
        self.limit = limit
        # Lines read so far; keeps growing across repeated parse() calls.
        self.lineno = 0

    def parse(self):
        """Read the stream and return the words, stripped and lower-cased.

        Stops at end of stream or once ``limit`` lines have been read.

        Raises:
            Exception: if a line does not match ``FREQ_RE``.
        """
        words = []
        while not (self.limit and self.lineno >= self.limit):
            raw = self.stream.readline()
            if len(raw) == 0:
                break
            self.lineno += 1
            hit = self.FREQ_RE.match(raw)
            if hit is None:
                raise Exception("Line '{:s}' #{:d} is not in NUM WORD format\n".format(raw, self.lineno))
            words.append(hit.group(2).strip().lower())
        return words
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    76
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    77
if __name__ == '__main__':
    # Command-line driver: print every word that appears in at least one
    # "+" list and in no "-" list.
    #
    # Every argument, the first one included, is a list specification
    # made of a sign, an optional entry limit, a format letter
    # ("b" -> WordformParser, "f" -> FreqlistParser) and a file name.
    USAGE = "Usage: PROG  [+/-][NUM][b/f]:FREQLIST..."

    if len(sys.argv) < 3:
        raise Exception(USAGE)

    COMMAND_RE = re.compile("([-+])([0-9]+)?([bf]):([^:]+)")

    IN_SET = set()  # words gathered from "+" specifications
    EX_SET = set()  # words gathered from "-" specifications

    for spec in sys.argv[1:]:
        m = COMMAND_RE.match(spec)
        if not m:
            raise Exception("Wrong FREQLIST spec: '{:s}'\n".format(spec) + USAGE)
        sign, limit, mode, fname = m.groups()
        if limit:
            limit = int(limit)
        with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream:
            if mode == "b":
                parser = WordformParser(stream, limit)
            elif mode == "f":
                parser = FreqlistParser(stream, limit)
            else:
                # COMMAND_RE only admits "b" or "f"; kept as a guard in
                # case the pattern is extended without a matching parser.
                raise Exception("Unknown mode in specification...")
            try:
                wlist = parser.parse()
            except Exception:
                # Name the offending file, then let the original error
                # propagate.  The note goes to stderr because stdout
                # carries the resulting word list.
                sys.stderr.write("Error during processing: {:s}\n".format(fname))
                raise
        wlist = {w.lower() for w in wlist}
        if sign == "+":
            IN_SET |= wlist
        else:
            EX_SET |= wlist

    # Set iteration order is arbitrary, so the output order is not stable.
    for headword in IN_SET - EX_SET:
        # Disabled filter for multi-word or punctuated entries:
        # if any(c in headword for c in " '."):
        #     continue
        print(headword)
643
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   121