py/gadict_freq.py
changeset 643 c2c32f45dde6
child 645 6d4a074cea27
equal deleted inserted replaced
642:c1032aea6265 643:c2c32f45dde6
       
     1 
       
import codecs
import io
import re
import sys

import regex
       
     6 
       
     7 class WordlistParser:
       
     8 
       
     9     def __init__(self, stream):
       
    10         self.stream = stream
       
    11 
       
    12     def parse(self):
       
    13         wlist = []
       
    14         while True:
       
    15             line = self.stream.readline()
       
    16             if len(line) == 0:
       
    17                 break
       
    18             line = line.strip()
       
    19             wlist.append(line)
       
    20         return wlist
       
    21 
       
    22 class BasewordParser:
       
    23 
       
    24     BASEWORD_RE = regex.compile(u"^(\t)?(.*)$")
       
    25 
       
    26     def __init__(self, stream, limit):
       
    27         self.stream = stream
       
    28         self.limit = limit
       
    29         self.lineno = 0
       
    30         self.cnt = 0
       
    31 
       
    32     def parse(self):
       
    33         wlist = []
       
    34         while True:
       
    35             line = self.stream.readline()
       
    36             if len(line) == 0:
       
    37                 break
       
    38             self.lineno += 1
       
    39             m = self.BASEWORD_RE.match(line)
       
    40             if not m:
       
    41                 raise Exception("Line {:d}: '{:s}' wrong format\n".format(self.lineno, line))
       
    42             tab = m.group(1)
       
    43             if not tab:
       
    44                 self.cnt += 1
       
    45             if self.limit and self.cnt > self.limit:
       
    46                 break
       
    47             headword = m.group(2).strip().lower()
       
    48             wlist.append(headword)
       
    49         return wlist
       
    50 
       
    51 class FreqlistParser:
       
    52 
       
    53     FREQ_RE = regex.compile(u"^([0-9]+) (.*)$")
       
    54 
       
    55     def __init__(self, stream, limit):
       
    56         self.stream = stream
       
    57         self.limit = limit
       
    58         self.lineno = 0
       
    59 
       
    60     def parse(self):
       
    61         wlist = []
       
    62         while True:
       
    63             if self.limit and self.lineno >= self.limit:
       
    64                 break
       
    65             line = self.stream.readline()
       
    66             if len(line) == 0:
       
    67                 break
       
    68             self.lineno += 1
       
    69             m = self.FREQ_RE.match(line)
       
    70             if not m:
       
    71                 raise Exception("Line {:d} is not in NUM WORD format\n".format(self.lineno, line))
       
    72             headword = m.group(2).strip().lower()
       
    73             wlist.append(headword)
       
    74         return wlist
       
    75 
       
    76 if __name__ == '__main__':
       
    77     USAGE = "Usage: PROG  $WORDLIST  [+/-][NUM][b/f]:FREQLIST..."
       
    78 
       
    79     if len(sys.argv) < 3:
       
    80         raise Exception(USAGE)
       
    81     FINAME = sys.argv[1]
       
    82 
       
    83     with io.open(FINAME, mode='r', buffering=1, encoding="utf-8") as stream:
       
    84         parser = WordlistParser(stream)
       
    85         HEADWORDS = parser.parse()
       
    86 
       
    87     COMMAND_RE = regex.compile("([-+])([0-9]+)?([bf]):([^:]+)")
       
    88 
       
    89 
       
    90     IN_SET = set()
       
    91     EX_SET = set()
       
    92 
       
    93     for idx in range(2, len(sys.argv)):
       
    94         spec = sys.argv[idx]
       
    95         m = COMMAND_RE.match(spec)
       
    96         if not m:
       
    97             raise Exception("Wrong FREQLIST spec: '{:s}'\n".format(spec) + USAGE)
       
    98         fname = m.group(4)
       
    99         limit = m.group(2)
       
   100         mode = m.group(3)
       
   101         if limit:
       
   102             limit = int(limit)
       
   103         with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream:
       
   104             if mode == "b":
       
   105                 parser = BasewordParser(stream, limit)
       
   106             elif mode == "f":
       
   107                 parser = FreqlistParser(stream, limit)
       
   108             else:
       
   109                 raise Expection("Unknown mode in specification...")
       
   110             try:
       
   111                 wlist = parser.parse()
       
   112             except:
       
   113                 print("Error during processing: {:s}".format(fname))
       
   114                 raise
       
   115         wlist = set(wlist)
       
   116         if m.group(1) == "+":
       
   117             IN_SET |= wlist
       
   118         else:
       
   119             EX_SET |= wlist
       
   120 
       
   121     for headword in HEADWORDS:
       
   122         if any(c in headword for c in " '."):
       
   123             continue
       
   124         normilized= headword.lower()
       
   125         if (len(IN_SET) == 0 or normilized in IN_SET) and not normilized in EX_SET:
       
   126             print(headword)
       
   127