py/gadict_freq.py
changeset 645 6d4a074cea27
parent 643 c2c32f45dde6
child 646 2d488cfc4c0c
equal deleted inserted replaced
644:e38cd6112193 645:6d4a074cea27
    17                 break
    17                 break
    18             line = line.strip()
    18             line = line.strip()
    19             wlist.append(line)
    19             wlist.append(line)
    20         return wlist
    20         return wlist
    21 
    21 
    22 class BasewordParser:
    22 class HeadVarParser:
    23 
    23 
    24     BASEWORD_RE = regex.compile(u"^(\t)?(.*)$")
    24     BASEVAR_RE = regex.compile(u"^(\t)?(.*)$")
    25 
    25 
    26     def __init__(self, stream, limit):
    26     def __init__(self, stream, limit = None):
    27         self.stream = stream
    27         self.stream = stream
    28         self.limit = limit
    28         self.limit = limit
    29         self.lineno = 0
    29         self.lineno = 0
    30         self.cnt = 0
    30         self.cnt = 0
    31 
    31 
    34         while True:
    34         while True:
    35             line = self.stream.readline()
    35             line = self.stream.readline()
    36             if len(line) == 0:
    36             if len(line) == 0:
    37                 break
    37                 break
    38             self.lineno += 1
    38             self.lineno += 1
    39             m = self.BASEWORD_RE.match(line)
    39             m = self.BASEVAR_RE.match(line)
    40             if not m:
    40             if not m:
    41                 raise Exception("Line {:d}: '{:s}' wrong format\n".format(self.lineno, line))
    41                 raise Exception("Line {:d}: '{:s}' wrong format\n".format(self.lineno, line))
    42             tab = m.group(1)
    42             tab = m.group(1)
    43             if not tab:
    43             if not tab:
    44                 self.cnt += 1
    44                 self.cnt += 1
    50 
    50 
    51 class FreqlistParser:
    51 class FreqlistParser:
    52 
    52 
    53     FREQ_RE = regex.compile(u"^([0-9]+) (.*)$")
    53     FREQ_RE = regex.compile(u"^([0-9]+) (.*)$")
    54 
    54 
    55     def __init__(self, stream, limit):
    55     def __init__(self, stream, limit = None):
    56         self.stream = stream
    56         self.stream = stream
    57         self.limit = limit
    57         self.limit = limit
    58         self.lineno = 0
    58         self.lineno = 0
    59 
    59 
    60     def parse(self):
    60     def parse(self):
   100         mode = m.group(3)
   100         mode = m.group(3)
   101         if limit:
   101         if limit:
   102             limit = int(limit)
   102             limit = int(limit)
   103         with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream:
   103         with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream:
   104             if mode == "b":
   104             if mode == "b":
   105                 parser = BasewordParser(stream, limit)
   105                 parser = HeadVarParser(stream, limit)
   106             elif mode == "f":
   106             elif mode == "f":
   107                 parser = FreqlistParser(stream, limit)
   107                 parser = FreqlistParser(stream, limit)
   108             else:
   108             else:
   109                 raise Exception("Unknown mode in specification...")
   109                 raise Exception("Unknown mode in specification...")
   110             try:
   110             try: