py/gadict_headwords.py
changeset 643 c2c32f45dde6
child 720 b5a4b476eddf
equal deleted inserted replaced
642:c1032aea6265 643:c2c32f45dde6
       
     1 
       
     2 import sys
       
     3 import codecs
       
     4 import io
       
     5 import regex
       
     6 
       
     7 FINAME = None
       
     8 FONAME = None
       
     9 if len(sys.argv) >= 2:
       
    10     FINAME = sys.argv[1]
       
    11 if len(sys.argv) >= 3:
       
    12     FONAME = sys.argv[2]
       
    13 
       
    14 FIN = io.open(FINAME, mode='r', buffering=1, encoding="utf-8")
       
    15 if FONAME is None:
       
    16     FOUT = sys.stdout
       
    17 else:
       
    18     FOUT = codecs.open(FONAME, "w", "utf-8")
       
    19 
       
    20 
       
    21 class GadictParser:
       
    22 
       
    23     SEPARATOR_RE = regex.compile(u"^__$")
       
    24     EMPTY_RE = regex.compile( u"^$" )
       
    25     HEADWORD_RE = regex.compile( u"^(\\p{L}.*)$" )
       
    26 
       
    27     def __init__(self, stream):
       
    28         self.stream = stream
       
    29         self.lineno = 0
       
    30 
       
    31     def parse(self):
       
    32         wlist = []
       
    33         while True:
       
    34             line = self.stream.readline()
       
    35             if len(line) == 0:
       
    36                 break
       
    37             self.lineno += 1
       
    38             m = self.SEPARATOR_RE.match(line)
       
    39             if not m:
       
    40                 continue
       
    41 
       
    42             line = self.stream.readline()
       
    43             if len(line) == 0:
       
    44                 break
       
    45             self.lineno += 1
       
    46             m = self.EMPTY_RE.match(line)
       
    47             if not m:
       
    48                 raise Exception("Line {:d}: '{:s}' is not empty line\n".format(self.lineno, line))
       
    49 
       
    50             line = self.stream.readline()
       
    51             if len(line) == 0:
       
    52                 break
       
    53             line = line.strip()
       
    54             self.lineno += 1
       
    55             m = self.HEADWORD_RE.match(line)
       
    56             if not m:
       
    57                 raise Exception("Line {:d}: '{:s}' is not a headword\n".format(self.lineno, line))
       
    58 
       
    59             wlist.append(line)
       
    60         return wlist
       
    61 
       
    62 try:
       
    63     parser = GadictParser(FIN)
       
    64     for headword in parser.parse():
       
    65         FOUT.write(headword)
       
    66         FOUT.write("\n")
       
    67 finally:
       
    68     FIN.close()
       
    69     FOUT.close()
       
    70