py/gadict_headwords.py
changeset 720 b5a4b476eddf
parent 643 c2c32f45dde6
child 757 5417f2102dc5
equal deleted inserted replaced
719:ebc16c3a9129 720:b5a4b476eddf
    20 
    20 
    21 class GadictParser:
    21 class GadictParser:
    22 
    22 
    23     SEPARATOR_RE = regex.compile(u"^__$")
    23     SEPARATOR_RE = regex.compile(u"^__$")
    24     EMPTY_RE = regex.compile( u"^$" )
    24     EMPTY_RE = regex.compile( u"^$" )
       
    25     HEADWORD_ATTR_RE = regex.compile( u"^ " )
    25     HEADWORD_RE = regex.compile( u"^(\\p{L}.*)$" )
    26     HEADWORD_RE = regex.compile( u"^(\\p{L}.*)$" )
    26 
    27 
    27     def __init__(self, stream):
    28     def __init__(self, stream):
    28         self.stream = stream
    29         self.stream = stream
    29         self.lineno = 0
    30         self.lineno = 0
    31     def parse(self):
    32     def parse(self):
    32         wlist = []
    33         wlist = []
    33         while True:
    34         while True:
    34             line = self.stream.readline()
    35             line = self.stream.readline()
    35             if len(line) == 0:
    36             if len(line) == 0:
    36                 break
    37                 return wlist
    37             self.lineno += 1
    38             self.lineno += 1
    38             m = self.SEPARATOR_RE.match(line)
    39             m = self.SEPARATOR_RE.match(line)
    39             if not m:
    40             if not m:
    40                 continue
    41                 continue
    41 
    42 
    42             line = self.stream.readline()
    43             line = self.stream.readline()
    43             if len(line) == 0:
    44             if len(line) == 0:
    44                 break
    45                 return wlist
    45             self.lineno += 1
    46             self.lineno += 1
    46             m = self.EMPTY_RE.match(line)
    47             m = self.EMPTY_RE.match(line)
    47             if not m:
    48             if not m:
    48                 raise Exception("Line {:d}: '{:s}' is not empty line\n".format(self.lineno, line))
    49                 raise Exception("Line {:d}: '{:s}' is not empty line\n".format(self.lineno, line))
    49 
    50 
    50             line = self.stream.readline()
    51             while True:
    51             if len(line) == 0:
    52                 line = self.stream.readline()
    52                 break
    53                 if len(line) == 0:
    53             line = line.strip()
    54                     return wlist
    54             self.lineno += 1
    55                 self.lineno += 1
    55             m = self.HEADWORD_RE.match(line)
    56                 m = self.HEADWORD_ATTR_RE.match(line)
    56             if not m:
    57                 if m:
    57                 raise Exception("Line {:d}: '{:s}' is not a headword\n".format(self.lineno, line))
    58                     continue
    58 
    59                 line = line.strip()
    59             wlist.append(line)
    60                 if len(line) == 0:
       
    61                     break
       
    62                 m = self.HEADWORD_RE.match(line)
       
    63                 if not m:
       
    64                     raise Exception("{:d}: '{:s}' is not a headword\n".format(self.lineno, line))
       
    65                 wlist.append(line)
    60         return wlist
    66         return wlist
    61 
    67 
    62 try:
    68 try:
    63     parser = GadictParser(FIN)
    69     parser = GadictParser(FIN)
    64     for headword in parser.parse():
    70     for headword in parser.parse():
    65         FOUT.write(headword)
    71         FOUT.write(headword)
    66         FOUT.write("\n")
    72         FOUT.write("\n")
       
    73 except Exception as ex:
       
    74     print("{}:{}".format(FINAME, str(ex)))
       
    75     raise ex
    67 finally:
    76 finally:
    68     FIN.close()
    77     FIN.close()
    69     FOUT.close()
    78     FOUT.close()
    70 
    79