equal
deleted
inserted
replaced
20 |
20 |
21 class GadictParser: |
21 class GadictParser: |
22 |
22 |
23 SEPARATOR_RE = regex.compile(u"^__$") |
23 SEPARATOR_RE = regex.compile(u"^__$") |
24 EMPTY_RE = regex.compile( u"^$" ) |
24 EMPTY_RE = regex.compile( u"^$" ) |
|
25 HEADWORD_ATTR_RE = regex.compile( u"^ " ) |
25 HEADWORD_RE = regex.compile( u"^(\\p{L}.*)$" ) |
26 HEADWORD_RE = regex.compile( u"^(\\p{L}.*)$" ) |
26 |
27 |
27 def __init__(self, stream): |
28 def __init__(self, stream): |
28 self.stream = stream |
29 self.stream = stream |
29 self.lineno = 0 |
30 self.lineno = 0 |
31 def parse(self): |
32 def parse(self): |
32 wlist = [] |
33 wlist = [] |
33 while True: |
34 while True: |
34 line = self.stream.readline() |
35 line = self.stream.readline() |
35 if len(line) == 0: |
36 if len(line) == 0: |
36 break |
37 return wlist |
37 self.lineno += 1 |
38 self.lineno += 1 |
38 m = self.SEPARATOR_RE.match(line) |
39 m = self.SEPARATOR_RE.match(line) |
39 if not m: |
40 if not m: |
40 continue |
41 continue |
41 |
42 |
42 line = self.stream.readline() |
43 line = self.stream.readline() |
43 if len(line) == 0: |
44 if len(line) == 0: |
44 break |
45 return wlist |
45 self.lineno += 1 |
46 self.lineno += 1 |
46 m = self.EMPTY_RE.match(line) |
47 m = self.EMPTY_RE.match(line) |
47 if not m: |
48 if not m: |
48 raise Exception("Line {:d}: '{:s}' is not empty line\n".format(self.lineno, line)) |
49 raise Exception("Line {:d}: '{:s}' is not empty line\n".format(self.lineno, line)) |
49 |
50 |
50 line = self.stream.readline() |
51 while True: |
51 if len(line) == 0: |
52 line = self.stream.readline() |
52 break |
53 if len(line) == 0: |
53 line = line.strip() |
54 return wlist |
54 self.lineno += 1 |
55 self.lineno += 1 |
55 m = self.HEADWORD_RE.match(line) |
56 m = self.HEADWORD_ATTR_RE.match(line) |
56 if not m: |
57 if m: |
57 raise Exception("Line {:d}: '{:s}' is not a headword\n".format(self.lineno, line)) |
58 continue |
58 |
59 line = line.strip() |
59 wlist.append(line) |
60 if len(line) == 0: |
|
61 break |
|
62 m = self.HEADWORD_RE.match(line) |
|
63 if not m: |
|
64 raise Exception("{:d}: '{:s}' is not a headword\n".format(self.lineno, line)) |
|
65 wlist.append(line) |
60 return wlist |
66 return wlist |
61 |
67 |
62 try: |
68 try: |
63 parser = GadictParser(FIN) |
69 parser = GadictParser(FIN) |
64 for headword in parser.parse(): |
70 for headword in parser.parse(): |
65 FOUT.write(headword) |
71 FOUT.write(headword) |
66 FOUT.write("\n") |
72 FOUT.write("\n") |
|
73 except Exception as ex: |
|
74 print("{}:{}".format(FINAME, str(ex))) |
|
75 raise ex |
67 finally: |
76 finally: |
68 FIN.close() |
77 FIN.close() |
69 FOUT.close() |
78 FOUT.close() |
70 |
79 |