equal
deleted
inserted
replaced
|
1 |
|
2 import sys |
|
3 import codecs |
|
4 import io |
|
5 import regex |
|
6 |
|
# Command-line arguments: optional input path, then optional output path.
FINAME = sys.argv[1] if len(sys.argv) >= 2 else None
FONAME = sys.argv[2] if len(sys.argv) >= 3 else None

# Input stream: the named file, or standard input when no path was given
# (previously a missing argument crashed with TypeError in io.open(None)).
if FINAME is None:
    # Re-wrap the stdin descriptor so it is decoded as UTF-8 exactly like a
    # named file; closefd=False leaves the process-wide descriptor open when
    # FIN is closed.
    FIN = io.open(sys.stdin.fileno(), mode='r', buffering=1,
                  encoding="utf-8", closefd=False)
else:
    FIN = io.open(FINAME, mode='r', buffering=1, encoding="utf-8")

# Output stream: the named file (UTF-8), or standard output by default.
if FONAME is None:
    FOUT = sys.stdout
else:
    FOUT = codecs.open(FONAME, "w", "utf-8")
|
19 |
|
20 |
|
class GadictParser:
    """Collect headwords from a line-oriented text stream.

    The parser scans for a line matching ``SEPARATOR_RE``.  The line right
    after it must match ``EMPTY_RE``, and the line after that (stripped of
    surrounding whitespace) must match ``HEADWORD_RE``; that stripped line is
    recorded as a headword.  All other lines are skipped.
    """

    # Patterns are applied to lines as returned by readline(); "$" also
    # matches just before a trailing newline, so the line terminator is fine.
    SEPARATOR_RE = regex.compile(u"^__$")
    EMPTY_RE = regex.compile(u"^$")
    # A headword line starts with a Unicode letter.
    HEADWORD_RE = regex.compile(u"^(\\p{L}.*)$")

    def __init__(self, stream):
        """Remember *stream* (an object with ``readline()``) and reset the line counter."""
        self.stream = stream
        self.lineno = 0

    def _next_line(self):
        """Read one line and count it; return ``None`` once the stream is exhausted."""
        text = self.stream.readline()
        if not text:
            return None
        self.lineno += 1
        return text

    def parse(self):
        """Read the stream to its end and return the list of headwords found.

        Raises:
            Exception: if the line after a separator is not empty, or the
                line after that does not look like a headword.  The message
                carries the 1-based number of the offending line.
        """
        headwords = []
        while True:
            # Skip ahead until a separator line shows up.
            marker = self._next_line()
            if marker is None:
                break
            if self.SEPARATOR_RE.match(marker) is None:
                continue

            # The separator must be followed by an empty line ...
            blank = self._next_line()
            if blank is None:
                break
            if self.EMPTY_RE.match(blank) is None:
                raise Exception("Line {:d}: '{:s}' is not empty line\n".format(self.lineno, blank))

            # ... and then by the headword itself.
            candidate = self._next_line()
            if candidate is None:
                break
            candidate = candidate.strip()
            if self.HEADWORD_RE.match(candidate) is None:
                raise Exception("Line {:d}: '{:s}' is not a headword\n".format(self.lineno, candidate))

            headwords.append(candidate)
        return headwords
|
61 |
|
# Driver: extract every headword from FIN and write them to FOUT, one per line.
try:
    parser = GadictParser(FIN)
    for headword in parser.parse():
        FOUT.write(headword)
        FOUT.write("\n")
finally:
    # Nested so the output is still finalised even if closing the input fails.
    try:
        FIN.close()
    finally:
        if FOUT is sys.stdout:
            # sys.stdout is the interpreter's shared stream: flush it so the
            # output is delivered, but leave closing it to the interpreter.
            FOUT.flush()
        else:
            FOUT.close()
|
70 |