author | Oleksandr Gavenko <gavenkoa@gmail.com> |
Tue, 06 Mar 2018 11:52:32 +0200 | |
changeset 1019 | 33ad36183b9e |
parent 757 | 5417f2102dc5 |
permissions | -rw-r--r-- |
643
c2c32f45dde6
Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
1 |
|
c2c32f45dde6
Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
2 |
import sys |
c2c32f45dde6
Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
3 |
import codecs |
c2c32f45dde6
Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
4 |
import io |
757
5417f2102dc5
Switch to built-in `re` Python module over `regex`.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
720
diff
changeset
|
5 |
import re |
643
c2c32f45dde6
Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
6 |
|
c2c32f45dde6
Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
7 |
# Command line: argv[1] is the input path, optional argv[2] is the output path.
# A missing argument leaves the corresponding name set to None; with no input
# path the io.open() call below fails.
FINAME = sys.argv[1] if len(sys.argv) >= 2 else None
FONAME = sys.argv[2] if len(sys.argv) >= 3 else None

# Input is opened as line-buffered UTF-8 text.
FIN = io.open(FINAME, mode='r', buffering=1, encoding="utf-8")
# Without an output path the result goes to standard output.
FOUT = sys.stdout if FONAME is None else codecs.open(FONAME, "w", "utf-8")
c2c32f45dde6
Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
19 |
|
c2c32f45dde6
Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
20 |
|
c2c32f45dde6
Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
21 |
class GadictParser:
    """Collect headwords from a gadict source stream.

    The stream is scanned for a separator line (``__``). The line right after
    the separator must be empty. Every following line that does not start
    with a space is taken as a headword, until a blank line ends the group;
    scanning then resumes with the search for the next separator.
    """

    SEPARATOR_RE = re.compile(u"^__$", re.UNICODE)
    EMPTY_RE = re.compile( u"^$" )
    HEADWORD_ATTR_RE = re.compile( u"^ " )
    HEADWORD_RE = re.compile(u"^(\\w.*)$", re.UNICODE)

    def __init__(self, stream):
        """Remember the stream to read from; no line has been consumed yet."""
        self.lineno = 0
        self.stream = stream

    def _next_line(self):
        """Return the next line and count it, or None once input is exhausted."""
        raw = self.stream.readline()
        if raw:
            self.lineno += 1
            return raw
        return None

    def _read_headwords(self, sink):
        """Append one group of headwords to ``sink``.

        Returns True when a blank line closed the group and False when the
        input ended first. Raises Exception for a line that is neither an
        attribute line (leading space) nor a valid headword.
        """
        for raw in iter(self._next_line, None):
            if self.HEADWORD_ATTR_RE.match(raw):
                # Attribute lines belong to the previous headword; skip them.
                continue
            word = raw.strip()
            if not word:
                return True
            if not self.HEADWORD_RE.match(word):
                raise Exception("{:d}: '{:s}' is not a headword\n".format(self.lineno, word))
            sink.append(word)
        return False

    def parse(self):
        """Read the whole stream and return the list of headwords found.

        Raises Exception when the line after a separator is not empty or when
        a headword line does not look like a headword.
        """
        found = []
        for raw in iter(self._next_line, None):
            if not self.SEPARATOR_RE.match(raw):
                continue
            follower = self._next_line()
            if follower is None:
                break
            if not self.EMPTY_RE.match(follower):
                raise Exception("Line {:d}: '{:s}' is not empty line\n".format(self.lineno, follower))
            if not self._read_headwords(found):
                break
        return found
c2c32f45dde6
Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
67 |
|
c2c32f45dde6
Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
68 |
# Driver: parse the input and write one headword per line to the output.
try:
    parser = GadictParser(FIN)
    for headword in parser.parse():
        FOUT.write(headword)
        FOUT.write("\n")
except Exception as ex:
    # Report on stderr so the diagnostic never mixes with the word list,
    # which goes to stdout when no output file was given.
    sys.stderr.write("{}:{}\n".format(FINAME, str(ex)))
    # Bare raise keeps the original traceback intact.
    raise
finally:
    FIN.close()
    # Only close what this script opened; leave the interpreter's stdout alone.
    if FOUT is not sys.stdout:
        FOUT.close()
c2c32f45dde6
Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
79 |