|
1 |
|
2 import io |
|
3 import sys |
|
4 # import re |
|
5 import regex |
|
6 |
|
7 |
|
8 # fgadict = "gadict_en-ru+ua.gadict" |
|
# Command line: <script> GADICT_FILE [OUTPUT_FILE]
#   GADICT_FILE  dictionary source to parse (required);
#   OUTPUT_FILE  where to write the result; standard output when omitted.
fgadict = None
fnout = None
if len(sys.argv) >= 2:
    fgadict = sys.argv[1]
if len(sys.argv) >= 3:
    fnout = sys.argv[2]

if fgadict is None:
    # Without this guard io.open(None, ...) fails with an unhelpful TypeError.
    sys.stderr.write("Usage: {:s} GADICT_FILE [OUTPUT_FILE]\n".format(sys.argv[0]))
    sys.exit(2)

# Line-buffered, UTF-8 decoded input stream.
fin = io.open(fgadict, mode='r', buffering=1, encoding="utf-8")
if fnout is None:
    fout = sys.stdout
else:
    # The script writes non-ASCII characters, so the output encoding is pinned
    # to UTF-8 instead of depending on the locale default.
    fout = io.open(fnout, mode="w", encoding="utf-8")
|
21 |
|
22 |
|
class ParseException(Exception):
    """Error raised when the dictionary source does not follow the expected syntax.

    The human readable description is available as the ``msg`` attribute.
    """

    def __init__(self, msg):
        Exception.__init__(self, msg)
        self.msg = msg

    def __repr__(self):
        # repr() yields the bare message text.
        return self.msg
|
30 |
|
31 |
|
class Parser:
    """Line-oriented parser for the dictionary source format.

    The input is a prelude followed by articles. Every article starts with a
    "__" delimiter line and an empty line, continues with one or more
    headwords (each optionally followed by indented pronunciation and
    attribute lines), an empty line, and then translation senses separated
    by empty lines.

    ``parse()`` returns a list of ``(words, senses)`` tuples where

    * ``words`` maps a headword to ``(pronunciation or None, set of attributes)``;
    * ``senses`` is a list of ``(pos or None, translations, examples)`` and both
      ``translations`` and ``examples`` are lists of ``(language, text)`` pairs.
    """

    SEPARATOR_RE = regex.compile(r"^__$")
    HEADWORD_RE = regex.compile(r"^(\p{L}.*)$")
    HEADWORD_VAR_RE = regex.compile(r"^ +(s|pl|v[123]|male|female|comp|super)$")
    HEADWORD_PRON_RE = regex.compile(r"^ +\[([\p{L}' ]+)\]$")
    # The alternatives are grouped so that "^" and "$" apply to each of them.
    # Ungrouped, "^" bound only to "n" and "$" only to "abbr", so a "num" line
    # was recorded as "n" and any line merely starting with a marker passed.
    TRANSL_POS_RE = regex.compile(r"^(?:n|pron|adj|v|adv|prep|conj|num|int|phr\.v|abbr)$")
    TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(][\p{L}\p{P}~ ]*)$")
    TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> (\p{L}.*)$")

    TRAILING_SPACES_RE = regex.compile(r"\p{Z}+$")

    def __init__(self):
        # Parsing state; everything is (re)initialised by parse().
        self.stream = None
        self.line = ""
        self.lineno = 0
        self.eof = False
        self.dom = []
        self.words = None
        self.tran = None

    def readline(self):
        """Read the next line into `self.line`, updating `self.eof` and `self.lineno`."""
        self.line = self.stream.readline()
        # An empty string (not even a newline) is how readline() signals EOF.
        self.eof = len(self.line) == 0
        if not self.eof:
            self.lineno += 1

    def parse(self, stream):
        """Parse the whole `stream` and return the list of articles.

        On a syntax error a diagnostic with the file name, line number and
        offending line is written to the module-level `fout` (using the
        module-level `fgadict` as the file name) and an `Exception` wrapping
        the `ParseException` is raised.
        """
        self.lineno = 0
        self.stream = stream
        self.dom = []
        try:
            self.parse_prelude()
            while not self.eof:
                self.parse_article()
        except ParseException as ex:
            # search(), not match(): the pattern has no leading anchor, so
            # match() would only fire on lines made entirely of spaces.
            if self.TRAILING_SPACES_RE.search(self.line):
                fout.write("{:s}:{:d}: {:s}".format(fgadict, self.lineno, "Traling spaces detected...\n"))
            fout.write("{:s}:{:d}: {:s}\nLINE: {:s}\n".format(fgadict, self.lineno, str(ex), self.line))
            raise Exception(ex)
        return self.dom

    def parse_prelude(self):
        """Read dictionary prelude until first "__" delimiter."""
        while True:
            self.readline()
            if self.eof:
                raise ParseException("There are no articles...")
            if self.SEPARATOR_RE.match(self.line):
                break

    def parse_article(self):
        """Try to match article until next "__" delimiter. Assume that `self.line` point to "__" delimiter."""
        self.words = None
        self.tran = None
        self.parse_empty_line()
        self.parse_headlines()
        self.parse_translation()
        self.dom.append((self.words, self.tran))

    def parse_empty_line(self):
        """Consume the line after the "__" delimiter and require it to be empty."""
        self.readline()
        # A line of length 1 holds only the newline character.
        if self.eof or len(self.line) != 1:
            raise ParseException(""""__" delimiter should followed by empty line...""")

    def parse_headlines(self):
        """Try to match word variations with attributes. Assume that `self.line` on preceding empty line.

        Fills `self.words` with `headword -> (pronunciation, attributes)` and
        stops at the first empty line or at EOF.
        """
        self.words = {}
        self.readline()
        if self.eof:
            raise ParseException("""There are no definition after "__" delimiter...""")
        m = self.HEADWORD_RE.match(self.line)
        if m is None:
            raise ParseException("""There are no headword after "__" delimiter...""")
        word = m.group(1)
        pron = None
        attrs = set()
        while True:
            self.readline()
            if self.eof or len(self.line) == 1:
                break
            m = self.HEADWORD_RE.match(self.line)
            if m is not None:
                # A new headword starts: store the one collected so far.
                self.words[word] = (pron, attrs)
                word = m.group(1)
                pron = None
                attrs = set()
                continue
            m = self.HEADWORD_PRON_RE.match(self.line)
            if m is not None:
                if pron is not None:
                    raise ParseException("""Pronunciation is redefined...""")
                pron = m.group(1)
                continue
            m = self.HEADWORD_VAR_RE.match(self.line)
            if m is not None:
                attrs.add(m.group(1))
                continue
            raise ParseException("""Line is not headword or translation or headword attribute...""")
        # Store the last headword of the article.
        self.words[word] = (pron, attrs)

    def parse_translation(self):
        """Collect translation senses until the next "__" delimiter or EOF.

        Senses are separated by empty lines; the result is stored in
        `self.tran` as a list of `(pos, translations, examples)` tuples.
        """
        senses = []
        pos = None
        tr = []
        ex = []
        while True:
            self.readline()
            if self.eof:
                break
            if self.SEPARATOR_RE.match(self.line) is not None:
                break
            if len(self.line) == 1:
                # Empty line closes the current sense.
                senses.append((pos, tr, ex))
                pos = None
                tr = []
                ex = []
                continue
            m = self.TRANSL_POS_RE.match(self.line)
            if m is not None:
                if pos is not None:
                    raise ParseException("""Each translation should have only one part of speech marker...""")
                pos = m.group(0)
                continue
            m = self.TRANSL_RE.match(self.line)
            if m is not None:
                tr.append((m.group(1), m.group(2)))
                continue
            m = self.TRANSL_EX_RE.match(self.line)
            if m is not None:
                ex.append((m.group(1), m.group(2)))
                continue
            raise ParseException("""Uknown syntax...""")
        # A sense not terminated by an empty line is kept only when it has translations.
        if tr:
            senses.append((pos, tr, ex))
        self.tran = senses
|
165 |
|
# Parse the whole input, then dump every article in a plain-text layout:
# a "_____" separator, the joined headwords, each headword with its
# pronunciation and attributes, and one line per translation sense.
parser = Parser()
dom = parser.parse(fin)
fin.close()

# NOTE(review): the first parsed article (index 0) is skipped, exactly as
# before; presumably it is a service entry - confirm against the input format.
for words, senses in dom[1:]:
    fout.write("_____\n\n")
    fout.write("; ".join(words.keys()))
    fout.write("\n\n")
    for (word, (pron, attrs)) in words.items():
        if word == "approach":
            # NOTE(review): looks like a leftover debugging dump; kept so the
            # output stays unchanged - confirm and remove.
            fout.write(str(words))
        fout.write(" ")
        fout.write(word)
        fout.write("\n")
        if pron is not None:
            fout.write(" [")
            fout.write(pron)
            fout.write("]\n")
        if attrs:
            fout.write(" ")
            # Sorted so that the attribute order does not depend on set ordering.
            fout.write(", ".join(sorted(attrs)))
            fout.write("\n")
    fout.write("\n")
    for (pos, trs, _examples) in senses:
        fout.write(" ")
        if pos is not None:
            fout.write("⟨")
            fout.write(pos)
            fout.write("⟩ ")
        # Only the first "ru" translation of a sense is written.
        for (lang, tr) in trs:
            if lang == "ru":
                fout.write(tr)
                break
        fout.write("\n")

# Close the output file so buffered data is flushed; leave stdout alone.
if fout is not sys.stdout:
    fout.close()
|
206 |