28 elif self.line is None: |
28 elif self.line is None: |
29 return ":{:d}:{:s}".format(self.lineno, self.msg) |
29 return ":{:d}:{:s}".format(self.lineno, self.msg) |
30 else: |
30 else: |
31 return ":{:d}: {:s}\nLINE: {:s}".format(self.lineno, self.msg, self.line) |
31 return ":{:d}: {:s}\nLINE: {:s}".format(self.lineno, self.msg, self.line) |
32 |
32 |
|
class Sense:
    """One sense of a headword: a part of speech plus the data attached to it.

    Attributes:
        pos: part-of-speech marker; must be truthy.
        tr_list: translations; always a list (a fresh one when none is given).
        ex_list: examples, stored as passed (``None`` by default).
        syn_list: synonyms, stored as passed (``None`` by default).
        ant_list: antonyms, stored as passed (``None`` by default).
        topic_list: topics, stored as passed (``None`` by default).

    The optional lists are created on the first ``add_*`` call, so a sense
    without examples/synonyms/antonyms/topics keeps ``None`` in that slot.
    """

    def __init__(self, pos, tr_list=None, ex_list=None, syn_list=None, ant_list=None, topic_list=None):
        """Create a sense for part of speech *pos*.

        Raises:
            ParseException: if *pos* is empty or ``None``.
        """
        if not pos:
            raise ParseException("Part of speech expected...\n")
        self.pos = pos
        # A falsy tr_list (None or empty) is replaced by a new private list.
        self.tr_list = tr_list if tr_list else []
        self.ex_list = ex_list
        self.syn_list = syn_list
        self.ant_list = ant_list
        self.topic_list = topic_list

    def __repr__(self):
        return (
            "Sense(pos={!r}, tr_list={!r}, ex_list={!r}, "
            "syn_list={!r}, ant_list={!r}, topic_list={!r})"
        ).format(
            self.pos,
            self.tr_list,
            self.ex_list,
            self.syn_list,
            self.ant_list,
            self.topic_list,
        )

    def add_tr(self, tr):
        """Append translation *tr* to ``tr_list``."""
        self.tr_list.append(tr)

    def add_ex(self, ex):
        """Append example *ex*, creating ``ex_list`` if it is unset or empty."""
        if not self.ex_list:
            self.ex_list = [ex]
        else:
            self.ex_list.append(ex)

    def add_syn(self, syn):
        """Append synonym *syn*, creating ``syn_list`` if it is unset or empty."""
        if not self.syn_list:
            self.syn_list = [syn]
        else:
            self.syn_list.append(syn)

    def add_ant(self, ant):
        """Append antonym *ant*, creating ``ant_list`` if it is unset or empty."""
        if not self.ant_list:
            self.ant_list = [ant]
        else:
            self.ant_list.append(ant)

    def add_topic(self, topic):
        """Append *topic*, creating ``topic_list`` if it is unset or empty."""
        if not self.topic_list:
            self.topic_list = [topic]
        else:
            self.topic_list.append(topic)
33 |
74 |
34 class Parser: |
75 class Parser: |
35 """gadict dictionary format parser.""" |
76 """gadict dictionary format parser.""" |
36 |
77 |
# Line-classification patterns used by Parser (class-level attributes).
# All of them are applied with ``match()``, i.e. from the start of the line.

# Comment line: "# " at the start.
COMMENT_RE = regex.compile(r"^# ")

# Article separator: a line consisting of exactly "__".
SEPARATOR_RE = regex.compile(r"^__$")
# Headword line: starts with a letter.
HEADWORD_RE = regex.compile(r"^(\p{L}.*)$")
# Indented headword attribute (number, verb form, gender, degree, dialect...).
HEADWORD_VAR_RE = regex.compile(r"^ +(s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$")
# Indented pronunciation in square brackets.
HEADWORD_PRON_RE = regex.compile(r"^ +\[([\p{L}' ]+)\]$")
# Part-of-speech marker.  The alternatives are grouped so that "^" and "$"
# apply to every one of them; ungrouped, "^n" matched any line starting with
# "n" (so "num" was read as "n") and "phr" shadowed "phr.v".
TRANSL_POS_RE = regex.compile(r"^(?:n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\.v|abbr|prefix)$")
# Translation line: "<lang>: text".
TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(].*)$")
# Usage example line: "<lang>> text".
TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> ([-\p{L}].*)$")
# Per-sense list lines; group 1 holds the raw list text.
TOPIC_RE = regex.compile(r"^topic: (\p{L}.*)$")
SYN_RE = regex.compile(r"^syn: (\p{L}.*)$")
ANT_RE = regex.compile(r"^ant: (\p{L}.*)$")

# Continuation line: leading spaces followed by the rest of the text.
CONT_RE = regex.compile(r"^ +(.*)")

# Trailing Unicode separator characters at the end of a string.
TRAILING_SPACES_RE = regex.compile(r"\p{Z}+$")
51 |
94 |
192 else: |
235 else: |
193 return string |
236 return string |
194 |
237 |
def parse_translation(self):
    """Parse the translation part of an article into ``self.tran``.

    Lines are consumed through ``self.readline()`` until end of input or
    a separator line.  A part-of-speech line opens a new ``Sense``; the
    topic/synonym/antonym, translation and example lines that follow are
    added to it; a one-character line (presumably just the line
    terminator -- confirm against ``readline``) closes it.

    On return ``self.tran`` holds the list of collected ``Sense`` objects.

    Raises:
        ParseException: on a second part-of-speech marker within one
            sense, on content before any part-of-speech marker, on an
            empty item in a topic/syn/ant list, or on an unrecognised line.
    """
    # (pattern, Sense adder, error text) for the ";"-separated list lines.
    list_rules = (
        (self.TOPIC_RE, Sense.add_topic, """Empty topic..."""),
        (self.SYN_RE, Sense.add_syn, """Empty synonym..."""),
        (self.ANT_RE, Sense.add_ant, """Empty antonym..."""),
    )
    senses = []
    sense = None
    read = True
    while True:
        if read:
            self.readline()
        read = True
        if self.eof:
            break
        if self.SEPARATOR_RE.match(self.line) is not None:
            break
        if len(self.line) == 1:
            # End of the current sense.
            if sense is not None:
                senses.append(sense)
            sense = None
            continue
        m = self.TRANSL_POS_RE.match(self.line)
        if m is not None:
            if sense is not None:
                raise ParseException("""Each translation should have only one part of speech marker...""")
            sense = Sense(m.group(0))
            continue
        if sense is None:
            raise ParseException("""Missing part of speech marker...""")
        handled = False
        for pattern, add, empty_msg in list_rules:
            m = pattern.match(self.line)
            if m is None:
                continue
            for item in m.group(1).split(";"):
                item = item.strip()
                if not item:
                    raise ParseException(empty_msg)
                add(sense, item)
            handled = True
            break
        if handled:
            continue
        m = self.TRANSL_RE.match(self.line)
        if m is not None:
            sense.add_tr((m.group(1), m.group(2) + self.parse_translation_continuation()))
            # Skip the next readline(): presumably the continuation parser
            # has already fetched the following line -- confirm.
            read = False
            continue
        m = self.TRANSL_EX_RE.match(self.line)
        if m is not None:
            sense.add_ex((m.group(1), m.group(2) + self.parse_translation_continuation()))
            read = False
            continue
        raise ParseException("""Uknown syntax...""")
    # Flush the sense still open when the loop ends.  Doing it here covers
    # both exits (separator and end of input); flushing only on a separator
    # dropped the last sense of an article that ends at end of input.
    if sense is not None:
        senses.append(sense)
    self.tran = senses