32 else: |
32 else: |
33 return u":{:d}: {:s}\nLINE: {:s}".format(self.lineno, self.msg, self.line) |
33 return u":{:d}: {:s}\nLINE: {:s}".format(self.lineno, self.msg, self.line) |
34 |
34 |
class Headword:
    """One dictionary headword together with its optional annotations.

    The constructor stores its arguments as given; nothing is validated
    or copied.
    """

    def __init__(self, headword, pron = None, attrs = None, homo = None):
        """Store the headword text and its optional annotations.

        headword -- the headword text; this is what str() returns
        pron     -- pronunciation, or None when not given
        attrs    -- headword attributes, or None when not given
                    (the parser in this file passes a set of tags)
        homo     -- homophones, or None when not given
                    (the parser in this file passes a list of strings)
        """
        self.headword = headword
        self.pron = pron
        self.attrs = attrs
        self.homo = homo

    def __str__(self):
        """Return the bare headword text."""
        return self.headword

    def __repr__(self):
        """Return a short debugging form, e.g. ``<Headword cat>``."""
        return "<Headword {}>".format(self.headword)
129 |
130 |
# Line-format patterns for the dictionary source file.  The parser looks
# these up as attributes (e.g. self.HEADWORD_RE) and applies them with
# .match() to one line at a time.

# Entry delimiter: a line consisting of exactly "__".
SEPARATOR_RE = re.compile(u"^__$", re.UNICODE)
# Headword line: starts at column 0 with a word character.
HEADWORD_RE = re.compile( u"^(\\w.*)$" )
# Indented headword attribute tag (one of a fixed vocabulary).
HEADWORD_VAR_RE = re.compile(u"^ +(rare|s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$", re.UNICODE)
# Indented pronunciation in square brackets; group 1 is the text inside.
HEADWORD_PRON_RE = re.compile(u"^ +\\[([a-zˌˈːəæɛɒʊɪɔɜɑʌɚɐɹʃʧθðɡʒŋɾʔ ]+)\\]$", re.UNICODE)
# Indented homophone list: "homo: w1; w2; ...".  Group 1 is the whole
# ";"-separated list, which the parser splits on ";" and strips.  The
# capture must therefore admit ";" and the spaces around it; a bare
# (\w+) would reject every line that names more than one homophone.
HEADWORD_HOMO_RE = re.compile(u"^ +homo: (\\w+(?:\\s*;\\s*\\w+)*)$", re.UNICODE)
# Part-of-speech line (whole line is one of the listed tags).
TRANSL_POS_RE = re.compile(u"^(?:n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\\.v|contr|abbr|prefix)$", re.UNICODE)
# "<lang>: text" line; groups are (language code, text).
TRANSL_RE = re.compile(u"^(ru|uk|la|en): ([\\w(].*)$", re.UNICODE)
# "<lang>> text" line; groups are (language code, text).
TRANSL_EX_RE = re.compile(u"""^(ru|uk|la|en)> ([-'"\\w].*)$""", re.UNICODE)
# "<lang>= text" line; groups are (language code, text).
TRANSL_GLOS_RE = re.compile(u"^(ru|uk|la|en)= ([-\\w\\d].*)$", re.UNICODE)
# "topic: text" line; group 1 is the text.
TOPIC_RE = re.compile(u"^topic: (\\w.*)$", re.UNICODE)
253 if m is None: |
255 if m is None: |
254 raise ParseException("""There are no headword after "__" delimiter...""") |
256 raise ParseException("""There are no headword after "__" delimiter...""") |
255 word = m.group(1) |
257 word = m.group(1) |
256 pron = None |
258 pron = None |
257 attrs = set() |
259 attrs = set() |
|
260 homo = None |
258 while True: |
261 while True: |
259 self.readline() |
262 self.readline() |
260 if self.eof or len(self.line) == 0: |
263 if self.eof or len(self.line) == 0: |
261 break |
264 break |
262 m = self.HEADWORD_RE.match(self.line) |
265 m = self.HEADWORD_RE.match(self.line) |
263 if m is not None: |
266 if m is not None: |
264 if word is None: |
267 if word is None: |
265 raise ParseException("""Didn't match previous headword...""") |
268 raise ParseException("""Didn't match previous headword...""") |
266 self.words.append(Headword(word, pron, attrs)) |
269 self.words.append(Headword(word, pron, attrs, homo = homo)) |
267 word = m.group(1) |
270 word = m.group(1) |
268 pron = None |
271 pron = None |
269 attrs = set() |
272 attrs = set() |
|
273 homo = None |
270 continue |
274 continue |
271 m = self.HEADWORD_PRON_RE.match(self.line) |
275 m = self.HEADWORD_PRON_RE.match(self.line) |
272 if m is not None: |
276 if m is not None: |
273 if pron is not None: |
277 if pron is not None: |
274 raise ParseException("""Pronunciation is redefined...""") |
278 raise ParseException("""Pronunciation is redefined...""") |
276 continue |
280 continue |
277 m = self.HEADWORD_VAR_RE.match(self.line) |
281 m = self.HEADWORD_VAR_RE.match(self.line) |
278 if m is not None: |
282 if m is not None: |
279 attrs.add(m.group(1)) |
283 attrs.add(m.group(1)) |
280 continue |
284 continue |
|
285 m = self.HEADWORD_HOMO_RE.match(self.line) |
|
286 if m is not None: |
|
287 if homo is not None: |
|
288 raise ParseException("""Homophones are redefined...""") |
|
289 homo = [s.strip() for s in m.group(1).split(";")] |
|
290 continue |
281 raise ParseException("""Line is not a headword or translation or headword attribute...""") |
291 raise ParseException("""Line is not a headword or translation or headword attribute...""") |
282 self.words.append(Headword(word, pron, attrs)) |
292 self.words.append(Headword(word, pron, attrs, homo)) |
283 |
293 |
284 def parse_translation_continuation(self): |
294 def parse_translation_continuation(self): |
285 string = "" |
295 string = "" |
286 while True: |
296 while True: |
287 self.readline() |
297 self.readline() |