# HG changeset patch # User Oleksandr Gavenko # Date 1459118623 -10800 # Node ID ece60575a96a48a7585bba9382bf66939e828651 # Parent 2fac252890a5ebec7738a0c8dd89e28ea86b483e Adopt to parse VOA dictionary: add topics support and translation continuation. diff -r 2fac252890a5 -r ece60575a96a py/gadict.py --- a/py/gadict.py Mon Mar 28 01:42:47 2016 +0300 +++ b/py/gadict.py Mon Mar 28 01:43:43 2016 +0300 @@ -38,9 +38,10 @@ HEADWORD_RE = regex.compile(r"^(\p{L}.*)$") HEADWORD_VAR_RE = regex.compile(r"^ +(s|pl|v[123]|male|female|comp|super)$") HEADWORD_PRON_RE = regex.compile(r"^ +\[([\p{L}' ]+)\]$") - TRANSL_POS_RE = regex.compile(r"^n|pron|adj|v|adv|prep|conj|num|int|phr\.v|abbr$") - TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(][\p{L}\p{P}~ ]*)$") + TRANSL_POS_RE = regex.compile(r"^n|pron|adj|v|adv|prep|conj|num|int|phr\.v|abbr|prefix$") + TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(].*)$") TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> (\p{L}.*)$") + TOPIC_RE = regex.compile(r"^topic: (\p{L}.*)$") CONT_RE = regex.compile(r"^ +(.*)") @@ -75,7 +76,7 @@ raise ParseException(ex.msg, self.lineno, self.line) return self.dom - def parse_continuation(self): + def parse_prelude_continuation(self): string = "" while True: self.readline() @@ -98,7 +99,7 @@ raise ParseException("There are no articles...") m = self.PRELUDE_ABOUT_RE.match(self.line) if m: - pre.about += m.group(1) + self.parse_continuation() + pre.about += m.group(1) + self.parse_prelude_continuation() if self.eof: raise ParseException("There are no articles...") if self.SEPARATOR_RE.match(self.line): @@ -173,13 +174,28 @@ raise ParseException("""Line is not a headword or translation or headword attribute...""") self.words[word] = (pron, attrs) + def parse_translation_continuation(self): + string = "" + while True: + self.readline() + if self.eof: + return string + m = self.CONT_RE.match(self.line) + if m is not None: + string += "\n" + m.group(1) + else: + return string + def parse_translation(self): senses = [] pos = None tr = [] ex = [] + read = True while True: - self.readline() + if read: + self.readline() + read = True if self.eof: break m = self.SEPARATOR_RE.match(self.line) @@ -197,13 +213,19 @@ raise ParseException("""Each translation should have only one part of speech marker...""") pos = m.group(0) continue + m = self.TOPIC_RE.match(self.line) + if m is not None: + # TODO + continue m = self.TRANSL_RE.match(self.line) if m is not None: - tr.append((m.group(1), m.group(2))) + tr.append((m.group(1), m.group(2) + self.parse_translation_continuation())) + read = False continue m = self.TRANSL_EX_RE.match(self.line) if m is not None: - ex.append((m.group(1), m.group(2))) + ex.append((m.group(1), m.group(2) + self.parse_translation_continuation())) + read = False continue raise ParseException("""Uknown syntax...""") if len(tr) > 0: