Adapt to parse VOA dictionary: add topics support and translation continuation.
author Oleksandr Gavenko <gavenkoa@gmail.com>
Mon, 28 Mar 2016 01:43:43 +0300
changeset 412 ece60575a96a
parent 411 2fac252890a5
child 413 4c00deeb1225
Adapt to parse VOA dictionary: add topics support and translation continuation.
py/gadict.py
--- a/py/gadict.py	Mon Mar 28 01:42:47 2016 +0300
+++ b/py/gadict.py	Mon Mar 28 01:43:43 2016 +0300
@@ -38,9 +38,10 @@
     HEADWORD_RE = regex.compile(r"^(\p{L}.*)$")
     HEADWORD_VAR_RE = regex.compile(r"^ +(s|pl|v[123]|male|female|comp|super)$")
     HEADWORD_PRON_RE = regex.compile(r"^ +\[([\p{L}' ]+)\]$")
-    TRANSL_POS_RE = regex.compile(r"^n|pron|adj|v|adv|prep|conj|num|int|phr\.v|abbr$")
-    TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(][\p{L}\p{P}~ ]*)$")
+    TRANSL_POS_RE = regex.compile(r"^n|pron|adj|v|adv|prep|conj|num|int|phr\.v|abbr|prefix$")
+    TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(].*)$")
     TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> (\p{L}.*)$")
+    TOPIC_RE = regex.compile(r"^topic: (\p{L}.*)$")
 
     CONT_RE = regex.compile(r"^ +(.*)")
 
@@ -75,7 +76,7 @@
             raise ParseException(ex.msg, self.lineno, self.line)
         return self.dom
 
-    def parse_continuation(self):
+    def parse_prelude_continuation(self):
         string = ""
         while True:
             self.readline()
@@ -98,7 +99,7 @@
                 raise ParseException("There are no articles...")
             m = self.PRELUDE_ABOUT_RE.match(self.line)
             if m:
-                pre.about += m.group(1) + self.parse_continuation()
+                pre.about += m.group(1) + self.parse_prelude_continuation()
                 if self.eof:
                     raise ParseException("There are no articles...")
             if self.SEPARATOR_RE.match(self.line):
@@ -173,13 +174,28 @@
             raise ParseException("""Line is not a headword or translation or headword attribute...""")
         self.words[word] = (pron, attrs)
 
+    def parse_translation_continuation(self):
+        string = ""
+        while True:
+            self.readline()
+            if self.eof:
+                return string
+            m = self.CONT_RE.match(self.line)
+            if m is not None:
+                string += "\n" + m.group(1)
+            else:
+                return string
+
     def parse_translation(self):
         senses = []
         pos = None
         tr = []
         ex = []
+        read = True
         while True:
-            self.readline()
+            if read:
+                self.readline()
+            read = True
             if self.eof:
                 break
             m = self.SEPARATOR_RE.match(self.line)
@@ -197,13 +213,19 @@
                     raise ParseException("""Each translation should have only one part of speech marker...""")
                 pos = m.group(0)
                 continue
+            m = self.TOPIC_RE.match(self.line)
+            if m is not None:
+                # TODO
+                continue
             m = self.TRANSL_RE.match(self.line)
             if m is not None:
-                tr.append((m.group(1), m.group(2)))
+                tr.append((m.group(1), m.group(2) + self.parse_translation_continuation()))
+                read = False
                 continue
             m = self.TRANSL_EX_RE.match(self.line)
             if m is not None:
-                ex.append((m.group(1), m.group(2)))
+                ex.append((m.group(1), m.group(2) + self.parse_translation_continuation()))
+                read = False
                 continue
             raise ParseException("""Uknown syntax...""")
         if len(tr) > 0: