Adapt to parse VOA dictionary: add topics support and translation continuation.
--- a/py/gadict.py Mon Mar 28 01:42:47 2016 +0300
+++ b/py/gadict.py Mon Mar 28 01:43:43 2016 +0300
@@ -38,9 +38,10 @@
HEADWORD_RE = regex.compile(r"^(\p{L}.*)$")
HEADWORD_VAR_RE = regex.compile(r"^ +(s|pl|v[123]|male|female|comp|super)$")
HEADWORD_PRON_RE = regex.compile(r"^ +\[([\p{L}' ]+)\]$")
- TRANSL_POS_RE = regex.compile(r"^n|pron|adj|v|adv|prep|conj|num|int|phr\.v|abbr$")
- TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(][\p{L}\p{P}~ ]*)$")
+ TRANSL_POS_RE = regex.compile(r"^n|pron|adj|v|adv|prep|conj|num|int|phr\.v|abbr|prefix$")
+ TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(].*)$")
TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> (\p{L}.*)$")
+ TOPIC_RE = regex.compile(r"^topic: (\p{L}.*)$")
CONT_RE = regex.compile(r"^ +(.*)")
@@ -75,7 +76,7 @@
raise ParseException(ex.msg, self.lineno, self.line)
return self.dom
- def parse_continuation(self):
+ def parse_prelude_continuation(self):
string = ""
while True:
self.readline()
@@ -98,7 +99,7 @@
raise ParseException("There are no articles...")
m = self.PRELUDE_ABOUT_RE.match(self.line)
if m:
- pre.about += m.group(1) + self.parse_continuation()
+ pre.about += m.group(1) + self.parse_prelude_continuation()
if self.eof:
raise ParseException("There are no articles...")
if self.SEPARATOR_RE.match(self.line):
@@ -173,13 +174,28 @@
raise ParseException("""Line is not a headword or translation or headword attribute...""")
self.words[word] = (pron, attrs)
+ def parse_translation_continuation(self):
+ string = ""
+ while True:
+ self.readline()
+ if self.eof:
+ return string
+ m = self.CONT_RE.match(self.line)
+ if m is not None:
+ string += "\n" + m.group(1)
+ else:
+ return string
+
def parse_translation(self):
senses = []
pos = None
tr = []
ex = []
+ read = True
while True:
- self.readline()
+ if read:
+ self.readline()
+ read = True
if self.eof:
break
m = self.SEPARATOR_RE.match(self.line)
@@ -197,13 +213,19 @@
raise ParseException("""Each translation should have only one part of speech marker...""")
pos = m.group(0)
continue
+ m = self.TOPIC_RE.match(self.line)
+ if m is not None:
+ # TODO: topic lines are recognized but the captured topic (m.group(1)) is discarded; store it.
+ continue
m = self.TRANSL_RE.match(self.line)
if m is not None:
- tr.append((m.group(1), m.group(2)))
+ tr.append((m.group(1), m.group(2) + self.parse_translation_continuation()))
+ read = False
continue
m = self.TRANSL_EX_RE.match(self.line)
if m is not None:
- ex.append((m.group(1), m.group(2)))
+ ex.append((m.group(1), m.group(2) + self.parse_translation_continuation()))
+ read = False
continue
raise ParseException("""Uknown syntax...""")
if len(tr) > 0: