Make storage for topics, antonyms and synonyms. Require pos marker.
--- a/py/gadict.py Thu Aug 25 00:29:40 2016 +0300
+++ b/py/gadict.py Thu Aug 25 00:32:25 2016 +0300
@@ -30,6 +30,47 @@
else:
return ":{:d}: {:s}\nLINE: {:s}".format(self.lineno, self.msg, self.line)
+class Sense:
+    """One sense of an article: a part of speech plus the lists attached to it.
+
+    ``tr_list`` is always a list.  ``ex_list``, ``syn_list``, ``ant_list`` and
+    ``topic_list`` keep whatever value was passed in (``None`` by default)
+    until the first item is added through the matching ``add_*`` method.
+    """
+
+    def __init__(self, pos, tr_list = None, ex_list = None, syn_list = None, ant_list = None, topic_list = None):
+        """Store the part of speech and the optional initial lists.
+
+        Raises ParseException when ``pos`` is empty or None.
+        """
+        if not pos:
+            raise ParseException("Part of speech expected...\n")
+        self.pos = pos
+        # A falsy tr_list (None or an empty list) is replaced by a fresh list.
+        self.tr_list = tr_list if tr_list else []
+        self.ex_list = ex_list
+        self.syn_list = syn_list
+        self.ant_list = ant_list
+        self.topic_list = topic_list
+
+    def add_tr(self, tr):
+        """Append ``tr`` to the translation list."""
+        self.tr_list.append(tr)
+
+    def add_ex(self, ex):
+        """Append ``ex`` to the example list, starting a new list if it is unset or empty."""
+        if self.ex_list:
+            self.ex_list.append(ex)
+        else:
+            self.ex_list = [ex]
+
+    def add_syn(self, syn):
+        """Append ``syn`` to the synonym list, starting a new list if it is unset or empty."""
+        if self.syn_list:
+            self.syn_list.append(syn)
+        else:
+            self.syn_list = [syn]
+
+    def add_ant(self, ant):
+        """Append ``ant`` to the antonym list, starting a new list if it is unset or empty."""
+        if self.ant_list:
+            self.ant_list.append(ant)
+        else:
+            self.ant_list = [ant]
+
+    def add_topic(self, topic):
+        """Append ``topic`` to the topic list, starting a new list if it is unset or empty."""
+        if self.topic_list:
+            self.topic_list.append(topic)
+        else:
+            self.topic_list = [topic]
class Parser:
"""gadict dictionary format parser."""
@@ -40,10 +81,12 @@
HEADWORD_RE = regex.compile(r"^(\p{L}.*)$")
HEADWORD_VAR_RE = regex.compile(r"^ +(s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$")
HEADWORD_PRON_RE = regex.compile(r"^ +\[([\p{L}' ]+)\]$")
- TRANSL_POS_RE = regex.compile(r"^n|pron|adj|v|adv|prep|conj|num|int|phr|phr\.v|abbr|prefix$")
+ # Part-of-speech marker line.  The alternatives are grouped so that ^ and $
+ # anchor every one of them; ungrouped, "num" matched as "n", "phr.v" as "phr",
+ # and any line merely starting with a marker was accepted.
+ TRANSL_POS_RE = regex.compile(r"^(?:n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\.v|abbr|prefix)$")
TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(].*)$")
TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> ([-\p{L}].*)$")
- TOPIC_RE = regex.compile(r"^(topic|ant|syn): (\p{L}.*)$")
+ TOPIC_RE = regex.compile(r"^topic: (\p{L}.*)$")
+ SYN_RE = regex.compile(r"^syn: (\p{L}.*)$")
+ ANT_RE = regex.compile(r"^ant: (\p{L}.*)$")
CONT_RE = regex.compile(r"^ +(.*)")
@@ -194,9 +237,7 @@
def parse_translation(self):
senses = []
- pos = None
- tr = []
- ex = []
+ sense = None
read = True
while True:
if read:
@@ -206,34 +247,59 @@
break
m = self.SEPARATOR_RE.match(self.line)
if m is not None:
+ if sense:
+ senses.append(sense)
break
if len(self.line) == 1:
- senses.append((pos, tr, ex))
- pos = None
- tr = []
- ex = []
+ if sense:
+ senses.append(sense)
+ sense = None
continue
m = self.TRANSL_POS_RE.match(self.line)
if m is not None:
- if pos is not None:
+ if sense is not None:
raise ParseException("""Each translation should have only one part of speech marker...""")
pos = m.group(0)
+ sense = Sense(pos)
continue
+ if not sense:
+ raise ParseException("""Missing part of speech marker...""")
m = self.TOPIC_RE.match(self.line)
if m is not None:
- # TODO
+ topics = m.group(1).split(";")
+ for topic in topics:
+ topic = topic.strip()
+ if len(topic) == 0:
+ raise ParseException("""Empty topic...""")
+ sense.add_topic(topic)
+ continue
+ m = self.SYN_RE.match(self.line)
+ if m is not None:
+ syns = m.group(1).split(";")
+ for syn in syns:
+ syn = syn.strip()
+ if len(syn) == 0:
+ raise ParseException("""Empty synonym...""")
+ sense.add_syn(syn)
+ continue
+ m = self.ANT_RE.match(self.line)
+ if m is not None:
+ ants = m.group(1).split(";")
+ for ant in ants:
+ ant = ant.strip()
+ if len(ant) == 0:
+ raise ParseException("""Empty antonym...""")
+ sense.add_ant(ant)
continue
m = self.TRANSL_RE.match(self.line)
if m is not None:
- tr.append((m.group(1), m.group(2) + self.parse_translation_continuation()))
+ sense.add_tr((m.group(1), m.group(2) + self.parse_translation_continuation()))
read = False
continue
m = self.TRANSL_EX_RE.match(self.line)
if m is not None:
- ex.append((m.group(1), m.group(2) + self.parse_translation_continuation()))
+ sense.add_ex((m.group(1), m.group(2) + self.parse_translation_continuation()))
read = False
continue
raise ParseException("""Uknown syntax...""")
- if len(tr) > 0:
- senses.append((pos, tr, ex))
self.tran = senses
--- a/py/gadict_c5.py Thu Aug 25 00:29:40 2016 +0300
+++ b/py/gadict_c5.py Thu Aug 25 00:32:25 2016 +0300
@@ -84,14 +84,16 @@
FOUT.write(", ".join(l))
FOUT.write("\n")
FOUT.write("\n")
- for (pos, trs, exs) in article[1]:
+ for sense in article[1]:
+ if not sense:
+ raise Exception("""Empty sense for article: """ + article[0].__iter__().__next__())
FOUT.write(" ")
- if pos is not None:
+ if sense.pos:
FOUT.write("«")
- FOUT.write(pos)
+ FOUT.write(sense.pos)
FOUT.write("» ")
FOUT.write("\n")
- for (lang, tr) in trs:
+ for (lang, tr) in sense.tr_list:
FOUT.write(" ")
if LANGS is None:
FOUT.write(lang)