gadict: changeset 530:91771594bc8b

--- a/py/gadict.py	Thu Aug 25 00:29:40 2016 +0300
+++ b/py/gadict.py	Thu Aug 25 00:32:25 2016 +0300
@@ -30,6 +30,47 @@
         else:
             return ":{:d}: {:s}\nLINE: {:s}".format(self.lineno, self.msg, self.line)
 
+class Sense:
+
+    def __init__(self, pos, tr_list = None, ex_list = None, syn_list = None, ant_list = None, topic_list = None):
+        if not pos:
+            raise ParseException("Part of speech expected...\n")
+        self.pos = pos
+        if tr_list:
+            self.tr_list = tr_list
+        else:
+            self.tr_list = []
+        self.ex_list = ex_list
+        self.syn_list = syn_list
+        self.ant_list = ant_list
+        self.topic_list = topic_list
+
+    def add_tr(self, tr):
+        self.tr_list.append(tr)
+
+    def add_ex(self, ex):
+        if not self.ex_list:
+            self.ex_list = [ex]
+        else:
+            self.ex_list.append(ex)
+
+    def add_syn(self, syn):
+        if not self.syn_list:
+            self.syn_list = [syn]
+        else:
+            self.syn_list.append(syn)
+
+    def add_ant(self, ant):
+        if not self.ant_list:
+            self.ant_list = [ant]
+        else:
+            self.ant_list.append(ant)
+
+    def add_topic(self, topic):
+        if not self.topic_list:
+            self.topic_list = [topic]
+        else:
+            self.topic_list.append(topic)
 
 class Parser:
     """gadict dictionary format parser."""
@@ -40,10 +81,12 @@
     HEADWORD_RE = regex.compile(r"^(\p{L}.*)$")
     HEADWORD_VAR_RE = regex.compile(r"^ +(s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$")
     HEADWORD_PRON_RE = regex.compile(r"^ +\[([\p{L}' ]+)\]$")
-    TRANSL_POS_RE = regex.compile(r"^n|pron|adj|v|adv|prep|conj|num|int|phr|phr\.v|abbr|prefix$")
+    TRANSL_POS_RE = regex.compile(r"^n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\.v|abbr|prefix$")
     TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(].*)$")
     TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> ([-\p{L}].*)$")
-    TOPIC_RE = regex.compile(r"^(topic|ant|syn): (\p{L}.*)$")
+    TOPIC_RE = regex.compile(r"^topic: (\p{L}.*)$")
+    SYN_RE = regex.compile(r"^syn: (\p{L}.*)$")
+    ANT_RE = regex.compile(r"^ant: (\p{L}.*)$")
 
     CONT_RE = regex.compile(r"^ +(.*)")
 
@@ -194,9 +237,7 @@
 
     def parse_translation(self):
         senses = []
-        pos = None
-        tr = []
-        ex = []
+        sense = None
         read = True
         while True:
             if read:
@@ -206,34 +247,59 @@
                 break
             m = self.SEPARATOR_RE.match(self.line)
             if m is not None:
+                if sense:
+                    senses.append(sense)
                 break
             if len(self.line) == 1:
-                senses.append((pos, tr, ex))
-                pos = None
-                tr = []
-                ex = []
+                if sense:
+                    senses.append(sense)
+                sense = None
                 continue
             m = self.TRANSL_POS_RE.match(self.line)
             if m is not None:
-                if pos is not None:
+                if sense is not None:
                     raise ParseException("""Each translation should have only one part of speech marker...""")
                 pos = m.group(0)
+                sense = Sense(pos)
                 continue
+            if not sense:
+                raise ParseException("""Missing part of speech marker...""")
             m = self.TOPIC_RE.match(self.line)
             if m is not None:
-                # TODO
+                topics = m.group(1).split(";")
+                for topic in topics:
+                    topic = topic.strip()
+                    if len(topic) == 0:
+                        raise ParseException("""Empty topic...""")
+                    sense.add_topic(topic)
+                continue
+            m = self.SYN_RE.match(self.line)
+            if m is not None:
+                syns = m.group(1).split(";")
+                for syn in syns:
+                    syn = syn.strip()
+                    if len(syn) == 0:
+                        raise ParseException("""Empty synonym...""")
+                    sense.add_syn(syn)
+                continue
+            m = self.ANT_RE.match(self.line)
+            if m is not None:
+                ants = m.group(1).split(";")
+                for ant in ants:
+                    ant = ant.strip()
+                    if len(ant) == 0:
+                        raise ParseException("""Empty antonym...""")
+                    sense.add_ant(ant)
                 continue
             m = self.TRANSL_RE.match(self.line)
             if m is not None:
-                tr.append((m.group(1), m.group(2) + self.parse_translation_continuation()))
+                sense.add_tr((m.group(1), m.group(2) + self.parse_translation_continuation()))
                 read = False
                 continue
             m = self.TRANSL_EX_RE.match(self.line)
             if m is not None:
-                ex.append((m.group(1), m.group(2) + self.parse_translation_continuation()))
+                sense.add_ex((m.group(1), m.group(2) + self.parse_translation_continuation()))
                 read = False
                 continue
             raise ParseException("""Uknown syntax...""")
-        if len(tr) > 0:
-            senses.append((pos, tr, ex))
         self.tran = senses

--- a/py/gadict_c5.py	Thu Aug 25 00:29:40 2016 +0300
+++ b/py/gadict_c5.py	Thu Aug 25 00:32:25 2016 +0300
@@ -84,14 +84,16 @@
             FOUT.write(", ".join(l))
         FOUT.write("\n")
     FOUT.write("\n")
-    for (pos, trs, exs) in article[1]:
+    for sense in article[1]:
+        if not sense:
+            raise Exception("""Empty sense for article: """ + article[0].__iter__().__next__())
         FOUT.write("  ")
-        if pos is not None:
+        if sense.pos:
             FOUT.write("«")
-            FOUT.write(pos)
+            FOUT.write(sense.pos)
             FOUT.write("» ")
             FOUT.write("\n")
-        for (lang, tr) in trs:
+        for (lang, tr) in sense.tr_list:
             FOUT.write("  ")
             if LANGS is None:
                 FOUT.write(lang)

author	Oleksandr Gavenko <gavenkoa@gmail.com>
	Thu, 25 Aug 2016 00:32:25 +0300
changeset 530	91771594bc8b
parent 529	ed54a93aa8d7
child 531	354cac8d039d

py/gadict.py		file | annotate | diff | comparison | revisions
py/gadict_c5.py		file | annotate | diff | comparison | revisions