gadict: changeset 393:2756a6deca7e

--- a/Makefile	Sun Mar 27 16:35:30 2016 +0300
+++ b/Makefile	Sun Mar 27 16:44:14 2016 +0300
@@ -322,7 +322,7 @@
 	dictzip -c $< >$@
 
 dist/dictd/%.c5: %.gadict | dist/dictd
-	python3 gadict.py $< $@
+	python3 py/gadict.py $< $@
 
 dist/dictd:
 	mkdir -p $@

--- a/gadict.py	Sun Mar 27 16:35:30 2016 +0300
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,206 +0,0 @@
-
-import io
-import sys
-# import re
-import regex
-
-
-# fgadict = "gadict_en-ru+ua.gadict"
-fgadict = None
-fnout = None
-if len(sys.argv) >= 2:
-    fgadict = sys.argv[1]
-if len(sys.argv) >= 3:
-    fnout = sys.argv[2]
-
-fin = io.open(fgadict, mode='r', buffering=1, encoding="utf-8")
-if fnout is None:
-    fout = sys.stdout
-else:
-    fout = open(fnout, "w")
-
-
-class ParseException(Exception):
-
-    def __init__(self, msg):
-        self.msg = msg
-
-    def __repr__(self):
-        return self.msg
-
-
-class Parser:
-
-    SEPARATOR_RE = regex.compile(r"^__$")
-    HEADWORD_RE = regex.compile(r"^(\p{L}.*)$")
-    HEADWORD_VAR_RE = regex.compile(r"^ +(s|pl|v[123]|male|female|comp|super)$")
-    HEADWORD_PRON_RE = regex.compile(r"^ +\[([\p{L}' ]+)\]$")
-    TRANSL_POS_RE = regex.compile(r"^n|pron|adj|v|adv|prep|conj|num|int|phr\.v|abbr$")
-    TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(][\p{L}\p{P}~ ]*)$")
-    TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> (\p{L}.*)$")
-
-    TRAILING_SPACES_RE = regex.compile(r"\p{Z}+$")
-
-    def __init__(self):
-        pass
-
-    def readline(self):
-        self.line = self.stream.readline()
-        self.eof = len(self.line) == 0
-        if not self.eof:
-            self.lineno += 1
-
-    def parse(self, stream):
-        self.lineno = 0
-        self.stream = stream
-        self.dom = []
-        try:
-            self.parse_prelude()
-            while not self.eof:
-                self.parse_article()
-        except ParseException as ex:
-            if self.TRAILING_SPACES_RE.match(self.line):
-                fout.write("{:s}:{:d}: {:s}".format(fgadict, self.lineno, "Traling spaces detected...\n"))
-            fout.write("{:s}:{:d}: {:s}\nLINE: {:s}\n".format(fgadict, self.lineno, str(ex), self.line))
-            raise Exception(ex)
-        return self.dom
-
-    def parse_prelude(self):
-        """Read dictionary prelude until first "__" delimiter."""
-        while True:
-            self.readline()
-            if self.eof:
-                raise ParseException("There are no articles...")
-            if self.SEPARATOR_RE.match(self.line):
-                break
-
-    def parse_article(self):
-        """Try to match article until next "__" delimiter. Assume that `self.line` point to "__" delimiter."""
-        self.words = None
-        self.tran = None
-        self.parse_empty_line()
-        self.parse_headlines()
-        self.parse_translation()
-        self.dom.append((self.words, self.tran))
-
-    def parse_empty_line(self):
-        self.readline()
-        if self.eof or len(self.line) != 1:
-            raise ParseException(""""__" delimiter should followed by empty line...""")
-
-    def parse_headlines(self):
-        """Try to match word variations with attributed. Assume that `self.line` on preceding empty line."""
-        self.words = {}
-        self.readline()
-        if self.eof:
-            raise ParseException("""There are no definition after "__" delimiter...""")
-        m = self.HEADWORD_RE.match(self.line)
-        if m is None:
-            raise ParseException("""There are no headword after "__" delimiter...""")
-        word = m.group(1)
-        pron = None
-        attrs = set()
-        while True:
-            self.readline()
-            if self.eof or len(self.line) == 1:
-                break
-            m = self.HEADWORD_RE.match(self.line)
-            if m is not None:
-                if word is None:
-                    raise ParseException("""Didn't match previous headword...""")
-                self.words[word] = (pron, attrs)
-                word = m.group(1)
-                pron = None
-                attrs = set()
-                continue
-            m = self.HEADWORD_PRON_RE.match(self.line)
-            if m is not None:
-                if pron is not None:
-                    raise ParseException("""Pronunciation is redefined...""")
-                pron = m.group(1)
-                continue
-            m = self.HEADWORD_VAR_RE.match(self.line)
-            if m is not None:
-                attrs.add(m.group(1))
-                continue
-            raise ParseException("""Line is not headword or translation or headword attribute...""")
-        self.words[word] = (pron, attrs)
-
-    def parse_translation(self):
-        senses = []
-        pos = None
-        tr = []
-        ex = []
-        while True:
-            self.readline()
-            if self.eof:
-                break
-            m = self.SEPARATOR_RE.match(self.line)
-            if m is not None:
-                break
-            if len(self.line) == 1:
-                senses.append((pos, tr, ex))
-                pos = None
-                tr = []
-                ex = []
-                continue
-            m = self.TRANSL_POS_RE.match(self.line)
-            if m is not None:
-                if pos is not None:
-                    raise ParseException("""Each translation should have only one part of speech marker...""")
-                pos = m.group(0)
-                continue
-            m = self.TRANSL_RE.match(self.line)
-            if m is not None:
-                tr.append((m.group(1), m.group(2)))
-                continue
-            m = self.TRANSL_EX_RE.match(self.line)
-            if m is not None:
-                ex.append((m.group(1), m.group(2)))
-                continue
-            raise ParseException("""Uknown syntax...""")
-        if len(tr) > 0:
-            senses.append((pos, tr, ex))
-        self.tran = senses
-
-parser = Parser()
-dom = parser.parse(fin)
-fin.close()
-
-for idx in range(1, len(dom)):
-    article = dom[idx]
-    fout.write("_____\n\n")
-    title = "; ".join(article[0].keys())
-    fout.write(title)
-    fout.write("\n\n")
-    for (word, (pron, attrs)) in article[0].items():
-        if word == "approach":
-            fout.write(str(article[0]))
-        fout.write("  ")
-        fout.write(word)
-        fout.write("\n")
-        if pron is not None:
-            fout.write("    [")
-            fout.write(pron)
-            fout.write("]\n")
-        if len(attrs) > 0:
-            fout.write("    ")
-            l = list(attrs)
-            l.sort()
-            fout.write(", ".join(l))
-            fout.write("\n")
-    fout.write("\n")
-    for (pos, trs, exs) in article[1]:
-        fout.write("  ")
-        if pos is not None:
-            fout.write("⟨")
-            fout.write(pos)
-            fout.write("⟩ ")
-        for (lang, tr) in trs:
-            if lang == "ru":
-                fout.write(tr)
-                break
-        fout.write("\n")
-
-    # fout.write(str(article[0])+"\n")
-

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/py/gadict.py	Sun Mar 27 16:44:14 2016 +0300
@@ -0,0 +1,206 @@
+
+import io
+import sys
+# import re
+import regex
+
+
+# fgadict = "gadict_en-ru+ua.gadict"
+fgadict = None
+fnout = None
+if len(sys.argv) >= 2:
+    fgadict = sys.argv[1]
+if len(sys.argv) >= 3:
+    fnout = sys.argv[2]
+
+fin = io.open(fgadict, mode='r', buffering=1, encoding="utf-8")
+if fnout is None:
+    fout = sys.stdout
+else:
+    fout = open(fnout, "w")
+
+
+class ParseException(Exception):
+
+    def __init__(self, msg):
+        self.msg = msg
+
+    def __repr__(self):
+        return self.msg
+
+
+class Parser:
+
+    SEPARATOR_RE = regex.compile(r"^__$")
+    HEADWORD_RE = regex.compile(r"^(\p{L}.*)$")
+    HEADWORD_VAR_RE = regex.compile(r"^ +(s|pl|v[123]|male|female|comp|super)$")
+    HEADWORD_PRON_RE = regex.compile(r"^ +\[([\p{L}' ]+)\]$")
+    TRANSL_POS_RE = regex.compile(r"^n|pron|adj|v|adv|prep|conj|num|int|phr\.v|abbr$")
+    TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(][\p{L}\p{P}~ ]*)$")
+    TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> (\p{L}.*)$")
+
+    TRAILING_SPACES_RE = regex.compile(r"\p{Z}+$")
+
+    def __init__(self):
+        pass
+
+    def readline(self):
+        self.line = self.stream.readline()
+        self.eof = len(self.line) == 0
+        if not self.eof:
+            self.lineno += 1
+
+    def parse(self, stream):
+        self.lineno = 0
+        self.stream = stream
+        self.dom = []
+        try:
+            self.parse_prelude()
+            while not self.eof:
+                self.parse_article()
+        except ParseException as ex:
+            if self.TRAILING_SPACES_RE.match(self.line):
+                fout.write("{:s}:{:d}: {:s}".format(fgadict, self.lineno, "Traling spaces detected...\n"))
+            fout.write("{:s}:{:d}: {:s}\nLINE: {:s}\n".format(fgadict, self.lineno, str(ex), self.line))
+            raise Exception(ex)
+        return self.dom
+
+    def parse_prelude(self):
+        """Read dictionary prelude until first "__" delimiter."""
+        while True:
+            self.readline()
+            if self.eof:
+                raise ParseException("There are no articles...")
+            if self.SEPARATOR_RE.match(self.line):
+                break
+
+    def parse_article(self):
+        """Try to match article until next "__" delimiter. Assume that `self.line` point to "__" delimiter."""
+        self.words = None
+        self.tran = None
+        self.parse_empty_line()
+        self.parse_headlines()
+        self.parse_translation()
+        self.dom.append((self.words, self.tran))
+
+    def parse_empty_line(self):
+        self.readline()
+        if self.eof or len(self.line) != 1:
+            raise ParseException(""""__" delimiter should followed by empty line...""")
+
+    def parse_headlines(self):
+        """Try to match word variations with attributed. Assume that `self.line` on preceding empty line."""
+        self.words = {}
+        self.readline()
+        if self.eof:
+            raise ParseException("""There are no definition after "__" delimiter...""")
+        m = self.HEADWORD_RE.match(self.line)
+        if m is None:
+            raise ParseException("""There are no headword after "__" delimiter...""")
+        word = m.group(1)
+        pron = None
+        attrs = set()
+        while True:
+            self.readline()
+            if self.eof or len(self.line) == 1:
+                break
+            m = self.HEADWORD_RE.match(self.line)
+            if m is not None:
+                if word is None:
+                    raise ParseException("""Didn't match previous headword...""")
+                self.words[word] = (pron, attrs)
+                word = m.group(1)
+                pron = None
+                attrs = set()
+                continue
+            m = self.HEADWORD_PRON_RE.match(self.line)
+            if m is not None:
+                if pron is not None:
+                    raise ParseException("""Pronunciation is redefined...""")
+                pron = m.group(1)
+                continue
+            m = self.HEADWORD_VAR_RE.match(self.line)
+            if m is not None:
+                attrs.add(m.group(1))
+                continue
+            raise ParseException("""Line is not headword or translation or headword attribute...""")
+        self.words[word] = (pron, attrs)
+
+    def parse_translation(self):
+        senses = []
+        pos = None
+        tr = []
+        ex = []
+        while True:
+            self.readline()
+            if self.eof:
+                break
+            m = self.SEPARATOR_RE.match(self.line)
+            if m is not None:
+                break
+            if len(self.line) == 1:
+                senses.append((pos, tr, ex))
+                pos = None
+                tr = []
+                ex = []
+                continue
+            m = self.TRANSL_POS_RE.match(self.line)
+            if m is not None:
+                if pos is not None:
+                    raise ParseException("""Each translation should have only one part of speech marker...""")
+                pos = m.group(0)
+                continue
+            m = self.TRANSL_RE.match(self.line)
+            if m is not None:
+                tr.append((m.group(1), m.group(2)))
+                continue
+            m = self.TRANSL_EX_RE.match(self.line)
+            if m is not None:
+                ex.append((m.group(1), m.group(2)))
+                continue
+            raise ParseException("""Uknown syntax...""")
+        if len(tr) > 0:
+            senses.append((pos, tr, ex))
+        self.tran = senses
+
+parser = Parser()
+dom = parser.parse(fin)
+fin.close()
+
+for idx in range(1, len(dom)):
+    article = dom[idx]
+    fout.write("_____\n\n")
+    title = "; ".join(article[0].keys())
+    fout.write(title)
+    fout.write("\n\n")
+    for (word, (pron, attrs)) in article[0].items():
+        if word == "approach":
+            fout.write(str(article[0]))
+        fout.write("  ")
+        fout.write(word)
+        fout.write("\n")
+        if pron is not None:
+            fout.write("    [")
+            fout.write(pron)
+            fout.write("]\n")
+        if len(attrs) > 0:
+            fout.write("    ")
+            l = list(attrs)
+            l.sort()
+            fout.write(", ".join(l))
+            fout.write("\n")
+    fout.write("\n")
+    for (pos, trs, exs) in article[1]:
+        fout.write("  ")
+        if pos is not None:
+            fout.write("⟨")
+            fout.write(pos)
+            fout.write("⟩ ")
+        for (lang, tr) in trs:
+            if lang == "ru":
+                fout.write(tr)
+                break
+        fout.write("\n")
+
+    # fout.write(str(article[0])+"\n")
+

author	Oleksandr Gavenko <gavenkoa@gmail.com>
	Sun, 27 Mar 2016 16:44:14 +0300
changeset 393	2756a6deca7e
parent 392	4657b44ad9af
child 394	4d45194c71b6

Makefile		file | annotate | diff | comparison | revisions
gadict.py		file | annotate | diff | comparison | revisions
py/gadict.py		file | annotate | diff | comparison | revisions