py/gadict_headwords.py
changeset 643 c2c32f45dde6
child 720 b5a4b476eddf
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/py/gadict_headwords.py	Tue Nov 08 17:44:04 2016 +0200
@@ -0,0 +1,70 @@
+
+import sys
+import codecs
+import io
+import regex
+
+# Command line: INPUT [OUTPUT].
+# INPUT is the gadict source file to scan; OUTPUT, when given, receives the
+# headwords as UTF-8.  Without OUTPUT the headwords go to standard output.
+FINAME = None
+FONAME = None
+if len(sys.argv) >= 2:
+    FINAME = sys.argv[1]
+if len(sys.argv) >= 3:
+    FONAME = sys.argv[2]
+
+if FINAME is None:
+    # Without this check io.open(None, ...) fails with an opaque TypeError.
+    sys.exit("Usage: gadict_headwords.py INPUT [OUTPUT]")
+
+# Line-buffered text stream, decoded as UTF-8.
+FIN = io.open(FINAME, mode='r', buffering=1, encoding="utf-8")
+if FONAME is None:
+    FOUT = sys.stdout
+else:
+    try:
+        FOUT = codecs.open(FONAME, "w", "utf-8")
+    except (IOError, OSError):
+        # Do not leave the input file open when the output cannot be created.
+        FIN.close()
+        raise
+
+
+class GadictParser:
+    """Collect the headwords found in a gadict source stream.
+
+    An entry is recognised by three consecutive lines: a separator line
+    consisting of ``__``, an empty line, and the headword line, which after
+    stripping surrounding whitespace must start with a letter.  All other
+    lines are skipped.
+    """
+
+    SEPARATOR_RE = regex.compile(u"^__$")
+    EMPTY_RE = regex.compile( u"^$" )
+    HEADWORD_RE = regex.compile( u"^(\\p{L}.*)$" )
+
+    def __init__(self, stream):
+        """Store ``stream`` (anything with ``readline()``) and reset the line counter."""
+        self.stream = stream
+        # Number of lines read so far; used in error messages.
+        self.lineno = 0
+
+    def _next_line(self):
+        """Read one line and count it; return None at end of input."""
+        line = self.stream.readline()
+        if not line:
+            return None
+        self.lineno += 1
+        return line
+
+    def parse(self):
+        """Return the list of headwords in the order they appear.
+
+        Input that ends right after a separator or after the empty line
+        following it is accepted; the incomplete entry is ignored.
+
+        Raises:
+            Exception: if the line after a separator is not empty, or the
+                line after that does not start with a letter.
+        """
+        wlist = []
+        while True:
+            line = self._next_line()
+            if line is None:
+                break
+            if not self.SEPARATOR_RE.match(line):
+                continue
+
+            line = self._next_line()
+            if line is None:
+                break
+            if not self.EMPTY_RE.match(line):
+                # Unicode format string: a byte-string format would fail on
+                # Python 2 when the offending line is not pure ASCII.  The
+                # line ending is dropped so the quoted text stays on one line.
+                raise Exception(u"Line {:d}: '{:s}' is not empty line\n".format(
+                    self.lineno, line.rstrip(u"\r\n")))
+
+            line = self._next_line()
+            if line is None:
+                break
+            line = line.strip()
+            if not self.HEADWORD_RE.match(line):
+                raise Exception(u"Line {:d}: '{:s}' is not a headword\n".format(
+                    self.lineno, line))
+
+            wlist.append(line)
+        return wlist
+
+# Parse the whole input and write one headword per line to FOUT.
+try:
+    parser = GadictParser(FIN)
+    for headword in parser.parse():
+        FOUT.write(headword)
+        FOUT.write("\n")
+finally:
+    FIN.close()
+    if FOUT is sys.stdout:
+        # Standard output was not opened here, so it is flushed, not closed.
+        FOUT.flush()
+    else:
+        FOUT.close()
+