Include all headwords in the list, not just the first.
author Oleksandr Gavenko <gavenkoa@gmail.com>
Wed, 28 Dec 2016 19:47:51 +0200
changeset 720 b5a4b476eddf
parent 719 ebc16c3a9129
child 721 e3cb06917795
Include all headwords in the list, not just the first.
py/gadict_headwords.py
--- a/py/gadict_headwords.py	Fri Dec 23 21:56:42 2016 +0200
+++ b/py/gadict_headwords.py	Wed Dec 28 19:47:51 2016 +0200
@@ -22,6 +22,7 @@
 
     SEPARATOR_RE = regex.compile(u"^__$")
     EMPTY_RE = regex.compile( u"^$" )
+    HEADWORD_ATTR_RE = regex.compile( u"^ " )
     HEADWORD_RE = regex.compile( u"^(\\p{L}.*)$" )
 
     def __init__(self, stream):
@@ -33,7 +34,7 @@
         while True:
             line = self.stream.readline()
             if len(line) == 0:
-                break
+                return wlist
             self.lineno += 1
             m = self.SEPARATOR_RE.match(line)
             if not m:
@@ -41,22 +42,27 @@
 
             line = self.stream.readline()
             if len(line) == 0:
-                break
+                return wlist
             self.lineno += 1
             m = self.EMPTY_RE.match(line)
             if not m:
                 raise Exception("Line {:d}: '{:s}' is not empty line\n".format(self.lineno, line))
 
-            line = self.stream.readline()
-            if len(line) == 0:
-                break
-            line = line.strip()
-            self.lineno += 1
-            m = self.HEADWORD_RE.match(line)
-            if not m:
-                raise Exception("Line {:d}: '{:s}' is not a headword\n".format(self.lineno, line))
-
-            wlist.append(line)
+            while True:
+                line = self.stream.readline()
+                if len(line) == 0:
+                    return wlist
+                self.lineno += 1
+                m = self.HEADWORD_ATTR_RE.match(line)
+                if m:
+                    continue
+                line = line.strip()
+                if len(line) == 0:
+                    break
+                m = self.HEADWORD_RE.match(line)
+                if not m:
+                    raise Exception("{:d}: '{:s}' is not a headword\n".format(self.lineno, line))
+                wlist.append(line)
         return wlist
 
 try:
@@ -64,6 +70,9 @@
     for headword in parser.parse():
         FOUT.write(headword)
         FOUT.write("\n")
+except Exception as ex:
+    print("{}:{}".format(FINAME, str(ex)))
+    raise ex
 finally:
     FIN.close()
     FOUT.close()