py/gadict.py
changeset 757 5417f2102dc5
parent 740 77668cb05069
child 871 1dfca1e1f42a
--- a/py/gadict.py	Tue Feb 21 10:03:54 2017 +0200
+++ b/py/gadict.py	Tue Feb 21 10:10:03 2017 +0200
@@ -4,7 +4,7 @@
 """
 
 import sys
-import regex
+import re
 
 
 class Prelude:
@@ -125,32 +125,32 @@
 class Parser:
     """gadict dictionary format parser."""
 
-    COMMENT_RE = regex.compile(r"^# ")
+    COMMENT_RE = re.compile("^# ")
 
-    SEPARATOR_RE = regex.compile(u"^__$")
-    HEADWORD_RE = regex.compile( u"^(\\p{L}.*)$" )
-    HEADWORD_VAR_RE = regex.compile(u"^ +(rare|s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$")
-    HEADWORD_PRON_RE = regex.compile(u"^ +\\[([a-zˌˈːəæɛɒʊɪɔɜɑʌɚɹʃʧθðɡʒŋɾ ]+)\\]$")
-    TRANSL_POS_RE = regex.compile(u"^(?:n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\\.v|contr|abbr|prefix)$")
-    TRANSL_RE = regex.compile(u"^(ru|uk|la|en): ([\\p{L}(].*)$")
-    TRANSL_EX_RE = regex.compile(u"""^(ru|uk|la|en)> ([-'"\\p{L}].*)$""")
-    TRANSL_GLOS_RE = regex.compile(u"^(ru|uk|la|en)= ([-\\p{L}\\p{N}].*)$")
-    TOPIC_RE = regex.compile(u"^topic: (\\p{L}.*)$")
-    SYN_RE = regex.compile(u"^syn: (\\p{L}.*)$")
-    ANT_RE = regex.compile(u"^ant: (\\p{L}.*)$")
-    REL_RE = regex.compile(u"^rel: (\\p{L}.*)$")
-    HYPER_RE = regex.compile(u"^hyper: (\\p{L}.*)$")
-    HYPO_RE = regex.compile(u"^hypo: (\\p{L}.*)$")
+    SEPARATOR_RE = re.compile(u"^__$", re.UNICODE)
+    HEADWORD_RE = re.compile(u"^(\\w.*)$", re.UNICODE)  # UNICODE: \w must match non-ASCII letters, as in the sibling patterns
+    HEADWORD_VAR_RE = re.compile(u"^ +(rare|s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$", re.UNICODE)
+    HEADWORD_PRON_RE = re.compile(u"^ +\\[([a-zˌˈːəæɛɒʊɪɔɜɑʌɚɹʃʧθðɡʒŋɾ ]+)\\]$", re.UNICODE)
+    TRANSL_POS_RE = re.compile(u"^(?:n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\\.v|contr|abbr|prefix)$", re.UNICODE)
+    TRANSL_RE = re.compile(u"^(ru|uk|la|en): ([\\w(].*)$", re.UNICODE)
+    TRANSL_EX_RE = re.compile(u"""^(ru|uk|la|en)> ([-'"\\w].*)$""", re.UNICODE)
+    TRANSL_GLOS_RE = re.compile(u"^(ru|uk|la|en)= ([-\\w\\d].*)$", re.UNICODE)
+    TOPIC_RE = re.compile(u"^topic: (\\w.*)$", re.UNICODE)
+    SYN_RE = re.compile(u"^syn: (\\w.*)$", re.UNICODE)
+    ANT_RE = re.compile(u"^ant: (\\w.*)$", re.UNICODE)
+    REL_RE = re.compile(u"^rel: (\\w.*)$", re.UNICODE)
+    HYPER_RE = re.compile(u"^hyper: (\\w.*)$", re.UNICODE)
+    HYPO_RE = re.compile(u"^hypo: (\\w.*)$", re.UNICODE)
 
-    CONT_RE = regex.compile(u"^ +(.*)")
+    CONT_RE = re.compile(u"^ +(.*)", re.UNICODE)
 
-    TRAILING_SPACES_RE = regex.compile(u"\\p{Z}+$")
+    TRAILING_SPACES_RE = re.compile(u"\\s+$", re.UNICODE)
 
-    PRELUDE_NAME_RE = regex.compile(u"^name: (.*)")
-    PRELUDE_URL_RE = regex.compile(u"^url: (.*)")
-    PRELUDE_AUTHOR_RE = regex.compile(u"^by: (.*)")
-    PRELUDE_LICENSE_RE = regex.compile(u"^term: (.*)")
-    PRELUDE_ABOUT_RE = regex.compile(u"^about: ?(.*)")
+    PRELUDE_NAME_RE = re.compile(u"^name: (.*)", re.UNICODE)
+    PRELUDE_URL_RE = re.compile(u"^url: (.*)", re.UNICODE)
+    PRELUDE_AUTHOR_RE = re.compile(u"^by: (.*)", re.UNICODE)
+    PRELUDE_LICENSE_RE = re.compile(u"^term: (.*)", re.UNICODE)
+    PRELUDE_ABOUT_RE = re.compile(u"^about: ?(.*)", re.UNICODE)
 
     def __init__(self):
         pass
@@ -161,6 +161,7 @@
             self.eof = len(self.line) == 0
             if not self.eof:
                 self.lineno += 1
+            self.line = self.line.rstrip('\n')
             if self.TRAILING_SPACES_RE.search(self.line):
                 raise ParseException("Traling spaces detected...\n")
             if self.COMMENT_RE.search(self.line):
@@ -191,7 +192,7 @@
             m = self.CONT_RE.match(self.line)
             if m is not None:
                 string += "\n" + m.group(1)
-            elif len(self.line) == 1:
+            elif len(self.line) == 0:
                 string += "\n"
             else:
                 return string
@@ -239,7 +240,7 @@
 
     def parse_empty_line(self):
         self.readline()
-        if self.eof or len(self.line) != 1:
+        if self.eof or len(self.line) != 0:
             raise ParseException(""""__" delimiter should followed by empty line...""")
 
     def parse_headlines(self):
@@ -256,7 +257,7 @@
         attrs = set()
         while True:
             self.readline()
-            if self.eof or len(self.line) == 1:
+            if self.eof or len(self.line) == 0:
                 break
             m = self.HEADWORD_RE.match(self.line)
             if m is not None:
@@ -309,7 +310,7 @@
                 if sense:
                     senses.append(sense)
                 break
-            if len(self.line) == 1:
+            if len(self.line) == 0:
                 if sense:
                     senses.append(sense)
                 sense = None