gadict: changeset 757:5417f2102dc5

--- a/py/gadict.py	Tue Feb 21 10:03:54 2017 +0200
+++ b/py/gadict.py	Tue Feb 21 10:10:03 2017 +0200
@@ -4,7 +4,7 @@
 """
 
 import sys
-import regex
+import re
 
 
 class Prelude:
@@ -125,32 +125,32 @@
 class Parser:
     """gadict dictionary format parser."""
 
-    COMMENT_RE = regex.compile(r"^# ")
+    COMMENT_RE = re.compile("^# ")
 
-    SEPARATOR_RE = regex.compile(u"^__$")
-    HEADWORD_RE = regex.compile( u"^(\\p{L}.*)$" )
-    HEADWORD_VAR_RE = regex.compile(u"^ +(rare|s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$")
-    HEADWORD_PRON_RE = regex.compile(u"^ +\\[([a-zˌˈːəæɛɒʊɪɔɜɑʌɚɹʃʧθðɡʒŋɾ ]+)\\]$")
-    TRANSL_POS_RE = regex.compile(u"^(?:n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\\.v|contr|abbr|prefix)$")
-    TRANSL_RE = regex.compile(u"^(ru|uk|la|en): ([\\p{L}(].*)$")
-    TRANSL_EX_RE = regex.compile(u"""^(ru|uk|la|en)> ([-'"\\p{L}].*)$""")
-    TRANSL_GLOS_RE = regex.compile(u"^(ru|uk|la|en)= ([-\\p{L}\\p{N}].*)$")
-    TOPIC_RE = regex.compile(u"^topic: (\\p{L}.*)$")
-    SYN_RE = regex.compile(u"^syn: (\\p{L}.*)$")
-    ANT_RE = regex.compile(u"^ant: (\\p{L}.*)$")
-    REL_RE = regex.compile(u"^rel: (\\p{L}.*)$")
-    HYPER_RE = regex.compile(u"^hyper: (\\p{L}.*)$")
-    HYPO_RE = regex.compile(u"^hypo: (\\p{L}.*)$")
+    SEPARATOR_RE = re.compile(u"^__$", re.UNICODE)
+    HEADWORD_RE = re.compile( u"^(\\w.*)$" )
+    HEADWORD_VAR_RE = re.compile(u"^ +(rare|s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$", re.UNICODE)
+    HEADWORD_PRON_RE = re.compile(u"^ +\\[([a-zˌˈːəæɛɒʊɪɔɜɑʌɚɹʃʧθðɡʒŋɾ ]+)\\]$", re.UNICODE)
+    TRANSL_POS_RE = re.compile(u"^(?:n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\\.v|contr|abbr|prefix)$", re.UNICODE)
+    TRANSL_RE = re.compile(u"^(ru|uk|la|en): ([\\w(].*)$", re.UNICODE)
+    TRANSL_EX_RE = re.compile(u"""^(ru|uk|la|en)> ([-'"\\w].*)$""", re.UNICODE)
+    TRANSL_GLOS_RE = re.compile(u"^(ru|uk|la|en)= ([-\\w\\d].*)$", re.UNICODE)
+    TOPIC_RE = re.compile(u"^topic: (\\w.*)$", re.UNICODE)
+    SYN_RE = re.compile(u"^syn: (\\w.*)$", re.UNICODE)
+    ANT_RE = re.compile(u"^ant: (\\w.*)$", re.UNICODE)
+    REL_RE = re.compile(u"^rel: (\\w.*)$", re.UNICODE)
+    HYPER_RE = re.compile(u"^hyper: (\\w.*)$", re.UNICODE)
+    HYPO_RE = re.compile(u"^hypo: (\\w.*)$", re.UNICODE)
 
-    CONT_RE = regex.compile(u"^ +(.*)")
+    CONT_RE = re.compile(u"^ +(.*)", re.UNICODE)
 
-    TRAILING_SPACES_RE = regex.compile(u"\\p{Z}+$")
+    TRAILING_SPACES_RE = re.compile(u"\\s+$", re.UNICODE)
 
-    PRELUDE_NAME_RE = regex.compile(u"^name: (.*)")
-    PRELUDE_URL_RE = regex.compile(u"^url: (.*)")
-    PRELUDE_AUTHOR_RE = regex.compile(u"^by: (.*)")
-    PRELUDE_LICENSE_RE = regex.compile(u"^term: (.*)")
-    PRELUDE_ABOUT_RE = regex.compile(u"^about: ?(.*)")
+    PRELUDE_NAME_RE = re.compile(u"^name: (.*)", re.UNICODE)
+    PRELUDE_URL_RE = re.compile(u"^url: (.*)", re.UNICODE)
+    PRELUDE_AUTHOR_RE = re.compile(u"^by: (.*)", re.UNICODE)
+    PRELUDE_LICENSE_RE = re.compile(u"^term: (.*)", re.UNICODE)
+    PRELUDE_ABOUT_RE = re.compile(u"^about: ?(.*)", re.UNICODE)
 
     def __init__(self):
         pass
@@ -161,6 +161,7 @@
             self.eof = len(self.line) == 0
             if not self.eof:
                 self.lineno += 1
+            self.line = self.line.rstrip('\n')
             if self.TRAILING_SPACES_RE.search(self.line):
                 raise ParseException("Traling spaces detected...\n")
             if self.COMMENT_RE.search(self.line):
@@ -191,7 +192,7 @@
             m = self.CONT_RE.match(self.line)
             if m is not None:
                 string += "\n" + m.group(1)
-            elif len(self.line) == 1:
+            elif len(self.line) == 0:
                 string += "\n"
             else:
                 return string
@@ -239,7 +240,7 @@
 
     def parse_empty_line(self):
         self.readline()
-        if self.eof or len(self.line) != 1:
+        if self.eof or len(self.line) != 0:
             raise ParseException(""""__" delimiter should followed by empty line...""")
 
     def parse_headlines(self):
@@ -256,7 +257,7 @@
         attrs = set()
         while True:
             self.readline()
-            if self.eof or len(self.line) == 1:
+            if self.eof or len(self.line) == 0:
                 break
             m = self.HEADWORD_RE.match(self.line)
             if m is not None:
@@ -309,7 +310,7 @@
                 if sense:
                     senses.append(sense)
                 break
-            if len(self.line) == 1:
+            if len(self.line) == 0:
                 if sense:
                     senses.append(sense)
                 sense = None

--- a/py/gadict_c5.py	Tue Feb 21 10:03:54 2017 +0200
+++ b/py/gadict_c5.py	Tue Feb 21 10:10:03 2017 +0200
@@ -4,7 +4,7 @@
 import io
 import sys
 import codecs
-import regex
+import re
 
 import gadict
 import gadict_freq
@@ -16,9 +16,9 @@
 FREQ_SOURCES = []
 
 # -lang:ru,uk
-ARG_LANG_RE = regex.compile("-lang:(.+)")
+ARG_LANG_RE = re.compile("-lang:(.+)")
 # -freq:var:TAG=FILE or -freq:freq:TAG=FILE
-ARG_FREQ_RE = regex.compile("-freq:(freq|var):([^=]+)=(.+)")
+ARG_FREQ_RE = re.compile("-freq:(freq|var):([^=]+)=(.+)")
 
 look_for_files = False
 for idx in range(1, len(sys.argv)):

--- a/py/gadict_freq.py	Tue Feb 21 10:03:54 2017 +0200
+++ b/py/gadict_freq.py	Tue Feb 21 10:10:03 2017 +0200
@@ -2,7 +2,7 @@
 import sys
 import codecs
 import io
-import regex
+import re
 
 class WordlistParser:
 
@@ -21,7 +21,7 @@
 
 class WordformParser:
 
-    BASEVAR_RE = regex.compile(u"^(\t)?(.*)$")
+    BASEVAR_RE = re.compile(u"^(\t)?(.*)$", re.UNICODE)
 
     def __init__(self, stream, limit = None):
         self.stream = stream
@@ -50,7 +50,7 @@
 
 class FreqlistParser:
 
-    FREQ_RE = regex.compile(u"^([0-9]+) (.*)$")
+    FREQ_RE = re.compile(u"^([0-9]+) (.*)$", re.UNICODE)
 
     def __init__(self, stream, limit = None):
         self.stream = stream
@@ -80,7 +80,7 @@
         raise Exception(USAGE)
     FINAME = sys.argv[1]
 
-    COMMAND_RE = regex.compile("([-+])([0-9]+)?([bf]):([^:]+)")
+    COMMAND_RE = re.compile("([-+])([0-9]+)?([bf]):([^:]+)")
 
     IN_SET = set()
     EX_SET = set()

--- a/py/gadict_headwords.py	Tue Feb 21 10:03:54 2017 +0200
+++ b/py/gadict_headwords.py	Tue Feb 21 10:10:03 2017 +0200
@@ -2,7 +2,7 @@
 import sys
 import codecs
 import io
-import regex
+import re
 
 FINAME = None
 FONAME = None
@@ -20,10 +20,10 @@
 
 class GadictParser:
 
-    SEPARATOR_RE = regex.compile(u"^__$")
-    EMPTY_RE = regex.compile( u"^$" )
-    HEADWORD_ATTR_RE = regex.compile( u"^ " )
-    HEADWORD_RE = regex.compile( u"^(\\p{L}.*)$" )
+    SEPARATOR_RE = re.compile(u"^__$", re.UNICODE)
+    EMPTY_RE = re.compile( u"^$" )
+    HEADWORD_ATTR_RE = re.compile( u"^ " )
+    HEADWORD_RE = re.compile(u"^(\\w.*)$", re.UNICODE)
 
     def __init__(self, stream):
         self.stream = stream

author	Oleksandr Gavenko <gavenkoa@gmail.com>
	Tue, 21 Feb 2017 10:10:03 +0200
changeset 757	5417f2102dc5
parent 756	c1d3555458ad
child 758	622415807da0

py/gadict.py		file | annotate | diff | comparison | revisions
py/gadict_c5.py		file | annotate | diff | comparison | revisions
py/gadict_freq.py		file | annotate | diff | comparison | revisions
py/gadict_headwords.py		file | annotate | diff | comparison | revisions