# HG changeset patch # User Oleksandr Gavenko # Date 1487664603 -7200 # Node ID 5417f2102dc5cb356c1d765e41a8fa8fa2264c1f # Parent c1d3555458ad2f1b9cbf92170ed87c647c6825ae Switch to built-in `re` Python module over `regex`. diff -r c1d3555458ad -r 5417f2102dc5 py/gadict.py --- a/py/gadict.py Tue Feb 21 10:03:54 2017 +0200 +++ b/py/gadict.py Tue Feb 21 10:10:03 2017 +0200 @@ -4,7 +4,7 @@ """ import sys -import regex +import re class Prelude: @@ -125,32 +125,32 @@ class Parser: """gadict dictionary format parser.""" - COMMENT_RE = regex.compile(r"^# ") + COMMENT_RE = re.compile("^# ") - SEPARATOR_RE = regex.compile(u"^__$") - HEADWORD_RE = regex.compile( u"^(\\p{L}.*)$" ) - HEADWORD_VAR_RE = regex.compile(u"^ +(rare|s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$") - HEADWORD_PRON_RE = regex.compile(u"^ +\\[([a-zˌˈːəæɛɒʊɪɔɜɑʌɚɹʃʧθðɡʒŋɾ ]+)\\]$") - TRANSL_POS_RE = regex.compile(u"^(?:n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\\.v|contr|abbr|prefix)$") - TRANSL_RE = regex.compile(u"^(ru|uk|la|en): ([\\p{L}(].*)$") - TRANSL_EX_RE = regex.compile(u"""^(ru|uk|la|en)> ([-'"\\p{L}].*)$""") - TRANSL_GLOS_RE = regex.compile(u"^(ru|uk|la|en)= ([-\\p{L}\\p{N}].*)$") - TOPIC_RE = regex.compile(u"^topic: (\\p{L}.*)$") - SYN_RE = regex.compile(u"^syn: (\\p{L}.*)$") - ANT_RE = regex.compile(u"^ant: (\\p{L}.*)$") - REL_RE = regex.compile(u"^rel: (\\p{L}.*)$") - HYPER_RE = regex.compile(u"^hyper: (\\p{L}.*)$") - HYPO_RE = regex.compile(u"^hypo: (\\p{L}.*)$") + SEPARATOR_RE = re.compile(u"^__$", re.UNICODE) + HEADWORD_RE = re.compile( u"^(\\w.*)$" ) + HEADWORD_VAR_RE = re.compile(u"^ +(rare|s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$", re.UNICODE) + HEADWORD_PRON_RE = re.compile(u"^ +\\[([a-zˌˈːəæɛɒʊɪɔɜɑʌɚɹʃʧθðɡʒŋɾ ]+)\\]$", re.UNICODE) + TRANSL_POS_RE = re.compile(u"^(?:n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\\.v|contr|abbr|prefix)$", re.UNICODE) + TRANSL_RE = re.compile(u"^(ru|uk|la|en): ([\\w(].*)$", re.UNICODE) + TRANSL_EX_RE = re.compile(u"""^(ru|uk|la|en)> ([-'"\\w].*)$""", re.UNICODE) + TRANSL_GLOS_RE = re.compile(u"^(ru|uk|la|en)= ([-\\w\\d].*)$", re.UNICODE) + TOPIC_RE = re.compile(u"^topic: (\\w.*)$", re.UNICODE) + SYN_RE = re.compile(u"^syn: (\\w.*)$", re.UNICODE) + ANT_RE = re.compile(u"^ant: (\\w.*)$", re.UNICODE) + REL_RE = re.compile(u"^rel: (\\w.*)$", re.UNICODE) + HYPER_RE = re.compile(u"^hyper: (\\w.*)$", re.UNICODE) + HYPO_RE = re.compile(u"^hypo: (\\w.*)$", re.UNICODE) - CONT_RE = regex.compile(u"^ +(.*)") + CONT_RE = re.compile(u"^ +(.*)", re.UNICODE) - TRAILING_SPACES_RE = regex.compile(u"\\p{Z}+$") + TRAILING_SPACES_RE = re.compile(u"\\s+$", re.UNICODE) - PRELUDE_NAME_RE = regex.compile(u"^name: (.*)") - PRELUDE_URL_RE = regex.compile(u"^url: (.*)") - PRELUDE_AUTHOR_RE = regex.compile(u"^by: (.*)") - PRELUDE_LICENSE_RE = regex.compile(u"^term: (.*)") - PRELUDE_ABOUT_RE = regex.compile(u"^about: ?(.*)") + PRELUDE_NAME_RE = re.compile(u"^name: (.*)", re.UNICODE) + PRELUDE_URL_RE = re.compile(u"^url: (.*)", re.UNICODE) + PRELUDE_AUTHOR_RE = re.compile(u"^by: (.*)", re.UNICODE) + PRELUDE_LICENSE_RE = re.compile(u"^term: (.*)", re.UNICODE) + PRELUDE_ABOUT_RE = re.compile(u"^about: ?(.*)", re.UNICODE) def __init__(self): pass @@ -161,6 +161,7 @@ self.eof = len(self.line) == 0 if not self.eof: self.lineno += 1 + self.line = self.line.rstrip('\n') if self.TRAILING_SPACES_RE.search(self.line): raise ParseException("Traling spaces detected...\n") if self.COMMENT_RE.search(self.line): @@ -191,7 +192,7 @@ m = self.CONT_RE.match(self.line) if m is not None: string += "\n" + m.group(1) - elif len(self.line) == 1: + elif len(self.line) == 0: string += "\n" else: return string @@ -239,7 +240,7 @@ def parse_empty_line(self): self.readline() - if self.eof or len(self.line) != 1: + if self.eof or len(self.line) != 0: raise ParseException(""""__" delimiter should followed by empty line...""") def parse_headlines(self): @@ -256,7 +257,7 @@ attrs = set() while True: self.readline() - if self.eof or len(self.line) == 1: + if self.eof or len(self.line) == 0: break m = self.HEADWORD_RE.match(self.line) if m is not None: @@ -309,7 +310,7 @@ if sense: senses.append(sense) break - if len(self.line) == 1: + if len(self.line) == 0: if sense: senses.append(sense) sense = None diff -r c1d3555458ad -r 5417f2102dc5 py/gadict_c5.py --- a/py/gadict_c5.py Tue Feb 21 10:03:54 2017 +0200 +++ b/py/gadict_c5.py Tue Feb 21 10:10:03 2017 +0200 @@ -4,7 +4,7 @@ import io import sys import codecs -import regex +import re import gadict import gadict_freq @@ -16,9 +16,9 @@ FREQ_SOURCES = [] # -lang:ru,uk -ARG_LANG_RE = regex.compile("-lang:(.+)") +ARG_LANG_RE = re.compile("-lang:(.+)") # -freq:var:TAG=FILE or -freq:freq:TAG=FILE -ARG_FREQ_RE = regex.compile("-freq:(freq|var):([^=]+)=(.+)") +ARG_FREQ_RE = re.compile("-freq:(freq|var):([^=]+)=(.+)") look_for_files = False for idx in range(1, len(sys.argv)): diff -r c1d3555458ad -r 5417f2102dc5 py/gadict_freq.py --- a/py/gadict_freq.py Tue Feb 21 10:03:54 2017 +0200 +++ b/py/gadict_freq.py Tue Feb 21 10:10:03 2017 +0200 @@ -2,7 +2,7 @@ import sys import codecs import io -import regex +import re class WordlistParser: @@ -21,7 +21,7 @@ class WordformParser: - BASEVAR_RE = regex.compile(u"^(\t)?(.*)$") + BASEVAR_RE = re.compile(u"^(\t)?(.*)$", re.UNICODE) def __init__(self, stream, limit = None): self.stream = stream @@ -50,7 +50,7 @@ class FreqlistParser: - FREQ_RE = regex.compile(u"^([0-9]+) (.*)$") + FREQ_RE = re.compile(u"^([0-9]+) (.*)$", re.UNICODE) def __init__(self, stream, limit = None): self.stream = stream @@ -80,7 +80,7 @@ raise Exception(USAGE) FINAME = sys.argv[1] - COMMAND_RE = regex.compile("([-+])([0-9]+)?([bf]):([^:]+)") + COMMAND_RE = re.compile("([-+])([0-9]+)?([bf]):([^:]+)") IN_SET = set() EX_SET = set() diff -r c1d3555458ad -r 5417f2102dc5 py/gadict_headwords.py --- a/py/gadict_headwords.py Tue Feb 21 10:03:54 2017 +0200 +++ b/py/gadict_headwords.py Tue Feb 21 10:10:03 2017 +0200 @@ -2,7 +2,7 @@ import sys import codecs import io -import regex +import re FINAME = None FONAME = None @@ -20,10 +20,10 @@ class GadictParser: - SEPARATOR_RE = regex.compile(u"^__$") - EMPTY_RE = regex.compile( u"^$" ) - HEADWORD_ATTR_RE = regex.compile( u"^ " ) - HEADWORD_RE = regex.compile( u"^(\\p{L}.*)$" ) + SEPARATOR_RE = re.compile(u"^__$", re.UNICODE) + EMPTY_RE = re.compile( u"^$" ) + HEADWORD_ATTR_RE = re.compile( u"^ " ) + HEADWORD_RE = re.compile(u"^(\\w.*)$", re.UNICODE) def __init__(self, stream): self.stream = stream