# HG changeset patch # User Oleksandr Gavenko # Date 1459086254 -10800 # Node ID 2756a6deca7ee85a6347b727541e39236a78de6c # Parent 4657b44ad9afa74467f5cf7568711f4d9781f7c8 Move python files to separate directory. diff -r 4657b44ad9af -r 2756a6deca7e Makefile --- a/Makefile Sun Mar 27 16:35:30 2016 +0300 +++ b/Makefile Sun Mar 27 16:44:14 2016 +0300 @@ -322,7 +322,7 @@ dictzip -c $< >$@ dist/dictd/%.c5: %.gadict | dist/dictd - python3 gadict.py $< $@ + python3 py/gadict.py $< $@ dist/dictd: mkdir -p $@ diff -r 4657b44ad9af -r 2756a6deca7e gadict.py --- a/gadict.py Sun Mar 27 16:35:30 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,206 +0,0 @@ - -import io -import sys -# import re -import regex - - -# fgadict = "gadict_en-ru+ua.gadict" -fgadict = None -fnout = None -if len(sys.argv) >= 2: - fgadict = sys.argv[1] -if len(sys.argv) >= 3: - fnout = sys.argv[2] - -fin = io.open(fgadict, mode='r', buffering=1, encoding="utf-8") -if fnout is None: - fout = sys.stdout -else: - fout = open(fnout, "w") - - -class ParseException(Exception): - - def __init__(self, msg): - self.msg = msg - - def __repr__(self): - return self.msg - - -class Parser: - - SEPARATOR_RE = regex.compile(r"^__$") - HEADWORD_RE = regex.compile(r"^(\p{L}.*)$") - HEADWORD_VAR_RE = regex.compile(r"^ +(s|pl|v[123]|male|female|comp|super)$") - HEADWORD_PRON_RE = regex.compile(r"^ +\[([\p{L}' ]+)\]$") - TRANSL_POS_RE = regex.compile(r"^n|pron|adj|v|adv|prep|conj|num|int|phr\.v|abbr$") - TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(][\p{L}\p{P}~ ]*)$") - TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> (\p{L}.*)$") - - TRAILING_SPACES_RE = regex.compile(r"\p{Z}+$") - - def __init__(self): - pass - - def readline(self): - self.line = self.stream.readline() - self.eof = len(self.line) == 0 - if not self.eof: - self.lineno += 1 - - def parse(self, stream): - self.lineno = 0 - self.stream = stream - self.dom = [] - try: - self.parse_prelude() - while not self.eof: - self.parse_article() - except ParseException as ex: - if self.TRAILING_SPACES_RE.match(self.line): - fout.write("{:s}:{:d}: {:s}".format(fgadict, self.lineno, "Traling spaces detected...\n")) - fout.write("{:s}:{:d}: {:s}\nLINE: {:s}\n".format(fgadict, self.lineno, str(ex), self.line)) - raise Exception(ex) - return self.dom - - def parse_prelude(self): - """Read dictionary prelude until first "__" delimiter.""" - while True: - self.readline() - if self.eof: - raise ParseException("There are no articles...") - if self.SEPARATOR_RE.match(self.line): - break - - def parse_article(self): - """Try to match article until next "__" delimiter. Assume that `self.line` point to "__" delimiter.""" - self.words = None - self.tran = None - self.parse_empty_line() - self.parse_headlines() - self.parse_translation() - self.dom.append((self.words, self.tran)) - - def parse_empty_line(self): - self.readline() - if self.eof or len(self.line) != 1: - raise ParseException(""""__" delimiter should followed by empty line...""") - - def parse_headlines(self): - """Try to match word variations with attributed. Assume that `self.line` on preceding empty line.""" - self.words = {} - self.readline() - if self.eof: - raise ParseException("""There are no definition after "__" delimiter...""") - m = self.HEADWORD_RE.match(self.line) - if m is None: - raise ParseException("""There are no headword after "__" delimiter...""") - word = m.group(1) - pron = None - attrs = set() - while True: - self.readline() - if self.eof or len(self.line) == 1: - break - m = self.HEADWORD_RE.match(self.line) - if m is not None: - if word is None: - raise ParseException("""Didn't match previous headword...""") - self.words[word] = (pron, attrs) - word = m.group(1) - pron = None - attrs = set() - continue - m = self.HEADWORD_PRON_RE.match(self.line) - if m is not None: - if pron is not None: - raise ParseException("""Pronunciation is redefined...""") - pron = m.group(1) - continue - m = self.HEADWORD_VAR_RE.match(self.line) - if m is not None: - attrs.add(m.group(1)) - continue - raise ParseException("""Line is not headword or translation or headword attribute...""") - self.words[word] = (pron, attrs) - - def parse_translation(self): - senses = [] - pos = None - tr = [] - ex = [] - while True: - self.readline() - if self.eof: - break - m = self.SEPARATOR_RE.match(self.line) - if m is not None: - break - if len(self.line) == 1: - senses.append((pos, tr, ex)) - pos = None - tr = [] - ex = [] - continue - m = self.TRANSL_POS_RE.match(self.line) - if m is not None: - if pos is not None: - raise ParseException("""Each translation should have only one part of speech marker...""") - pos = m.group(0) - continue - m = self.TRANSL_RE.match(self.line) - if m is not None: - tr.append((m.group(1), m.group(2))) - continue - m = self.TRANSL_EX_RE.match(self.line) - if m is not None: - ex.append((m.group(1), m.group(2))) - continue - raise ParseException("""Uknown syntax...""") - if len(tr) > 0: - senses.append((pos, tr, ex)) - self.tran = senses - -parser = Parser() -dom = parser.parse(fin) -fin.close() - -for idx in range(1, len(dom)): - article = dom[idx] - fout.write("_____\n\n") - title = "; ".join(article[0].keys()) - fout.write(title) - fout.write("\n\n") - for (word, (pron, attrs)) in article[0].items(): - if word == "approach": - fout.write(str(article[0])) - fout.write(" ") - fout.write(word) - fout.write("\n") - if pron is not None: - fout.write(" [") - fout.write(pron) - fout.write("]\n") - if len(attrs) > 0: - fout.write(" ") - l = list(attrs) - l.sort() - fout.write(", ".join(l)) - fout.write("\n") - fout.write("\n") - for (pos, trs, exs) in article[1]: - fout.write(" ") - if pos is not None: - fout.write("⟨") - fout.write(pos) - fout.write("⟩ ") - for (lang, tr) in trs: - if lang == "ru": - fout.write(tr) - break - fout.write("\n") - - # fout.write(str(article[0])+"\n") - diff -r 4657b44ad9af -r 2756a6deca7e py/gadict.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/py/gadict.py Sun Mar 27 16:44:14 2016 +0300 @@ -0,0 +1,206 @@ + +import io +import sys +# import re +import regex + + +# fgadict = "gadict_en-ru+ua.gadict" +fgadict = None +fnout = None +if len(sys.argv) >= 2: + fgadict = sys.argv[1] +if len(sys.argv) >= 3: + fnout = sys.argv[2] + +fin = io.open(fgadict, mode='r', buffering=1, encoding="utf-8") +if fnout is None: + fout = sys.stdout +else: + fout = open(fnout, "w") + + +class ParseException(Exception): + + def __init__(self, msg): + self.msg = msg + + def __repr__(self): + return self.msg + + +class Parser: + + SEPARATOR_RE = regex.compile(r"^__$") + HEADWORD_RE = regex.compile(r"^(\p{L}.*)$") + HEADWORD_VAR_RE = regex.compile(r"^ +(s|pl|v[123]|male|female|comp|super)$") + HEADWORD_PRON_RE = regex.compile(r"^ +\[([\p{L}' ]+)\]$") + TRANSL_POS_RE = regex.compile(r"^n|pron|adj|v|adv|prep|conj|num|int|phr\.v|abbr$") + TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(][\p{L}\p{P}~ ]*)$") + TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> (\p{L}.*)$") + + TRAILING_SPACES_RE = regex.compile(r"\p{Z}+$") + + def __init__(self): + pass + + def readline(self): + self.line = self.stream.readline() + self.eof = len(self.line) == 0 + if not self.eof: + self.lineno += 1 + + def parse(self, stream): + self.lineno = 0 + self.stream = stream + self.dom = [] + try: + self.parse_prelude() + while not self.eof: + self.parse_article() + except ParseException as ex: + if self.TRAILING_SPACES_RE.match(self.line): + fout.write("{:s}:{:d}: {:s}".format(fgadict, self.lineno, "Traling spaces detected...\n")) + fout.write("{:s}:{:d}: {:s}\nLINE: {:s}\n".format(fgadict, self.lineno, str(ex), self.line)) + raise Exception(ex) + return self.dom + + def parse_prelude(self): + """Read dictionary prelude until first "__" delimiter.""" + while True: + self.readline() + if self.eof: + raise ParseException("There are no articles...") + if self.SEPARATOR_RE.match(self.line): + break + + def parse_article(self): + """Try to match article until next "__" delimiter. Assume that `self.line` point to "__" delimiter.""" + self.words = None + self.tran = None + self.parse_empty_line() + self.parse_headlines() + self.parse_translation() + self.dom.append((self.words, self.tran)) + + def parse_empty_line(self): + self.readline() + if self.eof or len(self.line) != 1: + raise ParseException(""""__" delimiter should followed by empty line...""") + + def parse_headlines(self): + """Try to match word variations with attributed. Assume that `self.line` on preceding empty line.""" + self.words = {} + self.readline() + if self.eof: + raise ParseException("""There are no definition after "__" delimiter...""") + m = self.HEADWORD_RE.match(self.line) + if m is None: + raise ParseException("""There are no headword after "__" delimiter...""") + word = m.group(1) + pron = None + attrs = set() + while True: + self.readline() + if self.eof or len(self.line) == 1: + break + m = self.HEADWORD_RE.match(self.line) + if m is not None: + if word is None: + raise ParseException("""Didn't match previous headword...""") + self.words[word] = (pron, attrs) + word = m.group(1) + pron = None + attrs = set() + continue + m = self.HEADWORD_PRON_RE.match(self.line) + if m is not None: + if pron is not None: + raise ParseException("""Pronunciation is redefined...""") + pron = m.group(1) + continue + m = self.HEADWORD_VAR_RE.match(self.line) + if m is not None: + attrs.add(m.group(1)) + continue + raise ParseException("""Line is not headword or translation or headword attribute...""") + self.words[word] = (pron, attrs) + + def parse_translation(self): + senses = [] + pos = None + tr = [] + ex = [] + while True: + self.readline() + if self.eof: + break + m = self.SEPARATOR_RE.match(self.line) + if m is not None: + break + if len(self.line) == 1: + senses.append((pos, tr, ex)) + pos = None + tr = [] + ex = [] + continue + m = self.TRANSL_POS_RE.match(self.line) + if m is not None: + if pos is not None: + raise ParseException("""Each translation should have only one part of speech marker...""") + pos = m.group(0) + continue + m = self.TRANSL_RE.match(self.line) + if m is not None: + tr.append((m.group(1), m.group(2))) + continue + m = self.TRANSL_EX_RE.match(self.line) + if m is not None: + ex.append((m.group(1), m.group(2))) + continue + raise ParseException("""Uknown syntax...""") + if len(tr) > 0: + senses.append((pos, tr, ex)) + self.tran = senses + +parser = Parser() +dom = parser.parse(fin) +fin.close() + +for idx in range(1, len(dom)): + article = dom[idx] + fout.write("_____\n\n") + title = "; ".join(article[0].keys()) + fout.write(title) + fout.write("\n\n") + for (word, (pron, attrs)) in article[0].items(): + if word == "approach": + fout.write(str(article[0])) + fout.write(" ") + fout.write(word) + fout.write("\n") + if pron is not None: + fout.write(" [") + fout.write(pron) + fout.write("]\n") + if len(attrs) > 0: + fout.write(" ") + l = list(attrs) + l.sort() + fout.write(", ".join(l)) + fout.write("\n") + fout.write("\n") + for (pos, trs, exs) in article[1]: + fout.write(" ") + if pos is not None: + fout.write("⟨") + fout.write(pos) + fout.write("⟩ ") + for (lang, tr) in trs: + if lang == "ru": + fout.write(tr) + break + fout.write("\n") + + # fout.write(str(article[0])+"\n") +