# HG changeset patch
# User Oleksandr Gavenko
# Date 1458774707 -7200
# Node ID 18284ce77c7ab7cf16f02e3e45f1d69a33620f6a
# Parent 3af39c0b52299af3706a3f905c1c1c5acf608cb1
# gadict format parser.
#
# Content of gadict.py as added by:
#   diff -r 3af39c0b5229 -r 18284ce77c7a gadict.py
#
# NOTE(review): this text was recovered from a whitespace-collapsed copy of
# the patch.  Block structure was rebuilt from syntax, and runs of spaces
# inside string literals and patterns are reproduced as single spaces because
# that is all the collapsed copy shows -- confirm against the repository.
"""gadict format parser.

Usage: gadict.py INPUT [OUTPUT]

Parse the gadict dictionary source INPUT (UTF-8) and write a plain-text
rendering of the parsed articles to OUTPUT, or to standard output when
OUTPUT is omitted.
"""

import io
import sys

# The third-party ``regex`` module is used instead of ``re`` because the
# patterns below rely on Unicode property classes such as \p{L} and \p{Z}.
import regex


# The input file name is mandatory; without it there is nothing to open.
if len(sys.argv) < 2:
    sys.exit("usage: {:s} INPUT [OUTPUT]".format(sys.argv[0]))
fgadict = sys.argv[1]
fnout = sys.argv[2] if len(sys.argv) >= 3 else None


class ParseException(Exception):
    """Syntax error detected while parsing a gadict source."""

    def __init__(self, msg):
        super().__init__(msg)
        self.msg = msg

    def __repr__(self):
        return self.msg


class Parser:
    """Line-oriented parser for the gadict dictionary format.

    Layout accepted by the parser:

    * a prelude, skipped up to the first ``__`` delimiter line;
    * articles, each introduced by a ``__`` line followed by an empty line;
    * inside an article, one or more headword lines (starting with a
      letter), each optionally followed by indented pronunciation
      (``[...]``) and attribute (``s``, ``pl``, ``v1`` ...) lines;
    * an empty line, then translation senses separated by empty lines.  A
      sense may hold one part-of-speech marker, ``LANG: text`` translations
      and ``LANG> text`` examples, where LANG is ru, uk, la or en.

    ``parse`` reports syntax errors through the module-level ``fout``
    stream, labelled with the module-level ``fgadict`` file name.
    """

    SEPARATOR_RE = regex.compile(r"^__$")
    HEADWORD_RE = regex.compile(r"^(\p{L}.*)$")
    HEADWORD_VAR_RE = regex.compile(r"^ +(s|pl|v[123]|male|female|comp|super)$")
    HEADWORD_PRON_RE = regex.compile(r"^ +\[([\p{L}' ]+)\]$")
    # The alternation is grouped so that "^" and "$" apply to every
    # alternative; ungrouped, any line merely starting with "n", "v", "adj"
    # etc. was taken for a part-of-speech marker.
    TRANSL_POS_RE = regex.compile(r"^(?:n|pron|adj|v|adv|prep|conj|num|int|phr\.v|abbr)$")
    TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(][\p{L}\p{P}~ ]*)$")
    TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> (\p{L}.*)$")

    TRAILING_SPACES_RE = regex.compile(r"\p{Z}+$")

    def __init__(self):
        pass

    def readline(self):
        """Read the next line into ``self.line`` and update ``self.eof``.

        ``self.lineno`` is advanced only when a line was actually read.
        """
        self.line = self.stream.readline()
        self.eof = len(self.line) == 0
        if not self.eof:
            self.lineno += 1

    def parse(self, stream):
        """Parse *stream* and return the list of articles.

        Each article is a ``(words, senses)`` tuple: ``words`` maps a
        headword to ``(pronunciation or None, set of attributes)`` and
        ``senses`` is a list of ``(pos or None, translations, examples)``
        where translations and examples are lists of ``(lang, text)``.

        On a syntax error a ``file:line: message`` diagnostic is written to
        ``fout`` and a plain ``Exception`` wrapping the error is raised.
        """
        self.lineno = 0
        self.stream = stream
        self.dom = []
        try:
            self.parse_prelude()
            while not self.eof:
                self.parse_article()
        except ParseException as ex:
            # search(), not match(): the pattern has no leading anchor, so
            # match() only ever recognised lines made entirely of spaces.
            if self.TRAILING_SPACES_RE.search(self.line):
                fout.write("{:s}:{:d}: {:s}".format(fgadict, self.lineno, "Traling spaces detected...\n"))
            # NOTE(review): assumed to run for every error, not only when
            # trailing spaces were detected -- confirm.
            fout.write("{:s}:{:d}: {:s}\nLINE: {:s}\n".format(fgadict, self.lineno, str(ex), self.line))
            raise Exception(ex)
        return self.dom

    def parse_prelude(self):
        """Read dictionary prelude until first "__" delimiter."""
        while True:
            self.readline()
            if self.eof:
                raise ParseException("There are no articles...")
            if self.SEPARATOR_RE.match(self.line):
                break

    def parse_article(self):
        """Parse one article and append it to ``self.dom``.

        Assumes that ``self.line`` holds the "__" delimiter that opens the
        article.
        """
        self.words = None
        self.tran = None
        self.parse_empty_line()
        self.parse_headlines()
        self.parse_translation()
        self.dom.append((self.words, self.tran))

    def parse_empty_line(self):
        """Consume the mandatory empty line that follows a "__" delimiter."""
        self.readline()
        # An empty line is exactly the newline character.
        if self.eof or len(self.line) != 1:
            raise ParseException('"__" delimiter should followed by empty line...')

    def parse_headlines(self):
        """Parse headwords with their pronunciation and attributes.

        Assumes that ``self.line`` holds the empty line preceding the
        headwords.  Fills ``self.words`` and stops at the next empty line
        or at end of input.
        """
        self.words = {}
        self.readline()
        if self.eof:
            raise ParseException('There are no definition after "__" delimiter...')
        m = self.HEADWORD_RE.match(self.line)
        if m is None:
            raise ParseException('There are no headword after "__" delimiter...')
        word = m.group(1)
        pron = None
        attrs = set()
        while True:
            self.readline()
            if self.eof or len(self.line) == 1:
                break
            m = self.HEADWORD_RE.match(self.line)
            if m is not None:
                # A new headword starts: store the one collected so far.
                self.words[word] = (pron, attrs)
                word = m.group(1)
                pron = None
                attrs = set()
                continue
            m = self.HEADWORD_PRON_RE.match(self.line)
            if m is not None:
                if pron is not None:
                    raise ParseException("Pronunciation is redefined...")
                pron = m.group(1)
                continue
            m = self.HEADWORD_VAR_RE.match(self.line)
            if m is not None:
                attrs.add(m.group(1))
                continue
            raise ParseException("Line is not headword or translation or headword attribute...")
        self.words[word] = (pron, attrs)

    def parse_translation(self):
        """Parse translation senses up to the next "__" delimiter or EOF.

        Fills ``self.tran`` with ``(pos, translations, examples)`` tuples.
        An empty line closes the current sense; the last sense is kept only
        when it holds at least one translation.
        """
        senses = []
        pos = None
        tr = []
        ex = []
        while True:
            self.readline()
            if self.eof:
                break
            if self.SEPARATOR_RE.match(self.line) is not None:
                break
            if len(self.line) == 1:
                senses.append((pos, tr, ex))
                pos = None
                tr = []
                ex = []
                continue
            m = self.TRANSL_POS_RE.match(self.line)
            if m is not None:
                if pos is not None:
                    raise ParseException("Each translation should have only one part of speech marker...")
                pos = m.group(0)
                continue
            m = self.TRANSL_RE.match(self.line)
            if m is not None:
                tr.append((m.group(1), m.group(2)))
                continue
            m = self.TRANSL_EX_RE.match(self.line)
            if m is not None:
                ex.append((m.group(1), m.group(2)))
                continue
            raise ParseException("Uknown syntax...")
        if tr:
            senses.append((pos, tr, ex))
        self.tran = senses


def write_article(out, article):
    """Write one parsed article to the text stream *out*.

    *article* is a ``(words, senses)`` tuple as produced by
    ``Parser.parse``.  Headwords are listed with their pronunciation and
    sorted attributes; for each sense the part-of-speech marker and the
    first Russian translation are printed.
    """
    words, senses = article
    out.write("_____\n\n")
    out.write("; ".join(words))
    out.write("\n\n")
    for word, (pron, attrs) in words.items():
        out.write(" ")
        out.write(word)
        out.write("\n")
        if pron is not None:
            out.write(" [")
            out.write(pron)
            out.write("]\n")
        if attrs:
            out.write(" ")
            out.write(", ".join(sorted(attrs)))
            out.write("\n")
    # NOTE(review): the blank line is assumed to follow the whole headword
    # block rather than each headword -- confirm.
    out.write("\n")
    for pos, trs, _exs in senses:
        out.write(" ")
        if pos is not None:
            out.write("⟨")
            out.write(pos)
            out.write("⟩ ")
        for lang, tr in trs:
            if lang == "ru":
                out.write(tr)
                break
        out.write("\n")


fin = io.open(fgadict, mode='r', buffering=1, encoding="utf-8")
try:
    if fnout is None:
        fout = sys.stdout
    else:
        # Explicit UTF-8: the output contains non-ASCII characters, so the
        # platform default encoding cannot be relied on.
        fout = io.open(fnout, mode="w", encoding="utf-8")
    try:
        parser = Parser()
        dom = parser.parse(fin)
        # NOTE(review): the first parsed article is skipped, as in the
        # original loop (range(1, len(dom))) -- confirm this is intended.
        for article in dom[1:]:
            write_article(fout, article)
    finally:
        if fout is not sys.stdout:
            fout.close()
finally:
    fin.close()