gadict: py/gadict.py@2756a6deca7e



import io
import sys
# import re
import regex


# Command line: <script> INPUT [OUTPUT]
#   INPUT  - dictionary source, e.g. gadict_en-ru+ua.gadict (read as UTF-8);
#   OUTPUT - optional result file; standard output is used when omitted.
fgadict = None
fnout = None
if len(sys.argv) >= 2:
    fgadict = sys.argv[1]
if len(sys.argv) >= 3:
    fnout = sys.argv[2]

if fgadict is None:
    # Without this guard io.open(None, ...) fails with an unhelpful TypeError.
    sys.exit("Usage: {:s} INPUT [OUTPUT]".format(sys.argv[0]))

fin = io.open(fgadict, mode='r', buffering=1, encoding="utf-8")
if fnout is None:
    fout = sys.stdout
else:
    # The output carries the same non-ASCII text that is read from the UTF-8
    # input, so it must not depend on the locale's default encoding.
    fout = io.open(fnout, mode='w', encoding="utf-8")


class ParseException(Exception):
    """Syntax error found while parsing the dictionary source.

    The human-readable description is stored in ``msg`` and is also what
    ``str()`` and ``repr()`` return.
    """

    def __init__(self, msg):
        # Initialise the base class explicitly so that ``args`` and ``str()``
        # do not depend on interpreter-specific handling in ``__new__``.
        super(ParseException, self).__init__(msg)
        self.msg = msg

    def __repr__(self):
        # Kept as the bare message for compatibility with existing behavior.
        return self.msg


class Parser:
    """Line-oriented parser for the dictionary source format.

    Layout enforced by the methods below: a free-form prelude, then articles,
    each introduced by a ``__`` line followed by an empty line.  An article is
    a group of headword lines (each optionally followed by indented
    pronunciation / attribute lines), an empty line, and then translation
    groups ("senses") separated by empty lines.

    ``parse()`` returns a list of ``(words, senses)`` tuples where

    * ``words`` maps a headword to ``(pronunciation or None, set of attrs)``;
    * ``senses`` is a list of
      ``(pos or None, [(lang, translation), ...], [(lang, example), ...])``.

    Error reports are written to the module-level ``fout`` and mention the
    module-level ``fgadict`` file name.
    """

    # Article delimiter.
    SEPARATOR_RE = regex.compile(r"^__$")
    # Headword: a line starting (at column 0) with a letter.
    HEADWORD_RE = regex.compile(r"^(\p{L}.*)$")
    # Indented attribute marker of the preceding headword.
    HEADWORD_VAR_RE = regex.compile(r"^ +(s|pl|v[123]|male|female|comp|super)$")
    # Indented pronunciation of the preceding headword, in square brackets.
    HEADWORD_PRON_RE = regex.compile(r"^ +\[([\p{L}' ]+)\]$")
    # Part-of-speech marker.  The alternation has to be grouped: without the
    # group "^" and "$" bind only to the first and the last alternative, so any
    # line merely starting with "n", "v", ... was accepted and "num" was
    # captured as "n".
    TRANSL_POS_RE = regex.compile(r"^(?:n|pron|adj|v|adv|prep|conj|num|int|phr\.v|abbr)$")
    # Translation line: "<lang>: <text>".
    TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(][\p{L}\p{P}~ ]*)$")
    # Usage example line: "<lang>> <text>".
    TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> (\p{L}.*)$")

    # Separator characters right before the end of the line.
    TRAILING_SPACES_RE = regex.compile(r"\p{Z}+$")

    def __init__(self):
        # Parsing state; parse() resets it for every input stream.
        self.stream = None
        self.line = ""
        self.lineno = 0
        self.eof = False
        self.dom = []
        self.words = None
        self.tran = None

    def readline(self):
        """Read the next line into ``self.line``; update ``self.eof`` and ``self.lineno``."""
        self.line = self.stream.readline()
        # readline() returns an empty string only at end of input.
        self.eof = len(self.line) == 0
        if not self.eof:
            self.lineno += 1

    def parse(self, stream):
        """Parse ``stream`` and return the list of ``(words, senses)`` articles.

        On a syntax error the location, the message and the offending line are
        written to ``fout`` and a plain ``Exception`` wrapping the
        ``ParseException`` is raised.
        """
        self.lineno = 0
        self.stream = stream
        self.dom = []
        try:
            self.parse_prelude()
            while not self.eof:
                self.parse_article()
        except ParseException as ex:
            # search(), not match(): match() anchors at the start of the line
            # and therefore recognised only lines made of spaces alone.
            if self.TRAILING_SPACES_RE.search(self.line):
                fout.write("{:s}:{:d}: {:s}".format(fgadict, self.lineno, "Traling spaces detected...\n"))
            fout.write("{:s}:{:d}: {:s}\nLINE: {:s}\n".format(fgadict, self.lineno, str(ex), self.line))
            raise Exception(ex)
        return self.dom

    def parse_prelude(self):
        """Read dictionary prelude until first "__" delimiter.

        Raises ``ParseException`` when the input ends before any delimiter.
        """
        while True:
            self.readline()
            if self.eof:
                raise ParseException("There are no articles...")
            if self.SEPARATOR_RE.match(self.line):
                break

    def parse_article(self):
        """Parse one article and append ``(words, senses)`` to ``self.dom``.

        Assumes that ``self.line`` points to the "__" delimiter.
        """
        self.words = None
        self.tran = None
        self.parse_empty_line()
        self.parse_headlines()
        self.parse_translation()
        self.dom.append((self.words, self.tran))

    def parse_empty_line(self):
        """Consume the mandatory empty line that follows the "__" delimiter."""
        self.readline()
        # An empty line consists of the newline character only.
        if self.eof or len(self.line) != 1:
            raise ParseException(""""__" delimiter should followed by empty line...""")

    def parse_headlines(self):
        """Parse headwords with their pronunciation and attributes into ``self.words``.

        Assumes that ``self.line`` is the empty line preceding the first
        headword.  Stops at the next empty line or at end of input.
        """
        self.words = {}
        self.readline()
        if self.eof:
            raise ParseException("""There are no definition after "__" delimiter...""")
        m = self.HEADWORD_RE.match(self.line)
        if m is None:
            raise ParseException("""There are no headword after "__" delimiter...""")
        word = m.group(1)
        pron = None
        attrs = set()
        while True:
            self.readline()
            if self.eof or len(self.line) == 1:
                break
            m = self.HEADWORD_RE.match(self.line)
            if m is not None:
                # A new headword starts: store the previous one first.
                self.words[word] = (pron, attrs)
                word = m.group(1)
                pron = None
                attrs = set()
                continue
            m = self.HEADWORD_PRON_RE.match(self.line)
            if m is not None:
                if pron is not None:
                    raise ParseException("""Pronunciation is redefined...""")
                pron = m.group(1)
                continue
            m = self.HEADWORD_VAR_RE.match(self.line)
            if m is not None:
                attrs.add(m.group(1))
                continue
            raise ParseException("""Line is not headword or translation or headword attribute...""")
        # Store the last headword of the group.
        self.words[word] = (pron, attrs)

    def parse_translation(self):
        """Parse senses up to the next "__" delimiter or end of input into ``self.tran``.

        Every empty line closes the current sense.  A final sense that is not
        closed by an empty line is kept only when it has at least one
        translation.
        """
        senses = []
        pos = None
        tr = []
        ex = []
        while True:
            self.readline()
            if self.eof:
                break
            m = self.SEPARATOR_RE.match(self.line)
            if m is not None:
                break
            if len(self.line) == 1:
                # Empty line: close the current sense and start a new one.
                senses.append((pos, tr, ex))
                pos = None
                tr = []
                ex = []
                continue
            m = self.TRANSL_POS_RE.match(self.line)
            if m is not None:
                if pos is not None:
                    raise ParseException("""Each translation should have only one part of speech marker...""")
                pos = m.group(0)
                continue
            m = self.TRANSL_RE.match(self.line)
            if m is not None:
                tr.append((m.group(1), m.group(2)))
                continue
            m = self.TRANSL_EX_RE.match(self.line)
            if m is not None:
                ex.append((m.group(1), m.group(2)))
                continue
            raise ParseException("""Uknown syntax...""")
        if tr:
            senses.append((pos, tr, ex))
        self.tran = senses

# Parse the whole input, then dump every article as indented plain text.
try:
    parser = Parser()
    try:
        dom = parser.parse(fin)
    finally:
        # Release the input even when parsing fails.
        fin.close()

    # NOTE(review): the first parsed article is skipped, exactly as the
    # original range(1, len(dom)) loop did; presumably it is a service entry -
    # confirm against the dictionary source.
    for (words, senses) in dom[1:]:
        fout.write("_____\n\n")
        # Article title: all headwords joined together.
        fout.write("; ".join(words.keys()))
        fout.write("\n\n")
        for (word, (pron, attrs)) in words.items():
            fout.write("  ")
            fout.write(word)
            fout.write("\n")
            if pron is not None:
                fout.write("    [")
                fout.write(pron)
                fout.write("]\n")
            if attrs:
                fout.write("    ")
                # Sort the attribute set to get a stable output order.
                fout.write(", ".join(sorted(attrs)))
                fout.write("\n")
        fout.write("\n")
        # Examples (exs) are parsed but not rendered by this dump.
        for (pos, trs, exs) in senses:
            fout.write("  ")
            if pos is not None:
                fout.write("⟨")
                fout.write(pos)
                fout.write("⟩ ")
            # Only the first "ru" translation of each sense is printed.
            for (lang, tr) in trs:
                if lang == "ru":
                    fout.write(tr)
                    break
            fout.write("\n")
finally:
    # Flush and close a real output file; standard output is left open.
    if fout is not sys.stdout:
        fout.close()
author	Oleksandr Gavenko <gavenkoa@gmail.com>
	Sun, 27 Mar 2016 16:44:14 +0300
changeset 393	2756a6deca7e
parent 385	gadict.py@18284ce77c7a
child 394	4d45194c71b6
permissions	-rw-r--r--