obsolete/conv-c5-to-gadict.py
author Oleksandr Gavenko <gavenkoa@gmail.com>
Thu, 29 Dec 2016 01:20:21 +0200
changeset 724 98fd211d27db
parent 319 75430afe1a43
permissions -rw-r--r--
Target to show combined statistics about missing words.

"""
Put the *.dict-c5 files into the same directory as this script and run:

  $ python conv-c5-to-gadict.py

The output is saved to the `conv.gadict` file.
"""

import re

# Text that separates articles in the input files: the newline ending the
# previous line, a line of five underscores, and one empty line.  The file
# contents are split on this string to obtain the individual articles.
ARTICLE_SEP = """
_____

"""


class Variance:
    """Pronunciation and attribute set attached to one word form.

    `pron` is the pronunciation string, or None while it is unknown.
    `attrs` is a set of attribute strings (this script stores "v1", "v2"
    and "v3" there for irregular verb forms).
    """

    def __init__(self, pron=None, attrs=None):
        self.pron = pron
        self.attrs = set()
        self.addAttrs(attrs)

    def addAttrs(self, attrs):
        """Merge `attrs` into the attribute set.

        `attrs` may be None (nothing happens), a single `str`, or a
        set/frozenset/list/tuple of `str`.  Any other type raises TypeError.
        """
        if attrs is None:
            return
        if isinstance(attrs, str):
            # A bare string is one attribute, not an iterable of characters.
            self.attrs.add(attrs)
        elif isinstance(attrs, (set, frozenset, list, tuple)):
            self.attrs.update(attrs)
        else:
            raise TypeError("Should be str or set...", type(attrs))

    def __repr__(self):
        # `pron` is None until a pronunciation is recorded, so it must be
        # converted explicitly; plain concatenation would raise TypeError.
        return "<pron: " + str(self.pron) + ">"


class Variances:
    """Mapping from a word form to its `Variance`, kept in `self.store`."""

    def __init__(self):
        self.store = {}

    def add(self, word, pron=None, attrs=None):
        """Register `word` and merge the given details into its entry.

        `word` is a `str`.  `pron`, when not None, replaces the stored
        pronunciation; a message is printed if one was already stored.
        `attrs` is passed on to `Variance.addAttrs`.
        """
        try:
            entry = self.store[word]
        except KeyError:
            entry = self.store[word] = Variance()
        if pron is not None:
            if entry.pron is not None:
                # Report the clash but carry on: the newer value wins.
                print("two pronunciations detected!!", pron, entry.pron)
            entry.pron = pron
        entry.addAttrs(attrs)

    def __repr__(self):
        return repr(self.store)


class Translation:
    """One translation of an article together with its part of speech."""

    def __init__(self, pos, tran):
        """Keep `pos` (a `str` such as 'n' or 'v') and the text `tran`."""
        self.pos, self.tran = pos, tran

    def __repr__(self):
        pieces = ("<pos: ", self.pos, ", tran: ", self.tran, ">")
        return "".join(pieces)


class Article:
    """A dictionary article: its word forms plus a list of translations."""

    def __init__(self):
        self.trans = []
        self.vars = Variances()

    def addVar(self, word, pron=None, attrs=None):
        """Record the word form `word`, optionally with `pron` and `attrs`."""
        self.vars.add(word, pron=pron, attrs=attrs)

    def addTransl(self, pos, tran):
        """Append a `Translation` built from `pos` and `tran`."""
        entry = Translation(pos, tran)
        self.trans.append(entry)


# All parsed articles, keyed by the article's (first) headword.
DICT = {}

# A pronunciation line starts with two spaces and an opening bracket.
PRON_LINE_RE = re.compile(r'^  \[')
# Every bracketed, non-empty pronunciation found on such a line.
PRON_RE = re.compile(r'\[([^]]+)]')

# Verb form lines: two spaces, an optional marker ("inf. ", "p. ", "p.p. "),
# the form itself and, after a slash, an optional alternative form.
# The dots in the markers are escaped so that only a literal "." matches.
V1_RE = re.compile(r"^  (?:inf\. )?([^/]+)/?(.*)?")
V2_RE = re.compile(r"^  (?:p\. )?([^/ ]+)/?(.*)?")
V3_RE = re.compile(r"^  (?:p\.p\. )?([^/ ]+)/?(.*)?")
# Translation line: two spaces followed by the text.
RU_RE = re.compile(r"^  (.+)")


def _add_verb_forms(article, line, form_re, attr):
    """Add the verb form(s) found on `line` to `article`, tagged with `attr`.

    Raises Exception when `line` does not match `form_re`.
    """
    m = form_re.match(line)
    if not m:
        raise Exception("Can't match", line)
    article.addVar(m.group(1), attrs=attr)
    # Group 2 holds the alternative form after "/" and is "" when absent.
    if m.group(2):
        article.addVar(m.group(2), attrs=attr)


# Read the whole file up front so the handle is closed even if parsing fails.
with open('gadict-irregular-verbs-en-ru.dict-c5') as f:
    content = f.read()
content = iter(content.split(ARTICLE_SEP))
# Skip whatever precedes the first separator; it is not parsed as an article.
next(content)

for piece in content:
    article = Article()
    lines = piece.split("\n")
    headwords = lines[0].split("; ")
    for title in headwords:
        article.addVar(title)
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if len(lines) < 2 or lines[1] != "":
        raise Exception("Headword is not separated from article", lines)
    curr = 2
    line = lines[curr]
    if PRON_LINE_RE.match(line):
        curr += 1
        prons = PRON_RE.findall(line)
        if len(prons) != len(headwords):
            raise Exception("some prononsiation missing for", headwords, prons)
        # Pronunciations are listed in the same order as the headwords.
        for title, pron in zip(headwords, prons):
            article.addVar(title, pron=pron)
    # The three verb forms occupy three consecutive lines.
    for form_re, attr in ((V1_RE, "v1"), (V2_RE, "v2"), (V3_RE, "v3")):
        _add_verb_forms(article, lines[curr], form_re, attr)
        curr += 1
    try:
        line = lines[curr]
    except IndexError:
        raise IndexError("No translation after", lines[curr-1], lines)
    m = RU_RE.match(line)
    if m:
        article.addTransl(pos="v", tran=m.group(1))
    else:
        raise Exception("Can't match", line)
    # The translation must be the last line of the article.
    if len(lines) != curr+1:
        raise Exception("Unknown line", line, lines, len(lines), curr)
    DICT[headwords[0]] = article

# Patterns used by `parse_entry`.
# Headword line: one ASCII letter followed by lowercase letters, spaces or
# hyphens, and nothing else.
HEADWORD_RE = re.compile(r"^([a-zA-Z][a-z -]*)$")
# Pronunciation line: two spaces and non-empty text in square brackets.
# NOTE: this rebinds the PRON_RE that the irregular verbs pass defined above.
PRON_RE = re.compile(r"^  \[([^]]+)\]$")
# Translation line: two spaces, an optional "N) " prefix (N is 1-9), the text.
TRANSL_RE = re.compile(r"^  (?:[1-9]\) )?(.+)$")
# Pronunciation line with nothing between the brackets.
EMPTY_PRON_RE = re.compile(r"^  \[\]$")


def parse_entry(entry, pos):
    """Parse one article text and merge it into the global `DICT`.

    `entry` is the text of a single article: a headword line, an empty line,
    an optional pronunciation line, then translation lines.  Each translation
    is stored with the part-of-speech tag `pos`.  An existing article for the
    same headword is extended rather than replaced.

    Raises Exception when the headword line does not match `HEADWORD_RE` or
    is not followed by an empty line.
    """
    rows = entry.split("\n")
    head = HEADWORD_RE.match(rows[0])
    if head is None:
        raise Exception("Fail to parse headword", rows)
    if rows[1]:
        raise Exception("Headword is not separated from article", rows)
    headword = head.group(1)
    try:
        article = DICT[headword]
    except KeyError:
        article = DICT[headword] = Article()
    start = 2
    pron = None
    pron_match = PRON_RE.match(rows[start])
    if pron_match is not None:
        start += 1
        # An empty capture counts as "no pronunciation".
        pron = pron_match.group(1) or None
    article.addVar(headword, pron=pron)
    for row in rows[start:]:
        if EMPTY_PRON_RE.match(row):
            continue
        found = TRANSL_RE.match(row)
        # Rows that do not look like a translation are ignored.
        if found:
            article.addTransl(pos=pos, tran=found.group(1))


def parse_file(fn, pos):
    """Parse every article of the file `fn` with `parse_entry`.

    `fn` is the path of the input file; `pos` is the part-of-speech tag
    passed to `parse_entry` for each article.

    Raises Exception carrying the original error, the file name and the
    offending article text when an article cannot be parsed.
    """
    # Read everything first; `with` closes the file even if parsing fails.
    with open(fn) as f:
        content = f.read()
    # The text before the first separator is skipped, not parsed as an
    # article (presumably it is the dictionary header).
    for entry in content.split(ARTICLE_SEP)[1:]:
        try:
            parse_entry(entry, pos)
        except Exception as e:
            raise Exception(e, fn, entry)


# Merge the remaining dictionaries into DICT, one part of speech per file.
parse_file('gadict-regular-verbs-en-ru.dict-c5', 'v')
parse_file('gadict-adjective-en-ru.dict-c5', 'adj')
parse_file('gadict-adverb-en-ru.dict-c5', 'adv')
parse_file('gadict-conjunction-en-ru.dict-c5', 'conj')
parse_file('gadict-en-ru.dict-c5', 'n')
parse_file('gadict-numeral-en-ru.dict-c5', 'num')
parse_file('gadict-phrasal-verbs-en-ru.dict-c5', 'phr.v')
parse_file('gadict-preposition-en-ru.dict-c5', 'prep')
parse_file('gadict-pronoun-en-ru.dict-c5', 'pron')

# Write all articles, ordered by headword; `with` closes the output file
# even when a write fails part-way.
with open("conv.gadict", "w") as f:
    for baseword in sorted(DICT):
        art = DICT[baseword]
        f.write('__\n\n')
        for headword, var in art.vars.store.items():
            f.write(headword + '\n')
            if var.pron is not None:
                f.write(' [' + var.pron + ']\n')
            # `attrs` is a set; sort it so the output is the same on every run.
            for attr in sorted(var.attrs):
                f.write(' ' + attr + '\n')
        for tran in art.trans:
            f.write('\n' + tran.pos + '\nru: ' + tran.tran + '\n')