Script for converting C5 format to gadict format.
author Oleksandr Gavenko <gavenkoa@gmail.com>
Sun, 13 Mar 2016 01:51:56 +0200
changeset 319 75430afe1a43
parent 318 d4767f21ca59
child 320 4272023569f3
Script for converting C5 format to gadict format.
obsolete/conv-c5-to-gadict.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/obsolete/conv-c5-to-gadict.py	Sun Mar 13 01:51:56 2016 +0200
@@ -0,0 +1,246 @@
+"""
+Put the *.dict-c5 files into the directory containing this script and run:
+
+  $ python conv-c5-to-gadict.py
+
+The output is saved to the `conv.gadict` file.
+"""
+
+import re
+
+# Text that separates two articles in a C5 file: a newline, a line of five
+# underscores and one empty line.
+ARTICLE_SEP = "\n_____\n\n"
+
+
+class Variance:
+    """Data kept for one spelling of a word: pronunciation and attribute tags.
+
+    `pron` is the pronunciation string or `None` when it is not known.
+    `attrs` is always a `set` of `str` tags (the script uses 'v1', 'v2', 'v3').
+    """
+
+    def __init__(self, pron=None, attrs=None):
+        """Store `pron` and start with the tags given in `attrs` (see `addAttrs`)."""
+        self.pron = pron
+        self.attrs = set()
+        self.addAttrs(attrs)
+
+    def addAttrs(self, attrs):
+        """Merge `attrs` into the tag set.
+
+        `attrs` may be `None` (nothing is added), a single `str` tag or a
+        `set` of tags.  Any other type raises `TypeError`.
+        """
+        if attrs is None:
+            return
+        if isinstance(attrs, str):
+            # A bare string is one tag; `set.update` would split it into characters.
+            self.attrs.add(attrs)
+        elif isinstance(attrs, set):
+            self.attrs.update(attrs)
+        else:
+            raise TypeError("Should be str or set...", type(attrs))
+
+    def __repr__(self):
+        # `pron` defaults to None, so convert explicitly: concatenating None
+        # to a str raises TypeError and made repr() unusable for such objects.
+        return "<pron: " + str(self.pron) + ">"
+
+
+class Variances:
+    """Collection of `Variance` objects keyed by the word spelling."""
+
+    def __init__(self):
+        self.store = {}
+
+    def add(self, word, pron=None, attrs=None):
+        """Create or update the record for `word`.
+
+        `word` and `pron` are `str`; `attrs` is a `str` or a `set` of `str`
+        and is handed to `Variance.addAttrs`.  `None` for `pron` or `attrs`
+        leaves the corresponding data untouched.
+        """
+        try:
+            entry = self.store[word]
+        except KeyError:
+            entry = self.store[word] = Variance()
+        if pron is not None:
+            if entry.pron is not None:
+                # Only a warning: the newer pronunciation replaces the old one.
+                print("two pronunciations detected!!", pron, entry.pron)
+            entry.pron = pron
+        entry.addAttrs(attrs)
+
+    def __repr__(self):
+        return repr(self.store)
+
+
+class Translation:
+    """One translation together with its part-of-speech tag."""
+
+    def __init__(self, pos, tran):
+        """`pos` is a `str` tag such as 'n' or 'v'; `tran` is the translation `str`."""
+        self.pos, self.tran = pos, tran
+
+    def __repr__(self):
+        return "".join(("<pos: ", self.pos, ", tran: ", self.tran, ">"))
+
+
+class Article:
+    """A dictionary article: word variants plus a list of translations."""
+
+    def __init__(self):
+        self.trans = []
+        self.vars = Variances()
+
+    def addVar(self, word, pron=None, attrs=None):
+        """Register spelling `word`, optionally with pronunciation and tags."""
+        self.vars.add(word, pron=pron, attrs=attrs)
+
+    def addTransl(self, pos, tran):
+        """Append a `Translation` built from tag `pos` and text `tran`."""
+        translation = Translation(pos, tran)
+        self.trans.append(translation)
+
+
+# Global result: first headword of an article -> Article.  Filled here from
+# the irregular verbs file and later extended by parse_entry().
+DICT = {}
+
+# A pronunciation line starts with two spaces and '['; it carries one
+# bracketed transcription per headword.
+PRON_LINE_RE = re.compile(r'^  \[')
+PRON_RE = re.compile(r'\[([^]]+)]')
+
+# Verb form lines.  Each may start with an optional label ("inf. ", "p. ",
+# "p.p. "); group 1 is the text up to an optional '/', group 2 is whatever
+# follows the '/' (empty when there is no second alternative).
+V1_RE = re.compile(r"^  (?:inf\. )?([^/]+)/?(.*)?")
+V2_RE = re.compile(r"^  (?:p\. )?([^/ ]+)/?(.*)?")
+V3_RE = re.compile(r"^  (?:p\.p\. )?([^/ ]+)/?(.*)?")
+# Translation line: any non-empty text after the two-space indent.
+RU_RE = re.compile(r"^  (.+)")
+
+# `with` closes the file even when reading fails.
+with open('gadict-irregular-verbs-en-ru.dict-c5') as f:
+    content = f.read()
+content = content.split(ARTICLE_SEP)
+content = iter(content)
+# Drop the text in front of the first separator (presumably the C5 file
+# header - it is not an article).
+next(content)
+
+for piece in content:
+    article = Article()
+    lines = piece.split("\n")
+    headwords = lines[0].split("; ")
+    for title in headwords:
+        article.addVar(title)
+    # Explicit check instead of `assert`, which is removed under `python -O`.
+    if len(lines) < 2 or lines[1] != "":
+        raise Exception("Headword is not separated from article", lines)
+    curr = 2
+    line = lines[curr]
+    if PRON_LINE_RE.match(line):
+        curr += 1
+        prons = PRON_RE.findall(line)
+        if len(prons) != len(headwords):
+            raise Exception("some prononsiation missing for", headwords, prons)
+        for title, pron in zip(headwords, prons):
+            article.addVar(title, pron=pron)
+    # Three consecutive lines: infinitive, past and past participle forms.
+    for form_re, form in ((V1_RE, "v1"), (V2_RE, "v2"), (V3_RE, "v3")):
+        line = lines[curr]
+        m = form_re.match(line)
+        if m is None:
+            raise Exception("Can't match", line)
+        article.addVar(m.group(1), attrs=form)
+        if m.group(2):
+            article.addVar(m.group(2), attrs=form)
+        curr += 1
+    try:
+        line = lines[curr]
+    except IndexError:
+        raise IndexError("No translation after", lines[curr-1], lines)
+    m = RU_RE.match(line)
+    if m is None:
+        raise Exception("Can't match", line)
+    article.addTransl(pos="v", tran=m.group(1))
+    # The translation has to be the last line of the article.
+    if len(lines) != curr+1:
+        raise Exception("Unknown line", line, lines, len(lines), curr)
+    DICT[headwords[0]] = article
+
+# Patterns used by parse_entry().
+# Headword line: a letter followed by lowercase letters, spaces or hyphens.
+HEADWORD_RE = re.compile(r"^([a-zA-Z][a-z -]*)$")
+# NOTE: this rebinds the module-level PRON_RE defined earlier in the script;
+# from here on it matches a whole line holding a single non-empty "[...]".
+PRON_RE = re.compile(r"^  \[([^]]+)\]$")
+# Translation line with an optional "N) " numbering prefix (N is 1-9).
+TRANSL_RE = re.compile(r"^  (?:[1-9]\) )?(.+)$")
+# A line holding only empty brackets; parse_entry() skips such lines.
+EMPTY_PRON_RE = re.compile(r"^  \[\]$")
+
+
+def parse_entry(entry, pos):
+    """Parse one article text and merge it into the global `DICT`.
+
+    `entry` is the article text: a headword line, an empty line, an optional
+    pronunciation line and then translation lines.  `pos` is the
+    part-of-speech tag stored with every translation found.  An existing
+    `DICT` article for the same headword is extended, otherwise a new one is
+    created.  Raises `Exception` when the headword line does not match or is
+    not followed by an empty line.
+    """
+    rows = entry.split("\n")
+    head_match = HEADWORD_RE.match(rows[0])
+    if head_match is None:
+        raise Exception("Fail to parse headword", rows)
+    if rows[1] != "":
+        raise Exception("Headword is not separated from article", rows)
+    headword = head_match.group(1)
+    if headword not in DICT:
+        DICT[headword] = Article()
+    article = DICT[headword]
+    pron = None
+    body_start = 2
+    pron_match = PRON_RE.match(rows[body_start])
+    if pron_match is not None:
+        body_start += 1
+        # An empty transcription counts as "no pronunciation".
+        pron = pron_match.group(1) or None
+    article.addVar(headword, pron=pron)
+    for row in rows[body_start:]:
+        # "  []" would otherwise be taken for a translation by TRANSL_RE.
+        if EMPTY_PRON_RE.match(row):
+            continue
+        transl_match = TRANSL_RE.match(row)
+        if transl_match:
+            article.addTransl(pos=pos, tran=transl_match.group(1))
+
+
+def parse_file(fn, pos):
+    """Parse every article of the C5 file `fn` with `parse_entry`.
+
+    `pos` is the part-of-speech tag passed on for each article.  The text in
+    front of the first `ARTICLE_SEP` is skipped.  A failure inside an article
+    is re-raised as `Exception(original_error, fn, entry)` so that the
+    offending file and article are visible.
+    """
+    # `with` guarantees the file is closed even when an article fails to
+    # parse; the plain close() call was skipped in that case.
+    with open(fn) as f:
+        content = f.read()
+    for entry in content.split(ARTICLE_SEP)[1:]:
+        try:
+            parse_entry(entry, pos)
+        except Exception as e:
+            raise Exception(e, fn, entry)
+
+
+# Remaining C5 sources, each paired with the part-of-speech tag stored with
+# its translations.  Files are processed in the listed order; add further
+# (file name, tag) pairs here.
+for c5_name, pos_tag in (
+        ('gadict-regular-verbs-en-ru.dict-c5', 'v'),
+        ('gadict-adjective-en-ru.dict-c5', 'adj'),
+        ('gadict-adverb-en-ru.dict-c5', 'adv'),
+        ('gadict-conjunction-en-ru.dict-c5', 'conj'),
+        ('gadict-en-ru.dict-c5', 'n'),
+        ('gadict-numeral-en-ru.dict-c5', 'num'),
+        ('gadict-phrasal-verbs-en-ru.dict-c5', 'phr.v'),
+        ('gadict-preposition-en-ru.dict-c5', 'prep'),
+        ('gadict-pronoun-en-ru.dict-c5', 'pron'),
+):
+    parse_file(c5_name, pos_tag)
+
+# Write all collected articles to conv.gadict, ordered by base headword.
+# `with` closes the output file even if writing an article fails.
+with open("conv.gadict", "w") as f:
+    for baseword in sorted(DICT):
+        art = DICT[baseword]
+        # Every article starts with a "__" line followed by an empty line.
+        f.write('__\n\n')
+        for headword, var in art.vars.store.items():
+            f.write(headword + '\n')
+            if var.pron is not None:
+                f.write(' [' + var.pron + ']\n')
+            # `attrs` is a set; sort it so repeated runs give identical output.
+            for attr in sorted(var.attrs):
+                f.write(' ' + attr + '\n')
+        for tran in art.trans:
+            f.write('\n' + tran.pos + '\nru: ' + tran.tran + '\n')