# HG changeset patch
# User Oleksandr Gavenko
# Date 1457826716 -7200
# Node ID 75430afe1a430879aeb41970d588eddb2525a330
# Parent d4767f21ca59df45d1094ca0cb1b23f439f32f8d
# Script for converting C5 format to gadict format.
#
# File added by this changeset: obsolete/conv-c5-to-gadict.py
# (Sun Mar 13 01:51:56 2016 +0200)
"""
Put *.dict-c5 files to directory with script and run:

  $ python conv-c5-to-gadict.py

Output saved to `conv.gadict` file.
"""

import re

# Text that separates two articles inside a *.dict-c5 file.
ARTICLE_SEP = """
_____

"""

# Input holding irregular verbs; it has its own layout (v1/v2/v3 lines).
IRREGULAR_VERBS_FILE = 'gadict-irregular-verbs-en-ru.dict-c5'

# (file name, part-of-speech tag) pairs, parsed in this order by `main`.
REGULAR_FILES = (
    ('gadict-regular-verbs-en-ru.dict-c5', 'v'),
    ('gadict-adjective-en-ru.dict-c5', 'adj'),
    ('gadict-adverb-en-ru.dict-c5', 'adv'),
    ('gadict-conjunction-en-ru.dict-c5', 'conj'),
    ('gadict-en-ru.dict-c5', 'n'),
    ('gadict-numeral-en-ru.dict-c5', 'num'),
    ('gadict-phrasal-verbs-en-ru.dict-c5', 'phr.v'),
    ('gadict-preposition-en-ru.dict-c5', 'prep'),
    ('gadict-pronoun-en-ru.dict-c5', 'pron'),
)

OUTPUT_FILE = "conv.gadict"


class Variance:
    """One spelling of a word: an optional pronunciation plus attribute tags."""

    def __init__(self, pron=None, attrs=None):
        self.pron = pron
        self.attrs = set()
        self.addAttrs(attrs)

    def addAttrs(self, attrs):
        """Merge `attrs` into `self.attrs`.

        `attrs` is `None` (no-op), a single `str` tag or a `set` of tags;
        any other type raises `TypeError`.
        """
        if attrs is None:
            return
        if isinstance(attrs, str):
            self.attrs.add(attrs)
        elif isinstance(attrs, set):
            self.attrs.update(attrs)
        else:
            raise TypeError("Should be str or set...", type(attrs))

    def __repr__(self):
        return "<Variance pron=%r attrs=%r>" % (self.pron, sorted(self.attrs))


class Variances:
    """Mapping from a word (`str`) to its `Variance`."""

    def __init__(self):
        self.store = {}

    def add(self, word, pron=None, attrs=None):
        """Register `word`, creating its `Variance` on first use.

        `word` is `str`, `pron` is `str` or `None`, `attrs` is `None`,
        a `str` or a `set` of `str` (see `Variance.addAttrs`).

        A non-`None` `pron` replaces the stored one; a warning is printed
        when it differs from a pronunciation that was already recorded.
        """
        if word not in self.store:
            self.store[word] = Variance()
        var = self.store[word]
        if pron is not None:
            if var.pron is not None and var.pron != pron:
                print("two pronunciations detected!!", pron, var.pron)
            var.pron = pron
        var.addAttrs(attrs)

    def __repr__(self):
        return repr(self.store)


class Translation:
    """A single translation tagged with its part of speech."""

    def __init__(self, pos, tran):
        """`pos` is `str` like 'n' or 'v', `tran` is `str`."""
        self.pos = pos
        self.tran = tran

    def __repr__(self):
        return "<Translation pos=%r tran=%r>" % (self.pos, self.tran)


class Article:
    """A dictionary article: word variances plus a list of translations."""

    def __init__(self):
        self.vars = Variances()
        self.trans = []

    def addVar(self, word, pron=None, attrs=None):
        """Forward to `Variances.add` (same argument meaning)."""
        self.vars.add(word, pron, attrs)

    def addTransl(self, pos, tran):
        """Append `Translation(pos, tran)`; both arguments are `str`."""
        self.trans.append(Translation(pos, tran))


# Every parsed article, keyed by its (first) headword.
DICT = {}

# NOTE(review): the single leading space in the patterns below (and in the
# strings emitted by `format_article`) is reproduced as seen in the source;
# confirm it matches the indentation really used in the data files.

# --- patterns for the irregular verbs file ---
PRON_LINE_RE = re.compile(r'^ \[')
IRREGULAR_PRON_RE = re.compile(r'\[([^]]+)]')

V1_RE = re.compile(r"^ (?:inf\. )?([^/]+)/?(.*)?")
V2_RE = re.compile(r"^ (?:p\. )?([^/ ]+)/?(.*)?")
V3_RE = re.compile(r"^ (?:p\.p\. )?([^/ ]+)/?(.*)?")
RU_RE = re.compile(r"^ (.+)")

# --- patterns for all other files ---
HEADWORD_RE = re.compile(r"^([a-zA-Z][a-z -]*)$")
PRON_RE = re.compile(r"^ \[([^]]+)\]$")
TRANSL_RE = re.compile(r"^ (?:[1-9]\) )?(.+)$")
EMPTY_PRON_RE = re.compile(r"^ \[\]$")


def _read_entries(fn):
    """Return the articles of file `fn`, without the text before the first
    `ARTICLE_SEP`."""
    with open(fn) as src:
        content = src.read()
    return content.split(ARTICLE_SEP)[1:]


def parse_irregular_entry(piece):
    """Parse one irregular verb article.

    Expected lines: headwords joined by "; ", an empty line, an optional
    pronunciation line, the v1, v2 and v3 form lines (each may hold two
    forms separated by "/"), and one final translation line.

    Returns `(headwords, article)`.  Raises `Exception` on a line that
    does not match, `IndexError` when the article ends too early.
    """
    article = Article()
    lines = piece.split("\n")
    headwords = lines[0].split("; ")
    for title in headwords:
        article.addVar(title)
    if lines[1] != "":
        raise Exception("Headword is not separated from article", lines)
    curr = 2
    line = lines[curr]
    if PRON_LINE_RE.match(line):
        curr += 1
        prons = IRREGULAR_PRON_RE.findall(line)
        if len(prons) != len(headwords):
            raise Exception("some prononsiation missing for", headwords, prons)
        for title, pron in zip(headwords, prons):
            article.addVar(title, pron=pron)
    for form_re, attr in ((V1_RE, "v1"), (V2_RE, "v2"), (V3_RE, "v3")):
        line = lines[curr]
        m = form_re.match(line)
        if not m:
            raise Exception("Can't match", line)
        article.addVar(m.group(1), attrs=attr)
        if m.group(2):
            # Alternative form given after "/".
            article.addVar(m.group(2), attrs=attr)
        curr += 1
    try:
        line = lines[curr]
    except IndexError:
        raise IndexError("No translation after", lines[curr - 1], lines)
    m = RU_RE.match(line)
    if not m:
        raise Exception("Can't match", line)
    article.addTransl(pos="v", tran=m.group(1))
    if len(lines) != curr + 1:
        raise Exception("Unknown line", line, lines, len(lines), curr)
    return headwords, article


def parse_irregular_file(fn):
    """Parse irregular verbs file `fn`; each article is stored in `DICT`
    under its first headword, replacing any previous value."""
    for piece in _read_entries(fn):
        headwords, article = parse_irregular_entry(piece)
        DICT[headwords[0]] = article


def parse_entry(entry, pos):
    """Parse one regular article and merge it into `DICT`.

    Expected lines: headword, an empty line, an optional pronunciation
    line, then translation lines (optionally numbered "1) ".."9) ").
    Every translation is stored with part of speech `pos`.  An article
    already present in `DICT` for the headword is extended.

    Raises `Exception` when the headword line or the empty separator
    line is malformed.
    """
    lines = entry.split("\n")
    m = HEADWORD_RE.match(lines[0])
    if m is None:
        raise Exception("Fail to parse headword", lines)
    if len(lines[1]) != 0:
        raise Exception("Headword is not separated from article", lines)
    headword = m.group(1)
    article = DICT.get(headword)
    if article is None:
        article = Article()
        DICT[headword] = article
    curr = 2
    pron = None
    m = PRON_RE.match(lines[curr])
    if m is not None:
        curr += 1
        pron = m.group(1)
    article.addVar(headword, pron=pron)
    for line in lines[curr:]:
        if EMPTY_PRON_RE.match(line):
            continue
        m = TRANSL_RE.match(line)
        if m:
            article.addTransl(pos=pos, tran=m.group(1))


def parse_file(fn, pos):
    """Parse every article of file `fn` with `parse_entry(entry, pos)`.

    A failure is re-raised as `Exception(error, fn, entry)` so that the
    offending file and article are visible.
    """
    for entry in _read_entries(fn):
        try:
            parse_entry(entry, pos)
        except Exception as e:
            raise Exception(e, fn, entry)


def format_article(art):
    """Return the gadict text for `art`.

    Attribute tags are emitted in sorted order so the output does not
    depend on set iteration order.
    """
    out = ['__\n\n']
    for headword, var in art.vars.store.items():
        out.append(headword)
        out.append('\n')
        if var.pron is not None:
            out.append(' [')
            out.append(var.pron)
            out.append(']\n')
        for attr in sorted(var.attrs):
            out.append(' ')
            out.append(attr)
            out.append('\n')
    for tran in art.trans:
        out.append('\n')
        out.append(tran.pos)
        out.append('\nru: ')
        out.append(tran.tran)
        out.append('\n')
    return ''.join(out)


def write_gadict(fn):
    """Write every article of `DICT`, sorted by key, to file `fn`."""
    with open(fn, "w") as dst:
        for baseword in sorted(DICT):
            dst.write(format_article(DICT[baseword]))


def main():
    """Read all known *.dict-c5 inputs and write `conv.gadict`."""
    parse_irregular_file(IRREGULAR_VERBS_FILE)
    for fn, pos in REGULAR_FILES:
        parse_file(fn, pos)
    write_gadict(OUTPUT_FILE)


if __name__ == "__main__":
    main()