--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/obsolete/conv-c5-to-gadict.py Sun Mar 13 01:51:56 2016 +0200
@@ -0,0 +1,246 @@
+"""
+Put the *.dict-c5 files into the same directory as this script and run:
+
+ $ python conv-c5-to-gadict.py
+
+The output is saved to the `conv.gadict` file.
+"""
+
+import re
+
+ARTICLE_SEP = """
+_____
+
+"""
+
+
+class Variance:
+    """One spelling of a word: an optional pronunciation and a set of attribute tags."""
+    def __init__(self, pron=None, attrs=None):
+        self.pron = pron  # `str`, or None while the pronunciation is unknown
+        self.attrs = set()
+        self.addAttrs(attrs)
+
+    def addAttrs(self, attrs):
+        """Merge `attrs` (None, one `str`, or a set/frozenset/list/tuple) into `self.attrs`."""
+        if isinstance(attrs, str):
+            self.attrs.add(attrs)
+        elif isinstance(attrs, (set, frozenset, list, tuple)):
+            self.attrs.update(attrs)
+        elif attrs is not None:  # anything else is a caller error
+            raise TypeError("Should be str or set...", type(attrs))
+
+    def __repr__(self):
+        return "<pron: {}>".format(self.pron)  # unlike "+", format() accepts pron=None
+
+
+class Variances:
+    """Mapping of word -> `Variance`, kept in `self.store`."""
+
+    def __init__(self):
+        self.store = {}
+
+    def add(self, word, pron=None, attrs=None):
+        """Record `word`; set `pron` if given and merge `attrs` via `Variance.addAttrs`."""
+        var = self.store.setdefault(word, Variance())
+        if pron is not None:
+            # Warn only when the new pronunciation differs; the new value always wins.
+            if var.pron is not None and var.pron != pron:
+                print("two pronunciations detected!!", pron, var.pron)
+            var.pron = pron
+        var.addAttrs(attrs)
+
+    def __repr__(self):
+        return repr(self.store)
+
+
+class Translation:
+    """A translation `tran` (`str`) tagged with its part of speech `pos` (`str`, e.g. 'n', 'v')."""
+
+    def __init__(self, pos, tran):
+        self.pos, self.tran = pos, tran
+
+    def __repr__(self):
+        return "".join(("<pos: ", self.pos, ", tran: ", self.tran, ">"))
+
+
+class Article:
+    """A dictionary article: its word variants (`vars`) and its translations (`trans`)."""
+    def __init__(self):
+        self.vars, self.trans = Variances(), []
+
+    def addVar(self, word, pron=None, attrs=None):
+        """Register `word` as a variant, forwarding `pron` and `attrs` to `Variances.add`."""
+        self.vars.add(word, pron=pron, attrs=attrs)
+
+    def addTransl(self, pos, tran):
+        """Append a `Translation` built from `pos` and `tran` (both `str`) to `self.trans`."""
+        self.trans.append(Translation(pos=pos, tran=tran))
+
+
+# Irregular verbs are parsed first.  Layout each article of that file must have:
+#   headword[; headword...]
+#   <empty line>
+#    [pron] [pron...]          (optional; one group per headword)
+#    inf. form[/alternative]
+#    p. form[/alternative]
+#    p.p. form[/alternative]
+#    translation
+# The article is stored in DICT under its first headword.
+DICT = {}  # base headword -> Article, shared by all the parsers of this script
+
+PRON_LINE_RE = re.compile(r'^ \[')  # does the line hold pronunciations?
+PRON_RE = re.compile(r'\[([^]]+)]')  # one "[...]" group; rebound further down the file
+
+# Verb-form lines: optional label, the form, then an optional "/alternative".
+V1_RE = re.compile(r"^ (?:inf\. )?([^/]+)/?(.*)?")
+# The dot of "p." is escaped now; unescaped it matched any character.
+V2_RE = re.compile(r"^ (?:p\. )?([^/ ]+)/?(.*)?")
+V3_RE = re.compile(r"^ (?:p\.p\. )?([^/ ]+)/?(.*)?")
+RU_RE = re.compile(r"^ (.+)")  # translation line
+
+
+def _add_verb_forms(article, line, form_re, attr):
+    """Add the form (and its "/alternative", if any) from `line` to `article`.
+
+    Both words get the attribute `attr`; raises Exception when `form_re`
+    does not match `line`.
+    """
+    m = form_re.match(line)
+    if not m:
+        raise Exception("Can't match", line)
+    article.addVar(m.group(1), attrs=attr)
+    if m.group(2):
+        article.addVar(m.group(2), attrs=attr)
+
+
+with open('gadict-irregular-verbs-en-ru.dict-c5') as f:  # closed even if reading fails
+    content = f.read()
+
+# The chunk before the first separator is skipped: it is not parsed as an article.
+for piece in content.split(ARTICLE_SEP)[1:]:
+    article = Article()
+    lines = piece.split("\n")
+    headwords = lines[0].split("; ")
+    for title in headwords:
+        article.addVar(title)
+    # A real check instead of `assert`, which disappears under `python -O`.
+    if len(lines) < 2 or lines[1] != "":
+        raise Exception("Headword is not separated from article", lines)
+    curr = 2
+    line = lines[curr]
+    if PRON_LINE_RE.match(line):
+        curr += 1
+        prons = PRON_RE.findall(line)
+        if len(prons) != len(headwords):
+            raise Exception("some prononsiation missing for", headwords, prons)
+        for title, pron in zip(headwords, prons):
+            article.addVar(title, pron=pron)
+    # The "inf.", "p." and "p.p." lines follow one another.
+    for form_re, attr in ((V1_RE, "v1"), (V2_RE, "v2"), (V3_RE, "v3")):
+        _add_verb_forms(article, lines[curr], form_re, attr)
+        curr += 1
+    try:
+        line = lines[curr]
+    except IndexError:
+        raise IndexError("No translation after", lines[curr-1], lines)
+    m = RU_RE.match(line)
+    if not m:
+        raise Exception("Can't match", line)
+    article.addTransl(pos="v", tran=m.group(1))
+    # The translation has to be the last line of the article.
+    if len(lines) != curr+1:
+        raise Exception("Unknown line", line, lines, len(lines), curr)
+    DICT[headwords[0]] = article
+
+HEADWORD_RE = re.compile(r"^([a-zA-Z][a-z -]*)$")  # a letter, then lowercase letters, spaces or hyphens
+PRON_RE = re.compile(r"^ \[([^]]+)\]$")  # whole line is one non-empty "[...]" group; rebinds the earlier PRON_RE
+TRANSL_RE = re.compile(r"^ (?:[1-9]\) )?(.+)$")  # optional "N) " numbering, then the captured translation text
+EMPTY_PRON_RE = re.compile(r"^ \[\]$")  # an empty "[]" line, which TRANSL_RE would also match
+
+
+def parse_entry(entry, pos):
+    """Parse one dict-c5 article `entry` into DICT, tagging translations with `pos`.
+
+    Expected lines: headword, empty line, optional pronunciation, translations.
+    An article already in DICT is extended.  Raises Exception on a malformed
+    headword or separator line, or when the article has no body.
+    """
+    lines = entry.split("\n")
+    m = HEADWORD_RE.match(lines[0])
+    if m is None:
+        raise Exception("Fail to parse headword", lines)
+    # Check the length first so a truncated entry gives a clear error, not IndexError.
+    if len(lines) < 2 or lines[1] != "":
+        raise Exception("Headword is not separated from article", lines)
+    if len(lines) < 3:
+        raise Exception("Article has no body", lines)
+    headword = m.group(1)
+    if headword not in DICT:
+        DICT[headword] = Article()
+    article = DICT[headword]
+    # PRON_RE requires a non-empty "[...]", so a match never yields an empty pronunciation.
+    m = PRON_RE.match(lines[2])
+    first = 3 if m else 2  # translations start after the pronunciation line, if any
+    article.addVar(headword, pron=m.group(1) if m else None)
+    for line in lines[first:]:
+        # An empty "[]" line would otherwise be taken for a translation.
+        if EMPTY_PRON_RE.match(line):
+            continue
+        m = TRANSL_RE.match(line)
+        if m:
+            article.addTransl(pos=pos, tran=m.group(1))
+
+
+def parse_file(fn, pos):
+    """Parse each article of dict-c5 file `fn` into DICT, tagging translations with `pos`."""
+    # Read everything up front; `with` closes the file even when an entry fails to parse.
+    with open(fn) as f:
+        content = f.read()
+    # The first chunk (the text before the first separator) is skipped.
+    for entry in content.split(ARTICLE_SEP)[1:]:
+        try:
+            parse_entry(entry, pos)
+        except Exception as e:
+            # Re-raise with the file name and the offending entry for context.
+            raise Exception(e, fn, entry)
+
+
+# The files are parsed in this order; translations keep it, as each one is appended.
+for _fn, _pos in (
+        ('gadict-regular-verbs-en-ru.dict-c5', 'v'),
+        ('gadict-adjective-en-ru.dict-c5', 'adj'),
+        ('gadict-adverb-en-ru.dict-c5', 'adv'),
+        ('gadict-conjunction-en-ru.dict-c5', 'conj'),
+        ('gadict-en-ru.dict-c5', 'n'),
+        ('gadict-numeral-en-ru.dict-c5', 'num'),
+        ('gadict-phrasal-verbs-en-ru.dict-c5', 'phr.v'),
+        ('gadict-preposition-en-ru.dict-c5', 'prep'),
+        ('gadict-pronoun-en-ru.dict-c5', 'pron'),
+):
+    parse_file(_fn, _pos)
+
+# Dump DICT to conv.gadict, articles sorted by base word.  Layout of one article:
+#
+#   __
+#   <empty line>
+#   headword            -+
+#    [pronunciation]     |  per variant; the pronunciation only when known,
+#    attribute          -+  one line per attribute
+#   <empty line>        -+
+#   pos                  |  per translation
+#   ru: translation     -+
+#
+with open("conv.gadict", "w") as f:  # `with` closes the file even if a write fails
+    for baseword in sorted(DICT):
+        art = DICT[baseword]
+        f.write('__\n\n')
+        for headword, var in art.vars.store.items():
+            f.write(headword + '\n')
+            if var.pron is not None:
+                f.write(' [' + var.pron + ']\n')
+            # A set has no stable order; sorting makes repeated runs write identical files.
+            for attr in sorted(var.attrs):
+                f.write(' ' + attr + '\n')
+        for tran in art.trans:
+            f.write('\n' + tran.pos + '\n')
+            f.write('ru: ' + tran.tran + '\n')