gadict: comparison obsolete/conv-c5-to-gadict.py

equal deleted inserted replaced

-:d4767f21ca59
+:75430afe1a43
+"""
+Put the *.dict-c5 files into the same directory as this script and run:
+$ python conv-c5-to-gadict.py
+The output is saved to the `conv.gadict` file.
+"""
+import re
# Text on which the input files are split into articles: a line of five
# underscores together with the newline before and after it.
ARTICLE_SEP = "\n_____\n"
class Variance:
    """Pronunciation and attribute tags recorded for one word variant.

    Attributes:
        pron: pronunciation string, or ``None`` while it is not known.
        attrs: ``set`` of attribute strings collected for the variant.
    """

    def __init__(self, pron=None, attrs=None):
        self.pron = pron
        self.attrs = set()
        self.addAttrs(attrs)

    def addAttrs(self, attrs):
        """Merge ``attrs`` into ``self.attrs``.

        ``attrs`` may be ``None`` (nothing is added), a single ``str``, or a
        ``set``/``frozenset``/``list``/``tuple`` of attribute strings.

        Raises:
            TypeError: if ``attrs`` is of any other type.
        """
        if attrs is None:
            return
        if isinstance(attrs, str):
            # A bare string is one attribute, not an iterable of characters.
            self.attrs.add(attrs)
        elif isinstance(attrs, (set, frozenset, list, tuple)):
            self.attrs.update(attrs)
        else:
            raise TypeError("Should be str or set...", type(attrs))

    def __repr__(self):
        # ``pron`` defaults to None, so plain string concatenation would
        # raise TypeError here; %-formatting handles both cases.
        return "<pron: %s>" % (self.pron,)
class Variances:
    """Mapping from a word's spelling to the `Variance` recorded for it."""

    def __init__(self):
        self.store = {}

    def add(self, word, pron=None, attrs=None):
        """Register `word`, or update the entry already stored for it.

        `word` is a `str`; `pron` is a `str` or `None`; `attrs` is handed to
        `Variance.addAttrs` as is.  A non-None `pron` replaces the stored
        pronunciation; a warning is printed when it differs from one that
        was recorded earlier.
        """
        if word not in self.store:
            self.store[word] = Variance()
        var = self.store[word]
        if pron is not None:
            # Seeing the same pronunciation again is not a conflict, so only
            # warn when the new value really differs from the stored one.
            if var.pron is not None and var.pron != pron:
                print("two pronunciations detected!!", pron, var.pron)
            var.pron = pron
        var.addAttrs(attrs)

    def __repr__(self):
        return repr(self.store)
class Translation:
    """A translation text paired with the part of speech it applies to."""

    def __init__(self, pos, tran):
        """Store `pos` (a `str` such as 'n' or 'v') and `tran` (a `str`)."""
        self.pos, self.tran = pos, tran

    def __repr__(self):
        pieces = ("<pos: ", self.pos, ", tran: ", self.tran, ">")
        return "".join(pieces)
class Article:
    """One dictionary article: its word variants and its translations.

    Attributes:
        vars: a `Variances` collection holding the article's variants.
        trans: list of `Translation` objects in the order they were added.
    """

    def __init__(self):
        self.trans = []
        self.vars = Variances()

    def addVar(self, word, pron=None, attrs=None):
        """Pass `word`, `pron` and `attrs` on to `Variances.add`."""
        self.vars.add(word, pron=pron, attrs=attrs)

    def addTransl(self, pos, tran):
        """Append a `Translation` built from `pos` and `tran`."""
        entry = Translation(pos, tran)
        self.trans.append(entry)
# All parsed articles, keyed by (first) headword.
DICT = {}

# ---------------------------------------------------------------------------
# Irregular verbs.
#
# Layout of one article, as enforced by the checks below:
#   line 0       headwords separated by "; "
#   line 1       empty
#   (optional)   a line starting with "  [" holding one "[...]" pronunciation
#                per headword
#   three lines  the verb forms, tagged "v1", "v2" and "v3"; each line may
#                hold two alternatives separated by "/"
#   last line    the translation
# ---------------------------------------------------------------------------
with open('gadict-irregular-verbs-en-ru.dict-c5') as f:
    content = f.read()
content = iter(content.split(ARTICLE_SEP))
# The text before the first separator is not an article; skip it.
next(content)

PRON_LINE_RE = re.compile(r'^  \[')
PRON_RE = re.compile(r'\[([^]]+)]')
V1_RE = re.compile(r"^  (?:inf\. )?([^/]+)/?(.*)?")
# The dot of the "p." prefix is escaped so it matches a literal dot only,
# in line with the V1/V3 patterns.
V2_RE = re.compile(r"^  (?:p\. )?([^/ ]+)/?(.*)?")
V3_RE = re.compile(r"^  (?:p\.p\. )?([^/ ]+)/?(.*)?")
RU_RE = re.compile(r"^  (.+)")

for piece in content:
    article = Article()
    lines = piece.split("\n")
    headwords = lines[0].split("; ")
    for title in headwords:
        article.addVar(title)
    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    if lines[1] != "":
        raise Exception("Headword is not separated from article", lines)
    curr = 2
    line = lines[curr]
    if PRON_LINE_RE.match(line):
        curr += 1
        prons = PRON_RE.findall(line)
        if len(prons) != len(headwords):
            raise Exception("some prononsiation missing for", headwords, prons)
        for title, pron in zip(headwords, prons):
            article.addVar(title, pron=pron)
    # The three verb-form lines are handled identically apart from the
    # pattern and the attribute attached to the words found.
    for form_re, form_attr in ((V1_RE, "v1"), (V2_RE, "v2"), (V3_RE, "v3")):
        line = lines[curr]
        m = form_re.match(line)
        if m is None:
            raise Exception("Can't match", line)
        article.addVar(m.group(1), attrs=form_attr)
        if m.group(2):
            # Second alternative after the "/".
            article.addVar(m.group(2), attrs=form_attr)
        curr += 1
    try:
        line = lines[curr]
    except IndexError:
        raise IndexError("No translation after", lines[curr-1], lines)
    m = RU_RE.match(line)
    if m is None:
        raise Exception("Can't match", line)
    article.addTransl(pos="v", tran=m.group(1))
    # The translation has to be the last line of the article.
    if len(lines) != curr + 1:
        raise Exception("Unknown line", line, lines, len(lines), curr)
    DICT[headwords[0]] = article
# Patterns used by parse_entry.
# First line of an article: a letter followed by lowercase letters, spaces
# or hyphens.
HEADWORD_RE = re.compile(r"^([a-zA-Z][a-z -]*)$")
# A translation line, optionally numbered "1) " .. "9) ".
TRANSL_RE = re.compile(r"^  (?:[1-9]\) )?(.+)$")
# A pronunciation placeholder line with nothing between the brackets.
EMPTY_PRON_RE = re.compile(r"^  \[\]$")
# A line holding exactly one non-empty "[...]" pronunciation.
# NOTE: this rebinds the PRON_RE name used earlier by the irregular-verbs
# pass, which has finished by the time this line runs.
PRON_RE = re.compile(r"^  \[([^]]+)\]$")
def parse_entry(entry, pos):
    """Parse one single-headword article and merge it into DICT.

    `entry` is the article text: a headword line, an empty line, an optional
    "  [pronunciation]" line and then the translation lines.  `pos` is the
    part-of-speech string attached to every translation found.

    If DICT already holds an article for the headword, the pronunciation and
    translations are added to that article; otherwise a new `Article` is
    created and stored.

    Raises:
        Exception: if the headword line does not match HEADWORD_RE or the
            second line is not empty.
        IndexError: if the entry has no lines after the empty separator line.
    """
    lines = entry.split("\n")
    m = HEADWORD_RE.match(lines[0])
    if m is None:
        raise Exception("Fail to parse headword", lines)
    if len(lines) > 1 and len(lines[1]) != 0:
        raise Exception("Headword is not separated from article", lines)
    if len(lines) < 3:
        # Same exception type the bare index access would produce, but with
        # a message that says what is wrong.
        raise IndexError("No article body after headword", lines)
    headword = m.group(1)
    article = DICT.get(headword)
    if article is None:
        article = Article()
        DICT[headword] = article
    curr = 2
    # PRON_RE only matches a non-empty "[...]", so a match always yields a
    # usable pronunciation.
    m = PRON_RE.match(lines[curr])
    pron = None
    if m is not None:
        curr += 1
        pron = m.group(1)
    article.addVar(headword, pron=pron)
    for line in lines[curr:]:
        # "  []" would also match TRANSL_RE, so it has to be skipped first.
        if EMPTY_PRON_RE.match(line):
            continue
        m = TRANSL_RE.match(line)
        if m:
            article.addTransl(pos=pos, tran=m.group(1))
def parse_file(fn, pos):
    """Parse every article of the file `fn` with `parse_entry`.

    `fn` is the path of the input file; `pos` is the part-of-speech string
    passed to `parse_entry` for each article.  The file content is split on
    ARTICLE_SEP and the piece before the first separator is skipped.

    Raises:
        Exception: wrapping any exception raised by `parse_entry`, together
            with the file name and the text of the failing entry.
    """
    # `with` closes the file even when reading fails; the content is parsed
    # only after the handle has been released.
    with open(fn) as f:
        content = f.read()
    entries = iter(content.split(ARTICLE_SEP))
    next(entries)
    for entry in entries:
        try:
            parse_entry(entry, pos)
        except Exception as e:
            raise Exception(e, fn, entry)
# Input files with the part-of-speech tag given to their translations.
# They are parsed in the listed order.
for _fn, _pos in (
    ('gadict-regular-verbs-en-ru.dict-c5', 'v'),
    ('gadict-adjective-en-ru.dict-c5', 'adj'),
    ('gadict-adverb-en-ru.dict-c5', 'adv'),
    ('gadict-conjunction-en-ru.dict-c5', 'conj'),
    ('gadict-en-ru.dict-c5', 'n'),
    ('gadict-numeral-en-ru.dict-c5', 'num'),
    ('gadict-phrasal-verbs-en-ru.dict-c5', 'phr.v'),
    ('gadict-preposition-en-ru.dict-c5', 'prep'),
    ('gadict-pronoun-en-ru.dict-c5', 'pron'),
):
    parse_file(_fn, _pos)
# Write all collected articles to `conv.gadict`, ordered by DICT key.
# Each article starts with a "__" line and an empty line, followed by one
# group of lines per variant (word, optional " [pron]" line, one line per
# attribute) and then one "pos" / "ru: translation" group per translation.
with open("conv.gadict", "w") as f:
    for baseword in sorted(DICT):
        art = DICT[baseword]
        f.write('__\n\n')
        for headword, var in art.vars.store.items():
            f.write(headword + '\n')
            if var.pron is not None:
                f.write(' [' + var.pron + ']\n')
            # `attrs` is a set; sort it so the output file is identical from
            # run to run regardless of set iteration order.
            for attr in sorted(var.attrs):
                f.write(' ' + attr + '\n')
        for tran in art.trans:
            f.write('\n' + tran.pos + '\nru: ' + tran.tran + '\n')