# Removed BOM marker.
"""
Put the *.dict-c5 files in the same directory as this script and run:
$ python conv-c5-to-gadict.py
The output is saved to the `conv.gadict` file.
"""
import re
# Separator between articles in the input text: a line of five underscores
# with a newline on each side.  The file contents are split on this string.
ARTICLE_SEP = """
_____
"""
class Variance:
    """One spelling variant of a word: its pronunciation and attributes.

    Attributes:
        pron: pronunciation (`str`), or `None` when it is not known.
        attrs: `set` of attribute strings attached to this variant.
    """

    def __init__(self, pron=None, attrs=None):
        """Create a variant.

        `pron` is a `str` or `None`; `attrs` is `None`, a single `str` or a
        `set` of `str` (it is handed to `addAttrs`).
        """
        self.pron = pron
        self.attrs = set()
        self.addAttrs(attrs)

    def addAttrs(self, attrs):
        """Merge `attrs` into the attribute set of this variant.

        `None` is ignored, a `str` is added as one attribute, a `set` is
        merged element by element.  Any other type raises `TypeError`.
        """
        if attrs is None:
            return
        if isinstance(attrs, str):
            self.attrs.add(attrs)
        elif isinstance(attrs, set):
            self.attrs.update(attrs)
        else:
            raise TypeError("Should be str or set...", type(attrs))

    def __repr__(self):
        # `pron` defaults to None, so it is formatted instead of being
        # concatenated: `"<pron: " + None` would raise TypeError.
        return "<pron: {}>".format(self.pron)
class Variances:
    """Collection of `Variance` objects keyed by the spelling of the word."""

    def __init__(self):
        # Maps word (str) -> Variance.
        self.store = {}

    def add(self, word, pron=None, attrs=None):
        """Register `word`, optionally setting pronunciation and attributes.

        `word` is a `str`, `pron` is a `str` or `None`, `attrs` is `None`,
        a `str` or a `set` of `str` (it is passed to `Variance.addAttrs`).

        Repeated calls for the same word accumulate attributes.  A
        non-`None` `pron` replaces the stored one; a warning is printed when
        the replacement actually changes an existing pronunciation.
        """
        var = self.store.get(word)
        if var is None:
            var = self.store[word] = Variance()
        if pron is not None:
            # Warn only on a real conflict: re-adding the very same
            # pronunciation is not "two pronunciations".  This stays a
            # warning rather than an error; the newest value wins.
            if var.pron is not None and var.pron != pron:
                print("two pronunciations detected!!", pron, var.pron)
            var.pron = pron
        var.addAttrs(attrs)

    def __repr__(self):
        return repr(self.store)
class Translation:
    """A single translation together with its part-of-speech tag."""

    def __init__(self, pos, tran):
        """Keep the part of speech `pos` (a `str` such as 'n' or 'v') and
        the translated text `tran` (a `str`)."""
        self.pos, self.tran = pos, tran

    def __repr__(self):
        # Joining (rather than formatting) keeps the requirement that both
        # fields are strings.
        pieces = ("<pos: ", self.pos, ", tran: ", self.tran, ">")
        return "".join(pieces)
class Article:
    """A dictionary article: word variants plus a list of translations."""

    def __init__(self):
        # `trans` holds Translation objects in insertion order,
        # `vars` holds the variants of the article.
        self.trans = []
        self.vars = Variances()

    def addVar(self, word, pron=None, attrs=None):
        """Record the variant `word`; `pron` and `attrs` are passed
        unchanged to `Variances.add`."""
        variants = self.vars
        variants.add(word, pron, attrs)

    def addTransl(self, pos, tran):
        """Append a `Translation` built from `pos` and `tran`."""
        item = Translation(pos, tran)
        self.trans.append(item)
# Global result of all parsing steps: base headword (str) -> Article.
DICT = {}

# The irregular-verbs file has its own article layout and is parsed inline:
#   line 0   headwords separated by "; "
#   line 1   empty
#   line 2   optional pronunciation line, one "[...]" group per headword
#   next 3   verb-form lines (optional "inf. ", "p. ", "p.p. " prefixes),
#            tagged "v1", "v2", "v3"; "a/b" gives two spellings of one form
#   last     translation
# NOTE(review): the file is read with the platform default encoding, exactly
# as before -- confirm whether an explicit encoding should be used.
with open('gadict-irregular-verbs-en-ru.dict-c5') as f:
    content = f.read()
content = content.split(ARTICLE_SEP)
content = iter(content)
next(content)  # skip the text that precedes the first separator

PRON_LINE_RE = re.compile(r'^ \[')
PRON_RE = re.compile(r'\[([^]]+)]')
V1_RE = re.compile(r"^ (?:inf\. )?([^/]+)/?(.*)?")
# The dot of the "p. " prefix is escaped, consistent with V1_RE and V3_RE.
V2_RE = re.compile(r"^ (?:p\. )?([^/ ]+)/?(.*)?")
V3_RE = re.compile(r"^ (?:p\.p\. )?([^/ ]+)/?(.*)?")
RU_RE = re.compile(r"^ (.+)")

for piece in content:
    article = Article()
    lines = piece.split("\n")
    headwords = lines[0].split("; ")
    for title in headwords:
        article.addVar(title)
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if len(lines) < 2 or lines[1] != "":
        raise Exception("Headword is not separated from article", lines)
    curr = 2
    line = lines[curr]
    if PRON_LINE_RE.match(line):
        curr += 1
        prons = PRON_RE.findall(line)
        if len(prons) != len(headwords):
            raise Exception("some prononsiation missing for", headwords, prons)
        for title, pron in zip(headwords, prons):
            article.addVar(title, pron=pron)
    # The three verb-form lines are handled identically apart from the
    # pattern and the attribute that marks the form.
    for form_re, form_attr in ((V1_RE, "v1"), (V2_RE, "v2"), (V3_RE, "v3")):
        line = lines[curr]
        m = form_re.match(line)
        if not m:
            raise Exception("Can't match", line)
        article.addVar(m.group(1), attrs=form_attr)
        if m.group(2):
            # Alternative spelling after the "/".
            article.addVar(m.group(2), attrs=form_attr)
        curr += 1
    try:
        line = lines[curr]
    except IndexError as err:
        raise IndexError("No translation after", lines[curr-1], lines) from err
    m = RU_RE.match(line)
    if not m:
        raise Exception("Can't match", line)
    article.addTransl(pos="v", tran=m.group(1))
    # The translation has to be the last line of the article.
    if len(lines) != curr+1:
        raise Exception("Unknown line", line, lines, len(lines), curr)
    DICT[headwords[0]] = article
# Patterns for the article layout handled by parse_entry().
# Headword line: an ASCII letter followed by lowercase letters, spaces or
# hyphens, and nothing else.
HEADWORD_RE = re.compile(r"^([a-zA-Z][a-z -]*)$")
# Pronunciation line: one non-empty "[...]" group filling the whole line.
# This rebinds the PRON_RE name defined earlier in the file.
PRON_RE = re.compile(r"^ \[([^]]+)\]$")
# Translation line: optional "N) " numbering (N is a digit 1-9), then text.
TRANSL_RE = re.compile(r"^ (?:[1-9]\) )?(.+)$")
# Pronunciation placeholder with nothing between the brackets.
EMPTY_PRON_RE = re.compile(r"^ \[\]$")
def parse_entry(entry, pos):
    """Parse one article and merge it into the global `DICT`.

    `entry` is the article text (`str`): a headword line, an empty line, an
    optional pronunciation line and then translation lines.  `pos` is the
    part-of-speech tag (`str`) attached to every translation found.

    When the headword is already present in `DICT` its `Article` is
    extended, otherwise a new `Article` is created and stored.  Lines after
    the pronunciation that match neither `EMPTY_PRON_RE` nor `TRANSL_RE`
    are ignored.

    Raises `Exception` when the headword line cannot be parsed, when it is
    not followed by an empty line, or when nothing follows that empty line.
    """
    lines = entry.split("\n")
    m = HEADWORD_RE.match(lines[0])
    if m is None:
        raise Exception("Fail to parse headword", lines)
    # The length checks turn a truncated article into a descriptive error
    # instead of a bare IndexError, and reject it before DICT is touched.
    if len(lines) < 2 or len(lines[1]) != 0:
        raise Exception("Headword is not separated from article", lines)
    if len(lines) < 3:
        raise Exception("Article body is missing", lines)
    headword = m.group(1)
    article = DICT.get(headword)
    if article is None:
        article = DICT[headword] = Article()
    curr = 2
    pron = None
    m = PRON_RE.match(lines[curr])
    if m is not None:
        curr += 1
        # An empty capture counts as "no pronunciation".
        pron = m.group(1) or None
    article.addVar(headword, pron=pron)
    for line in lines[curr:]:
        if EMPTY_PRON_RE.match(line):
            continue
        m = TRANSL_RE.match(line)
        if m:
            article.addTransl(pos=pos, tran=m.group(1))
def parse_file(fn, pos):
    """Parse every article of the dictionary file `fn` into `DICT`.

    `fn` is the path of the file to read and `pos` the part-of-speech tag
    passed to `parse_entry` for each article.  The text before the first
    `ARTICLE_SEP` is skipped.

    Raises `Exception` carrying the original error, the file name and the
    offending article text when an article cannot be parsed.
    """
    # `with` guarantees the handle is closed; previously it stayed open
    # whenever an article failed to parse.
    with open(fn) as f:
        content = f.read()
    entries = iter(content.split(ARTICLE_SEP))
    next(entries)
    for entry in entries:
        try:
            parse_entry(entry, pos)
        except Exception as e:
            raise Exception(e, fn, entry) from e
# Dictionary files handled by parse_file(), each paired with the
# part-of-speech tag given to its translations.  They are processed in the
# listed order; further (file, tag) pairs can be appended here.
for _fn, _pos in (
        ('gadict-regular-verbs-en-ru.dict-c5', 'v'),
        ('gadict-adjective-en-ru.dict-c5', 'adj'),
        ('gadict-adverb-en-ru.dict-c5', 'adv'),
        ('gadict-conjunction-en-ru.dict-c5', 'conj'),
        ('gadict-en-ru.dict-c5', 'n'),
        ('gadict-numeral-en-ru.dict-c5', 'num'),
        ('gadict-phrasal-verbs-en-ru.dict-c5', 'phr.v'),
        ('gadict-preposition-en-ru.dict-c5', 'prep'),
        ('gadict-pronoun-en-ru.dict-c5', 'pron'),
):
    parse_file(_fn, _pos)
# Write every collected article to `conv.gadict`, ordered by base headword.
# An article starts with a "__" line, then lists each variant (headword,
# optional pronunciation in brackets, attributes) and finally each
# translation as a part-of-speech line followed by a "ru: " line.
# NOTE(review): the file is written with the platform default encoding,
# exactly as before -- confirm whether an explicit encoding should be used.
with open("conv.gadict", "w") as f:
    for baseword in sorted(DICT):
        art = DICT[baseword]
        f.write('__\n\n')
        for headword, var in art.vars.store.items():
            f.write(headword)
            f.write('\n')
            if var.pron is not None:
                f.write(' [' + var.pron + ']\n')
            # `attrs` is a set; sorting keeps the output identical from run
            # to run instead of following set iteration order.
            for attr in sorted(var.attrs):
                f.write(' ' + attr + '\n')
        for tran in art.trans:
            f.write('\n' + tran.pos + '\nru: ' + tran.tran + '\n')