author | Oleksandr Gavenko <gavenkoa@gmail.com> |
Thu, 29 Dec 2016 01:20:21 +0200 | |
changeset 724 | 98fd211d27db |
parent 319 | 75430afe1a43 |
permissions | -rw-r--r-- |
319
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
1 |
""" |
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
2 |
Put *.dict-c5 files into the directory with this script and run:

  $ python conv-c5-to-gadict.py

The output is saved to the `conv.gadict` file.
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
7 |
""" |
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
8 |
|
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
9 |
import re |
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
10 |
|
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
11 |
# Text that separates two articles in a C5 file: a line of five underscores
# followed by an empty line.
ARTICLE_SEP = """
_____

"""
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
15 |
|
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
16 |
|
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
17 |
class Variance:
    """Pronunciation and attribute set attached to one spelling of a word.

    `pron` is a `str` or `None`; `attrs` is a `str`, a `set` of `str` or
    `None`.
    """

    def __init__(self, pron=None, attrs=None):
        self.pron = pron
        self.attrs = set()
        self.addAttrs(attrs)

    def addAttrs(self, attrs):
        """Merge `attrs` into the attribute set.

        `attrs` may be `None` (nothing is added), a single `str` or a `set`
        of `str`.  Any other type raises `TypeError`.
        """
        if attrs is None:
            return
        if isinstance(attrs, str):
            self.attrs.add(attrs)
        elif isinstance(attrs, set):
            self.attrs.update(attrs)
        else:
            raise TypeError("Should be str or set...", type(attrs))

    def __repr__(self):
        # `pron` defaults to None, which cannot be concatenated to a str
        # directly; str() keeps the output unchanged for real pronunciations.
        return "<pron: " + str(self.pron) + ">"
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
35 |
|
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
36 |
|
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
37 |
class Variances:
    """Collection of `Variance` objects keyed by the word's spelling."""

    def __init__(self):
        self.store = {}

    def add(self, word, pron=None, attrs=None):
        """Register `word` and merge `pron` / `attrs` into its `Variance`.

        `word` is a `str`, `pron` is a `str` or `None`, `attrs` is passed to
        `Variance.addAttrs` (a `str`, a `set` of `str` or `None`).
        """
        if word not in self.store:
            self.store[word] = Variance()
        var = self.store[word]
        if pron is not None:
            if var.pron is not None:
                # A clash is reported but is not fatal: the pronunciation
                # passed in last replaces the stored one.
                print("two pronunciations detected!!", pron, var.pron)
            var.pron = pron
        var.addAttrs(attrs)

    def __repr__(self):
        return repr(self.store)
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
55 |
|
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
56 |
|
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
57 |
class Translation:
    """A single translation of an article tagged with its part of speech."""

    def __init__(self, pos, tran):
        """`pos` is `str` like 'n' or 'v', `tran` is `str`."""
        self.pos = pos
        self.tran = tran

    def __repr__(self):
        # format() instead of `+` so that repr() never raises, even when a
        # field holds something other than a str.
        return "<pos: {}, tran: {}>".format(self.pos, self.tran)
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
65 |
|
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
66 |
|
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
67 |
class Article:
    """One dictionary article: its word variances and its translations."""

    def __init__(self):
        self.vars = Variances()
        self.trans = []

    def addVar(self, word, pron=None, attrs=None):
        """Record spelling `word` with an optional `pron` and `attrs`.

        The arguments are forwarded unchanged to `Variances.add`.
        """
        self.vars.add(word, pron, attrs)

    def addTransl(self, pos, tran):
        """Append a `Translation` built from `pos` and `tran`."""
        self.trans.append(Translation(pos, tran))
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
79 |
|
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
80 |
|
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
81 |
# Headword -> Article; filled in by the parsing code below.
DICT = {}
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
82 |
|
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
83 |
# Parse the irregular verbs dictionary.  Each article is expected to hold:
# headwords separated by "; ", an empty line, an optional line with one
# [pronunciation] per headword, three verb form lines and one translation.
with open('gadict-irregular-verbs-en-ru.dict-c5') as f:
    content = f.read()
content = content.split(ARTICLE_SEP)
content = iter(content)
# Drop the text that precedes the first article separator.
next(content)

# NOTE(review): the run of spaces after `^` in these patterns may have been
# collapsed when this listing was extracted -- confirm against the
# indentation actually used in the .dict-c5 files.
PRON_LINE_RE = re.compile(r'^ \[')
PRON_RE = re.compile(r'\[([^]]+)]')

# Verb form lines: an optional "inf. " / "p. " / "p.p. " prefix, the form and
# an optional alternative form after "/".
V1_RE = re.compile(r"^ (?:inf\. )?([^/]+)/?(.*)?")
# The dot in the "p." prefix is escaped, as in V1_RE and V3_RE.
V2_RE = re.compile(r"^ (?:p\. )?([^/ ]+)/?(.*)?")
V3_RE = re.compile(r"^ (?:p\.p\. )?([^/ ]+)/?(.*)?")
RU_RE = re.compile(r"^ (.+)")

for piece in content:
    article = Article()
    lines = piece.split("\n")
    headwords = lines[0].split("; ")
    for title in headwords:
        article.addVar(title)
    # Explicit raise rather than `assert`, which is stripped under -O.
    if lines[1] != "":
        raise Exception("Headword is not separated from article", lines)
    curr = 2
    line = lines[curr]
    if PRON_LINE_RE.match(line):
        curr += 1
        prons = PRON_RE.findall(line)
        if len(prons) != len(headwords):
            raise Exception("some prononsiation missing for", headwords, prons)
        for headword, pron in zip(headwords, prons):
            article.addVar(headword, pron=pron)
    # Three consecutive lines carry the verb forms tagged v1, v2 and v3.
    for form_re, attr in ((V1_RE, "v1"), (V2_RE, "v2"), (V3_RE, "v3")):
        line = lines[curr]
        m = form_re.match(line)
        if not m:
            raise Exception("Can't match", line)
        article.addVar(m.group(1), attrs=attr)
        if len(m.group(2)) > 0:
            article.addVar(m.group(2), attrs=attr)
        curr += 1
    try:
        line = lines[curr]
    except IndexError:
        raise IndexError("No translation after", lines[curr-1], lines)
    m = RU_RE.match(line)
    if m:
        article.addTransl(pos="v", tran=m.group(1))
    else:
        raise Exception("Can't match", line)
    # The translation has to be the last line of the article.
    if len(lines) != curr+1:
        raise Exception("Unknown line", line, lines, len(lines), curr)
    DICT[headwords[0]] = article
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
155 |
|
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
156 |
# Patterns used by parse_entry().
# NOTE(review): the run of spaces after `^` in these patterns may have been
# collapsed when this listing was extracted -- confirm against the
# indentation actually used in the .dict-c5 files.
HEADWORD_RE = re.compile(r"^([a-zA-Z][a-z -]*)$")
# Rebinds the PRON_RE name used earlier for the irregular verbs file; this
# version has to match the whole pronunciation line.
PRON_RE = re.compile(r"^ \[([^]]+)\]$")
# Translation line with an optional "1) " .. "9) " numbering prefix.
TRANSL_RE = re.compile(r"^ (?:[1-9]\) )?(.+)$")
EMPTY_PRON_RE = re.compile(r"^ \[\]$")
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
160 |
|
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
161 |
|
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
162 |
def parse_entry(entry, pos):
    """Parse one C5 article and merge it into the module-level `DICT`.

    `entry` is the text of a single article: a headword line, an empty
    line, an optional pronunciation line and then translation lines.
    `pos` is the part of speech stored with every translation found.

    If the headword is already in `DICT` the existing `Article` is
    extended, otherwise a new one is created.

    Raises `Exception` when the headword line does not match
    `HEADWORD_RE`, when the headword is not followed by an empty line, or
    when nothing follows that empty line.
    """
    lines = entry.split("\n")
    m = HEADWORD_RE.match(lines[0])
    if m is None:
        raise Exception("Fail to parse headword", lines)
    if len(lines) < 2 or len(lines[1]) != 0:
        raise Exception("Headword is not separated from article", lines)
    # Validate before touching DICT so that a malformed entry does not leave
    # an empty Article behind.
    if len(lines) < 3:
        raise Exception("Article has no body", lines)
    headword = m.group(1)
    article = DICT.get(headword)
    if article is None:
        article = Article()
        DICT[headword] = article
    curr = 2
    pron = None
    m = PRON_RE.match(lines[curr])
    if m is not None:
        curr += 1
        pron = m.group(1) or None
    article.addVar(headword, pron=pron)
    for line in lines[curr:]:
        if EMPTY_PRON_RE.match(line):
            continue
        m = TRANSL_RE.match(line)
        # Lines that do not look like a translation are skipped.
        if m:
            article.addTransl(pos=pos, tran=m.group(1))
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
193 |
|
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
194 |
|
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
195 |
def parse_file(fn, pos):
    """Parse one C5 dictionary file, passing every article to ``parse_entry``.

    The file content is split on ``ARTICLE_SEP``.  The chunk in front of the
    first separator is skipped (presumably the C5 file header -- confirm
    against the input files); each remaining chunk is handed to
    ``parse_entry`` together with ``pos``.

    :param fn: path of the C5 file to read.
    :param pos: value forwarded unchanged to ``parse_entry`` for every
        article in this file (callers pass tags such as ``'v'`` or ``'adj'``).
    :raises Exception: with args ``(original_error, fn, entry)`` when
        ``parse_entry`` fails, so the offending file and article text are
        visible in the traceback.
    """
    # Read the whole file and release the handle immediately, so it is
    # closed even when one of the articles below fails to parse.
    with open(fn) as f:
        content = f.read()

    entries = iter(content.split(ARTICLE_SEP))
    # str.split() always yields at least one element, so this cannot raise
    # StopIteration; it only discards the text before the first separator.
    next(entries)
    for entry in entries:
        try:
            parse_entry(entry, pos)
        except Exception as e:
            # Re-raise with context about where the failure happened.
            raise Exception(e, fn, entry)
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
207 |
|
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
208 |
|
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
209 |
# Input C5 files paired with the tag handed to parse_file() for each of them.
# The files are processed in exactly this order; to convert another
# dictionary, append a (file name, tag) pair here.
_C5_INPUTS = (
    ('gadict-regular-verbs-en-ru.dict-c5', 'v'),
    ('gadict-adjective-en-ru.dict-c5', 'adj'),
    ('gadict-adverb-en-ru.dict-c5', 'adv'),
    ('gadict-conjunction-en-ru.dict-c5', 'conj'),
    ('gadict-en-ru.dict-c5', 'n'),
    ('gadict-numeral-en-ru.dict-c5', 'num'),
    ('gadict-phrasal-verbs-en-ru.dict-c5', 'phr.v'),
    ('gadict-preposition-en-ru.dict-c5', 'prep'),
    ('gadict-pronoun-en-ru.dict-c5', 'pron'),
)

for _c5_name, _c5_tag in _C5_INPUTS:
    parse_file(_c5_name, _c5_tag)
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
222 |
|
75430afe1a43
Script for converting C5 format to gadict format.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
223 |
# Write every collected article to ``conv.gadict``, ordered by base word.
# The ``with`` block guarantees the output is flushed and the handle closed
# even if a write (or an attribute lookup on an article) raises midway.
# NOTE(review): the indent literals (' [' and ' ') are copied as seen in the
# reviewed listing -- confirm their exact whitespace against the gadict format.
with open("conv.gadict", "w") as f:
    for baseword in sorted(DICT.keys()):
        art = DICT[baseword]
        # Article separator line followed by a blank line.
        f.write('__\n\n')
        # Headword section: one line per variant, followed by its optional
        # pronunciation line and one line per attribute.
        for (headword, var) in art.vars.store.items():
            f.write(headword)
            f.write('\n')
            if var.pron is not None:
                f.write(' [')
                f.write(var.pron)
                f.write(']\n')
            for attr in var.attrs:
                f.write(' ')
                f.write(attr)
                f.write('\n')
        # Translation section: each translation is preceded by a blank line,
        # then its ``pos`` line and an ``ru: `` line.
        for tran in art.trans:
            f.write('\n')
            f.write(tran.pos)
            f.write('\nru: ')
            f.write(tran.tran)
            f.write('\n')