|
1 """ |
|
Put the *.dict-c5 files into the directory containing this script and run:
|
3 |
|
4 $ python conv-c5-to-gadict.py |
|
5 |
|
The output is saved to the `conv.gadict` file.
|
7 """ |
|
8 |
|
9 import re |
|
10 |
|
# Text that separates two articles in a .dict-c5 file: a newline, a line of
# five underscores, and one empty line.
ARTICLE_SEP = "\n_____\n\n"
|
15 |
|
16 |
|
class Variance:
    """One written form of a word: its pronunciation and a set of attribute tags."""

    def __init__(self, pron=None, attrs=None):
        """Create a variant.

        `pron` is a `str` or `None` (pronunciation not known yet).
        `attrs` is a `str`, a `set` of `str`, or `None`; see `addAttrs`.
        """
        self.pron = pron
        self.attrs = set()
        self.addAttrs(attrs)

    def addAttrs(self, attrs):
        """Merge `attrs` into the attribute set of this variant.

        `None` is ignored, a `str` is added as a single tag and a `set` is
        merged element-wise.  Any other type raises `TypeError`.
        """
        if attrs is None:
            return
        if isinstance(attrs, str):
            self.attrs.add(attrs)
        elif isinstance(attrs, set):
            self.attrs.update(attrs)
        else:
            raise TypeError("Should be str or set...", type(attrs))

    def __repr__(self):
        # `pron` stays None until a pronunciation is recorded; concatenating
        # it directly to a str would raise TypeError, so convert it first.
        return "<pron: " + str(self.pron) + ">"
|
35 |
|
36 |
|
class Variances:
    """Collection of `Variance` records keyed by the written form of the word."""

    def __init__(self):
        self.store = {}

    def add(self, word, pron=None, attrs=None):
        """Record `word`, creating its `Variance` the first time it is seen.

        `word` is a `str`; `pron` is a `str` or `None`; `attrs` is handed to
        `Variance.addAttrs` unchanged.  A non-None `pron` overwrites the
        stored pronunciation; if one was already stored, a notice is printed
        first.
        """
        try:
            entry = self.store[word]
        except KeyError:
            entry = Variance()
            self.store[word] = entry
        if pron is not None:
            if entry.pron is not None:
                # Reported but deliberately not fatal: the new value wins.
                print("two pronunciations detected!!", pron, entry.pron)
            entry.pron = pron
        entry.addAttrs(attrs)

    def __repr__(self):
        return "{!r}".format(self.store)
|
55 |
|
56 |
|
class Translation:
    """A translation text paired with its part-of-speech tag."""

    def __init__(self, pos, tran):
        """`pos` is a `str` tag such as 'n' or 'v'; `tran` is the translation `str`."""
        self.tran = tran
        self.pos = pos

    def __repr__(self):
        # str.join, like the `+` operator, rejects non-str members.
        pieces = ("<pos: ", self.pos, ", tran: ", self.tran, ">")
        return "".join(pieces)
|
65 |
|
66 |
|
class Article:
    """A dictionary article: the word forms it covers plus its translations."""

    def __init__(self):
        self.trans = []
        self.vars = Variances()

    def addVar(self, word, pron=None, attrs=None):
        """Register the form `word` with optional pronunciation and attributes.

        The arguments are forwarded unchanged to `Variances.add`.
        """
        self.vars.add(word, pron, attrs)

    def addTransl(self, pos, tran):
        """Append a `Translation` built from the tag `pos` and the text `tran`."""
        item = Translation(pos, tran)
        self.trans.append(item)
|
79 |
|
80 |
|
# Maps the first headword of every article to its `Article`.
DICT = {}

# Read the irregular-verbs file in one go; the context manager closes the
# file even when reading fails.
with open('gadict-irregular-verbs-en-ru.dict-c5') as f:
    content = f.read()
# Whatever precedes the first separator is not an article, so drop it.
content = iter(content.split(ARTICLE_SEP))
next(content)

# NOTE(review): the number of leading spaces in these patterns is reproduced
# as it appears in the reviewed text; confirm it matches the indentation used
# inside the .dict-c5 input files.
PRON_LINE_RE = re.compile(r'^ \[')
PRON_RE = re.compile(r'\[([^]]+)]')

# Verb-form lines: an optional label, the form, and an optional alternative
# form after a slash.  The dots of the labels are escaped so that they match
# a literal '.' only.
V1_RE = re.compile(r"^ (?:inf\. )?([^/]+)/?(.*)?")
V2_RE = re.compile(r"^ (?:p\. )?([^/ ]+)/?(.*)?")
V3_RE = re.compile(r"^ (?:p\.p\. )?([^/ ]+)/?(.*)?")
RU_RE = re.compile(r"^ (.+)")

# Expected article layout, line by line: headwords joined by "; ", an empty
# line, an optional pronunciation line, three verb-form lines (v1, v2, v3)
# and exactly one translation line.
for piece in content:
    article = Article()
    lines = piece.split("\n")
    headwords = lines[0].split("; ")
    for title in headwords:
        article.addVar(title)
    # Raise instead of assert: assertions are stripped when running with -O.
    if lines[1] != "":
        raise Exception("Headword is not separated from article", lines)
    curr = 2
    line = lines[curr]
    if PRON_LINE_RE.match(line):
        curr += 1
        prons = PRON_RE.findall(line)
        if len(prons) != len(headwords):
            raise Exception("some prononsiation missing for", headwords, prons)
        # One pronunciation per headword, in the same order.
        for title, pron in zip(headwords, prons):
            article.addVar(title, pron=pron)
    for form_re, form_attr in ((V1_RE, "v1"), (V2_RE, "v2"), (V3_RE, "v3")):
        line = lines[curr]
        m = form_re.match(line)
        if m is None:
            raise Exception("Can't match", line)
        article.addVar(m.group(1), attrs=form_attr)
        if m.group(2):
            # Alternative form given after the slash.
            article.addVar(m.group(2), attrs=form_attr)
        curr += 1
    try:
        line = lines[curr]
    except IndexError:
        raise IndexError("No translation after", lines[curr-1], lines)
    m = RU_RE.match(line)
    if m is None:
        raise Exception("Can't match", line)
    article.addTransl(pos="v", tran=m.group(1))
    # The translation must be the last line of the article.
    if len(lines) != curr + 1:
        raise Exception("Unknown line", line, lines, len(lines), curr)
    DICT[headwords[0]] = article
|
155 |
|
# Patterns used by `parse_entry`.
# NOTE(review): the number of leading spaces in these patterns is reproduced
# as it appears in the reviewed text; confirm it matches the indentation used
# inside the .dict-c5 input files.

# First line of an entry: a letter followed by lowercase letters, spaces or
# hyphens.
HEADWORD_RE = re.compile(r"^([a-zA-Z][a-z -]*)$")

# A pronunciation line with content, and one with empty brackets.
PRON_RE = re.compile(r"^ \[([^]]+)\]$")
EMPTY_PRON_RE = re.compile(r"^ \[\]$")

# A translation line; a leading "1) " .. "9) " counter is not captured.
TRANSL_RE = re.compile(r"^ (?:[1-9]\) )?(.+)$")
|
160 |
|
161 |
|
def parse_entry(entry, pos):
    """Parse one article text and merge it into `DICT`.

    `entry` is the text of a single article and `pos` is the part-of-speech
    tag given to every translation found in it.  The first line must match
    `HEADWORD_RE` and the second line must be empty, otherwise `Exception`
    is raised.  An existing `DICT` article for the headword is extended;
    otherwise a new one is created and stored.
    """
    rows = entry.split("\n")
    head_match = HEADWORD_RE.match(rows[0])
    if head_match is None:
        raise Exception("Fail to parse headword", rows)
    if rows[1] != "":
        raise Exception("Headword is not separated from article", rows)
    headword = head_match.group(1)
    try:
        article = DICT[headword]
    except KeyError:
        article = Article()
        DICT[headword] = article
    # An optional pronunciation line may directly follow the empty line.
    pron = None
    start = 2
    pron_match = PRON_RE.match(rows[start])
    if pron_match is not None:
        start += 1
        pron = pron_match.group(1) or None
    article.addVar(headword, pron=pron)
    # Every remaining line is a translation candidate; empty pronunciation
    # brackets and lines that do not match are skipped.
    for row in rows[start:]:
        if EMPTY_PRON_RE.match(row):
            continue
        found = TRANSL_RE.match(row)
        if found:
            article.addTransl(pos=pos, tran=found.group(1))
|
193 |
|
194 |
|
def parse_file(fn, pos):
    """Parse every article of the file `fn` with `parse_entry`.

    `fn` is the path of a .dict-c5 file and `pos` is the part-of-speech tag
    passed to `parse_entry` for each article.  The text before the first
    `ARTICLE_SEP` is skipped.  Any exception raised while parsing an article
    is re-raised as `Exception(original, fn, entry)` so that the failing
    file and article are visible.
    """
    # Read everything first; the context manager closes the file on every
    # path, including when a later article fails to parse.
    with open(fn) as f:
        content = f.read()
    content = iter(content.split(ARTICLE_SEP))
    next(content)
    for entry in content:
        try:
            parse_entry(entry, pos)
        except Exception as e:
            raise Exception(e, fn, entry)
|
207 |
|
208 |
|
# Input files paired with the part-of-speech tag given to their
# translations; they are processed in the order listed.  Add further
# sources here.
for _source, _tag in (
    ('gadict-regular-verbs-en-ru.dict-c5', 'v'),
    ('gadict-adjective-en-ru.dict-c5', 'adj'),
    ('gadict-adverb-en-ru.dict-c5', 'adv'),
    ('gadict-conjunction-en-ru.dict-c5', 'conj'),
    ('gadict-en-ru.dict-c5', 'n'),
    ('gadict-numeral-en-ru.dict-c5', 'num'),
    ('gadict-phrasal-verbs-en-ru.dict-c5', 'phr.v'),
    ('gadict-preposition-en-ru.dict-c5', 'prep'),
    ('gadict-pronoun-en-ru.dict-c5', 'pron'),
):
    parse_file(_source, _tag)
|
222 |
|
# Write all collected articles, ordered by base headword, to `conv.gadict`.
# The context manager closes the file even if a write fails.
# NOTE(review): the whitespace inside the written literals is reproduced as
# it appears in the reviewed text; confirm it against the gadict format.
with open("conv.gadict", "w") as f:
    for baseword in sorted(DICT):
        art = DICT[baseword]
        f.write('__\n\n')
        for headword, var in art.vars.store.items():
            f.write(headword)
            f.write('\n')
            if var.pron is not None:
                f.write(' [')
                f.write(var.pron)
                f.write(']\n')
            # A set has no stable iteration order between runs; sorting
            # keeps the generated file reproducible.
            for attr in sorted(var.attrs):
                f.write(' ')
                f.write(attr)
                f.write('\n')
        for tran in art.trans:
            f.write('\n')
            f.write(tran.pos)
            f.write('\nru: ')
            f.write(tran.tran)
            f.write('\n')