equal
deleted
inserted
replaced
36 |
36 |
# Line-level patterns used by the parser.  Each pattern is applied to one
# input line at a time with ``.match()``.

# A line consisting of exactly two underscores.
SEPARATOR_RE = regex.compile(r"^__$")
# A line whose first character is a letter; group 1 is the whole line.
HEADWORD_RE = regex.compile(r"^(\p{L}.*)$")
# An indented keyword from a fixed set; group 1 is the keyword.
HEADWORD_VAR_RE = regex.compile(r"^ +(s|pl|v[123]|male|female|comp|super)$")
# An indented ``[...]`` holding letters, apostrophes and spaces; group 1 is
# the text between the brackets.
HEADWORD_PRON_RE = regex.compile(r"^ +\[([\p{L}' ]+)\]$")
# A part-of-speech marker occupying the whole line.  The alternatives are
# wrapped in a group so that ``^`` and ``$`` apply to every one of them.
# Without the group the pattern reads as ``^n`` OR ``pron`` OR ... OR
# ``prefix$``: any line starting with "n" matched the first alternative, so a
# "num" line produced ``group(0) == "n"`` and arbitrary text such as "verb"
# was accepted as "v".
TRANSL_POS_RE = regex.compile(
    r"^(?:n|pron|adj|v|adv|prep|conj|num|int|phr\.v|abbr|prefix)$"
)
# ``<lang>: <text>`` where the text starts with a letter or "("; group 1 is
# the language code, group 2 the text.
TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(].*)$")
# ``<lang>> <text>`` where the text starts with a letter; group 1 is the
# language code, group 2 the text.
TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> (\p{L}.*)$")
# ``topic: <text>`` where the text starts with a letter; group 1 is the text.
TOPIC_RE = regex.compile(r"^topic: (\p{L}.*)$")

# A line starting with one or more spaces; group 1 is the rest of the line
# after those spaces.
CONT_RE = regex.compile(r"^ +(.*)")

# One or more Unicode separator characters (``\p{Z}``) at the end of a line.
TRAILING_SPACES_RE = regex.compile(r"\p{Z}+$")
48 |
49 |
73 self.parse_article() |
74 self.parse_article() |
74 except ParseException as ex: |
75 except ParseException as ex: |
75 raise ParseException(ex.msg, self.lineno, self.line) |
76 raise ParseException(ex.msg, self.lineno, self.line) |
76 return self.dom |
77 return self.dom |
77 |
78 |
78 def parse_continuation(self): |
79 def parse_prelude_continuation(self): |
79 string = "" |
80 string = "" |
80 while True: |
81 while True: |
81 self.readline() |
82 self.readline() |
82 if self.eof: |
83 if self.eof: |
83 return string |
84 return string |
96 self.readline() |
97 self.readline() |
97 if self.eof: |
98 if self.eof: |
98 raise ParseException("There are no articles...") |
99 raise ParseException("There are no articles...") |
99 m = self.PRELUDE_ABOUT_RE.match(self.line) |
100 m = self.PRELUDE_ABOUT_RE.match(self.line) |
100 if m: |
101 if m: |
101 pre.about += m.group(1) + self.parse_continuation() |
102 pre.about += m.group(1) + self.parse_prelude_continuation() |
102 if self.eof: |
103 if self.eof: |
103 raise ParseException("There are no articles...") |
104 raise ParseException("There are no articles...") |
104 if self.SEPARATOR_RE.match(self.line): |
105 if self.SEPARATOR_RE.match(self.line): |
105 break |
106 break |
106 m = self.PRELUDE_NAME_RE.match(self.line) |
107 m = self.PRELUDE_NAME_RE.match(self.line) |
171 attrs.add(m.group(1)) |
172 attrs.add(m.group(1)) |
172 continue |
173 continue |
173 raise ParseException("""Line is not a headword or translation or headword attribute...""") |
174 raise ParseException("""Line is not a headword or translation or headword attribute...""") |
174 self.words[word] = (pron, attrs) |
175 self.words[word] = (pron, attrs) |
175 |
176 |
|
def parse_translation_continuation(self):
    """Collect the indented continuation lines that follow the current line.

    Lines are read with ``self.readline()`` until the input ends or a line
    does not match ``self.CONT_RE`` (a line starting with one or more
    spaces).

    Returns:
        The continuation text: for every continuation line, a newline
        followed by that line with its leading spaces removed.  The empty
        string when there is no continuation line.

    On return either ``self.eof`` is set, or ``self.line`` holds the first
    non-continuation line.  That line has already been read but not
    processed, so the caller must handle it before reading again.
    """
    pieces = []
    while True:
        self.readline()
        if self.eof:
            break
        m = self.CONT_RE.match(self.line)
        if m is None:
            # Not a continuation: leave it in self.line for the caller.
            break
        pieces.append("\n" + m.group(1))
    # Join once instead of growing a string on every iteration.
    return "".join(pieces)
|
188 |
176 def parse_translation(self): |
189 def parse_translation(self): |
177 senses = [] |
190 senses = [] |
178 pos = None |
191 pos = None |
179 tr = [] |
192 tr = [] |
180 ex = [] |
193 ex = [] |
181 while True: |
194 read = True |
182 self.readline() |
195 while True: |
|
196 if read: |
|
197 self.readline() |
|
198 read = True |
183 if self.eof: |
199 if self.eof: |
184 break |
200 break |
185 m = self.SEPARATOR_RE.match(self.line) |
201 m = self.SEPARATOR_RE.match(self.line) |
186 if m is not None: |
202 if m is not None: |
187 break |
203 break |
195 if m is not None: |
211 if m is not None: |
196 if pos is not None: |
212 if pos is not None: |
197 raise ParseException("""Each translation should have only one part of speech marker...""") |
213 raise ParseException("""Each translation should have only one part of speech marker...""") |
198 pos = m.group(0) |
214 pos = m.group(0) |
199 continue |
215 continue |
|
216 m = self.TOPIC_RE.match(self.line) |
|
217 if m is not None: |
|
218 # TODO |
|
219 continue |
200 m = self.TRANSL_RE.match(self.line) |
220 m = self.TRANSL_RE.match(self.line) |
201 if m is not None: |
221 if m is not None: |
202 tr.append((m.group(1), m.group(2))) |
222 tr.append((m.group(1), m.group(2) + self.parse_translation_continuation())) |
|
223 read = False |
203 continue |
224 continue |
204 m = self.TRANSL_EX_RE.match(self.line) |
225 m = self.TRANSL_EX_RE.match(self.line) |
205 if m is not None: |
226 if m is not None: |
206 ex.append((m.group(1), m.group(2))) |
227 ex.append((m.group(1), m.group(2) + self.parse_translation_continuation())) |
|
228 read = False |
207 continue |
229 continue |
208 raise ParseException("""Uknown syntax...""") |
230 raise ParseException("""Uknown syntax...""") |
209 if len(tr) > 0: |
231 if len(tr) > 0: |
210 senses.append((pos, tr, ex)) |
232 senses.append((pos, tr, ex)) |
211 self.tran = senses |
233 self.tran = senses |