385
|
1 |
|
|
2 |
import regex
|
|
3 |
|
|
4 |
|
|
5 |
class ParseException(Exception):
    """Error raised when the dictionary source violates the expected syntax.

    The human-readable description is kept in ``msg`` and is also passed to
    the base ``Exception`` so that ``args`` and ``str()`` stay consistent no
    matter how the exception was constructed (positionally or by keyword).
    """

    def __init__(self, msg):
        # Initialise the base class explicitly; otherwise ``args`` is empty
        # when the message is supplied as a keyword argument.
        super().__init__(msg)
        self.msg = msg

    def __str__(self):
        return self.msg

    def __repr__(self):
        # Kept as the bare message for backward compatibility with callers
        # that print the exception via ``repr()``.
        return self.msg
|
|
12 |
|
|
13 |
|
|
14 |
class Parser:
    """Line-oriented parser for the dictionary source format.

    The input consists of a prelude followed by articles.  Every article
    starts with a ``__`` delimiter line and an empty line, continues with one
    or more headwords (each optionally followed by indented pronunciation and
    attribute lines), an empty line, and then translation senses separated by
    empty lines.

    ``parse()`` returns a list of ``(words, senses)`` tuples where

    * ``words`` maps a headword to ``(pronunciation_or_None, set_of_attrs)``;
    * ``senses`` is a list of ``(pos_or_None, translations, examples)`` and
      both ``translations`` and ``examples`` are lists of ``(lang, text)``.
    """

    SEPARATOR_RE = regex.compile(r"^__$")
    HEADWORD_RE = regex.compile(r"^(\p{L}.*)$")
    HEADWORD_VAR_RE = regex.compile(r"^ +(s|pl|v[123]|male|female|comp|super)$")
    HEADWORD_PRON_RE = regex.compile(r"^ +\[([\p{L}' ]+)\]$")
    # The alternation must be grouped: without the group "^" binds only to the
    # first alternative and "$" only to the last one, so e.g. "num" would be
    # recognised as "n" and any line starting with "v" would be accepted.
    TRANSL_POS_RE = regex.compile(r"^(?:n|pron|adj|v|adv|prep|conj|num|int|phr\.v|abbr)$")
    TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(][\p{L}\p{P}~ ]*)$")
    TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> (\p{L}.*)$")

    TRAILING_SPACES_RE = regex.compile(r"\p{Z}+$")

    def __init__(self):
        # Parser state; everything is (re)initialised by parse().
        self.stream = None
        self.line = ""
        self.lineno = 0
        self.eof = False
        self.dom = []
        self.words = None
        self.tran = None

    def readline(self):
        """Read the next line into `self.line`, updating `self.eof` and `self.lineno`."""
        self.line = self.stream.readline()
        # readline() returns an empty string only at end of input; an empty
        # line in the file still carries its newline character.
        self.eof = len(self.line) == 0
        if not self.eof:
            self.lineno += 1

    def parse(self, stream):
        """Parse `stream` and return the list of ``(words, senses)`` articles.

        `stream` must provide ``readline()``.  On a syntax error a diagnostic
        is written and a plain ``Exception`` wrapping the ``ParseException``
        is raised.

        NOTE(review): `fout` and `fgadict` are not defined in this class; they
        are presumably module-level names defined elsewhere in the file --
        confirm they exist before parse() can fail.
        """
        self.lineno = 0
        self.stream = stream
        self.dom = []
        try:
            self.parse_prelude()
            while not self.eof:
                self.parse_article()
        except ParseException as ex:
            # search(), not match(): the pattern is anchored only at the end
            # of the line, so match() would fire solely for lines consisting
            # entirely of separators and miss "text followed by spaces".
            if self.TRAILING_SPACES_RE.search(self.line):
                fout.write("{:s}:{:d}: {:s}".format(fgadict, self.lineno, "Traling spaces detected...\n"))
            fout.write("{:s}:{:d}: {:s}\nLINE: {:s}\n".format(fgadict, self.lineno, str(ex), self.line))
            raise Exception(ex)
        return self.dom

    def parse_prelude(self):
        """Read dictionary prelude until first "__" delimiter."""
        while True:
            self.readline()
            if self.eof:
                raise ParseException("There are no articles...")
            if self.SEPARATOR_RE.match(self.line):
                break

    def parse_article(self):
        """Try to match article until next "__" delimiter. Assume that `self.line` point to "__" delimiter."""
        self.words = None
        self.tran = None
        self.parse_empty_line()
        self.parse_headlines()
        self.parse_translation()
        self.dom.append((self.words, self.tran))

    def parse_empty_line(self):
        """Consume the line after the "__" delimiter; it must hold a single character (the newline)."""
        self.readline()
        if self.eof or len(self.line) != 1:
            raise ParseException(""""__" delimiter should followed by empty line...""")

    def parse_headlines(self):
        """Try to match word variations with attributes. Assume that `self.line` is on the preceding empty line.

        Fills `self.words` with ``headword -> (pronunciation, attrs)`` and
        stops at end of input or at the first one-character (empty) line.
        """
        self.words = {}
        self.readline()
        if self.eof:
            raise ParseException("""There are no definition after "__" delimiter...""")
        m = self.HEADWORD_RE.match(self.line)
        if m is None:
            raise ParseException("""There are no headword after "__" delimiter...""")
        word = m.group(1)
        pron = None
        attrs = set()
        while True:
            self.readline()
            if self.eof or len(self.line) == 1:
                break
            m = self.HEADWORD_RE.match(self.line)
            if m is not None:
                # A new headword starts: store the one collected so far and
                # reset the per-headword accumulators.
                self.words[word] = (pron, attrs)
                word = m.group(1)
                pron = None
                attrs = set()
                continue
            m = self.HEADWORD_PRON_RE.match(self.line)
            if m is not None:
                if pron is not None:
                    raise ParseException("""Pronunciation is redefined...""")
                pron = m.group(1)
                continue
            m = self.HEADWORD_VAR_RE.match(self.line)
            if m is not None:
                attrs.add(m.group(1))
                continue
            raise ParseException("""Line is not headword or translation or headword attribute...""")
        # Store the last headword of the article.
        self.words[word] = (pron, attrs)

    def parse_translation(self):
        """Collect translation senses until the next "__" delimiter or end of input.

        Stores in `self.tran` a list of ``(pos, translations, examples)``.
        An empty line always closes the current sense (even an empty one);
        a sense still open when the article ends is kept only if it has at
        least one translation.
        """
        senses = []
        pos = None
        tr = []
        ex = []
        while True:
            self.readline()
            if self.eof or self.SEPARATOR_RE.match(self.line):
                break
            if len(self.line) == 1:
                senses.append((pos, tr, ex))
                pos = None
                tr = []
                ex = []
                continue
            m = self.TRANSL_POS_RE.match(self.line)
            if m is not None:
                if pos is not None:
                    raise ParseException("""Each translation should have only one part of speech marker...""")
                # "$" matches before the trailing newline, so group(0) is the
                # bare marker without the line terminator.
                pos = m.group(0)
                continue
            m = self.TRANSL_RE.match(self.line)
            if m is not None:
                tr.append((m.group(1), m.group(2)))
                continue
            m = self.TRANSL_EX_RE.match(self.line)
            if m is not None:
                ex.append((m.group(1), m.group(2)))
                continue
            raise ParseException("""Uknown syntax...""")
        if tr:
            senses.append((pos, tr, ex))
        self.tran = senses
|
|
147 |
|