123 return "<Sense {}>".format(str(self)) |
123 return "<Sense {}>".format(str(self)) |
124 |
124 |
125 class Parser: |
125 class Parser: |
126 """gadict dictionary format parser.""" |
126 """gadict dictionary format parser.""" |
127 |
127 |
# Line-level patterns of the gadict format.  Patterns are kept as ``u""``
# literals with doubled backslashes (not raw strings) so the non-ASCII IPA
# characters stay unicode on interpreters where plain literals are bytes.
# ``re.UNICODE`` is passed wherever ``\w`` / ``\s`` are used so that those
# classes cover non-ASCII text; it is a no-op for str patterns on Python 3.

# A comment line: "#" followed by a space at the very start of the line.
COMMENT_RE = re.compile(u"^# ")

# Article separator: a line consisting of exactly two underscores.
SEPARATOR_RE = re.compile(u"^__$", re.UNICODE)
# Headword: an unindented line starting with a word character.
# re.UNICODE added for consistency with every other ``\w`` pattern here.
HEADWORD_RE = re.compile(u"^(\\w.*)$", re.UNICODE)
# Indented attribute of a headword (number, verb form, gender, dialect, ...).
HEADWORD_VAR_RE = re.compile(u"^ +(rare|s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$", re.UNICODE)
# Indented pronunciation in square brackets, restricted to a fixed IPA set.
HEADWORD_PRON_RE = re.compile(u"^ +\\[([a-zˌˈːəæɛɒʊɪɔɜɑʌɚɹʃʧθðɡʒŋɾ ]+)\\]$", re.UNICODE)
# Part-of-speech marker occupying a whole line.
TRANSL_POS_RE = re.compile(u"^(?:n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\\.v|contr|abbr|prefix)$", re.UNICODE)
# "<lang>: text" - translation; text starts with a word character or "(".
TRANSL_RE = re.compile(u"^(ru|uk|la|en): ([\\w(].*)$", re.UNICODE)
# "<lang>> text" - example; text may start with "-", a quote or a word char.
TRANSL_EX_RE = re.compile(u"""^(ru|uk|la|en)> ([-'"\\w].*)$""", re.UNICODE)
# "<lang>= text" - glossary; ``\w`` already includes digits, so no ``\d``.
TRANSL_GLOS_RE = re.compile(u"^(ru|uk|la|en)= ([-\\w].*)$", re.UNICODE)
# "<keyword>: text" lines; each captures the text after the keyword.
TOPIC_RE = re.compile(u"^topic: (\\w.*)$", re.UNICODE)
SYN_RE = re.compile(u"^syn: (\\w.*)$", re.UNICODE)
ANT_RE = re.compile(u"^ant: (\\w.*)$", re.UNICODE)
REL_RE = re.compile(u"^rel: (\\w.*)$", re.UNICODE)
HYPER_RE = re.compile(u"^hyper: (\\w.*)$", re.UNICODE)
HYPO_RE = re.compile(u"^hypo: (\\w.*)$", re.UNICODE)

# Continuation: any line that starts with at least one space.
CONT_RE = re.compile(u"^ +(.*)", re.UNICODE)

# Whitespace at the end of a line (``\s`` also matches "\n").
TRAILING_SPACES_RE = re.compile(u"\\s+$", re.UNICODE)

# Prelude (dictionary header) fields; each captures the rest of the line.
PRELUDE_NAME_RE = re.compile(u"^name: (.*)", re.UNICODE)
PRELUDE_URL_RE = re.compile(u"^url: (.*)", re.UNICODE)
PRELUDE_AUTHOR_RE = re.compile(u"^by: (.*)", re.UNICODE)
PRELUDE_LICENSE_RE = re.compile(u"^term: (.*)", re.UNICODE)
# The space after "about:" is optional, so the captured text may be empty.
PRELUDE_ABOUT_RE = re.compile(u"^about: ?(.*)", re.UNICODE)
154 |
154 |
155 def __init__(self): |
155 def __init__(self): |
156 pass |
156 pass |
157 |
157 |
def readline(self):
    """Advance to the next non-comment line of ``self.stream``.

    On return ``self.line`` holds the line without its trailing newline,
    ``self.eof`` tells whether the stream was exhausted (``self.line`` is
    then empty) and ``self.lineno`` has been incremented once for every
    line actually read, skipped comment lines included.

    Raises:
        ParseException: if a line read ends with whitespace.
    """
    while True:
        raw = self.stream.readline()
        # An empty read (not even a newline) is treated as end of input.
        self.eof = len(raw) == 0
        if not self.eof:
            self.lineno += 1
        # Drop the line terminator first: TRAILING_SPACES_RE would
        # otherwise match the newline itself.
        self.line = raw.rstrip('\n')
        if self.TRAILING_SPACES_RE.search(self.line):
            raise ParseException("Trailing spaces detected...\n")
        # Comment lines are skipped silently; anything else is the result.
        if not self.COMMENT_RE.search(self.line):
            break
237 self.parse_translation() |
238 self.parse_translation() |
238 self.dom.append((self.words, self.tran)) |
239 self.dom.append((self.words, self.tran)) |
239 |
240 |
def parse_empty_line(self):
    """Read the next line and require it to be blank.

    Calls ``self.readline()`` and then inspects ``self.eof`` and
    ``self.line``.

    Raises:
        ParseException: if the input ended or the line read is not blank.
    """
    self.readline()
    # A blank line is accepted both with and without its newline, so the
    # check does not depend on whether readline() strips the terminator.
    if self.eof or self.line not in ("", "\n"):
        raise ParseException('"__" delimiter should be followed by empty line...')
244 |
245 |
245 def parse_headlines(self): |
246 def parse_headlines(self): |
246 """Try to match word variations with attributed. Assume that `self.line` on preceding empty line.""" |
247 """Try to match word variations with attributed. Assume that `self.line` on preceding empty line.""" |
247 self.words = [] |
248 self.words = [] |