author | Oleksandr Gavenko <gavenkoa@gmail.com> |
Mon, 27 Feb 2023 00:55:27 +0200 | |
changeset 1342 | d6413e1d20b0 |
parent 1223 | d592572cc546 |
permissions | -rw-r--r-- |
1223
d592572cc546
Extracted parser of gadialog files to separate module.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
1 |
import re |
d592572cc546
Extracted parser of gadialog files to separate module.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
2 |
|
d592572cc546
Extracted parser of gadialog files to separate module.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
3 |
from gadict_util import ParseException |
d592572cc546
Extracted parser of gadialog files to separate module.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
4 |
|
d592572cc546
Extracted parser of gadialog files to separate module.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
5 |
class Parser:
    """
    Parser of gadialog files of form:

      # num1
      - sentence1
      - sentence2
      # num2
      - sentence1
      ...

    converting them to map:

      obj.dom[num1] = [sentence1, sentence2, ...]

    Keys of the map are the article numbers as strings (exactly as matched
    by ``NUM_RE``); values are lists of phrases.  A phrase may span several
    lines: every non-blank line that starts neither an article nor a phrase
    is joined to the current phrase with a single space.

    Lines beginning with ``"; "`` are comments and are skipped, as are
    lines that are empty after stripping spaces, tabs and newlines.
    """

    COMMENT_RE = re.compile("^; ")
    NUM_RE = re.compile(u"^# ([1-9][0-9]*)$")
    PHRASE_START_RE = re.compile(u"^- (.*)")

    def __init__(self):
        # All parsing state (stream, lineno, line, eof, num, dom) is
        # (re)initialised by parse() / readline().
        pass

    def readline(self):
        """Advance to the next meaningful line of ``self.stream``.

        Skips comment lines and blank lines.  On return either
        ``self.eof`` is true (``self.line`` is then the empty string) or
        ``self.line`` holds the stripped, non-empty line.
        ``self.lineno`` counts every physical line read, including the
        skipped ones.
        """
        while True:
            self.line = self.stream.readline()
            self.eof = len(self.line) == 0
            if self.eof:
                break
            self.lineno += 1
            # The comment test runs on the raw line, so only an unindented
            # "; " prefix marks a comment.
            if self.COMMENT_RE.search(self.line):
                continue
            self.line = self.line.strip(' \n\t')
            if self.line:
                break

    def parse(self, stream):
        """Parse ``stream`` and return the resulting map.

        :param stream: object with a ``readline()`` method returning text
            lines and an empty string at end of input.
        :returns: dict mapping article number (string) to list of phrases;
            also stored as ``self.dom``.
        :raises ParseException: on an article without phrases or on a
            duplicated article number; re-raised with the current line
            number and line text attached.
        """
        self.lineno = 0
        self.stream = stream
        self.dom = dict()
        self.eof = False
        try:
            self.parse_prelude()
            while not self.eof:
                self.parse_article()
        except ParseException as ex:
            # Imported locally (like ``traceback`` below): the module does
            # not import ``sys`` at top level, so referencing it here used
            # to fail with NameError instead of reporting the parse error.
            import sys
            if sys.version_info.major == 2:
                # Presumably because Python 2 has no implicit exception
                # chaining, the original traceback is printed before the
                # exception is replaced.
                import traceback
                traceback.print_exc()
            # NOTE(review): relies on ParseException exposing ``msg``.
            raise ParseException(ex.msg, self.lineno, self.line)
        return self.dom

    def parse_prelude(self):
        """Skip everything up to the first ``# NUM`` line.

        On success ``self.num`` holds the matched number; if the input ends
        first, ``self.eof`` is true and ``self.num`` is left untouched.
        """
        while True:
            self.readline()
            if self.eof:
                return
            m = self.NUM_RE.match(self.line)
            if m:
                self.num = m.group(1)
                break

    @staticmethod
    def _flush_phrase(phrase_buf, phrases):
        """Append the phrase accumulated in ``phrase_buf`` (if any) to ``phrases``."""
        if phrase_buf:
            phrases.append(" ".join(phrase_buf))

    def parse_article(self):
        """Assume we are at ``# NUM`` line.

        Reads phrases until the next ``# NUM`` line (whose number is stored
        in ``self.num`` for the following call) or end of input, then
        records them in ``self.dom`` under the current article number.

        :raises ParseException: if the article has no phrases or its number
            is already present in ``self.dom``.
        """
        num = self.num
        phrase_buf = []
        phrases = []
        while True:
            self.readline()
            if self.eof:
                self._flush_phrase(phrase_buf, phrases)
                break
            m = self.NUM_RE.match(self.line)
            if m:
                # Next article header: finish this article and remember the
                # header for the next parse_article() call.
                self._flush_phrase(phrase_buf, phrases)
                self.num = m.group(1)
                break
            m = self.PHRASE_START_RE.match(self.line)
            if m:
                self._flush_phrase(phrase_buf, phrases)
                phrase_buf = [m.group(1)]
            else:
                # Continuation line of the current phrase.
                phrase_buf.append(self.line)
        if not phrases:
            raise ParseException("""There are no any phrases...""")
        if num in self.dom:
            raise ParseException("""Conflicting key: {}...""".format(num))
        self.dom[num] = phrases