gadict: comparison py/gadict.py

equal deleted inserted replaced

-:4657b44ad9af
+:2756a6deca7e
+import io
+import sys
+# import re
+import regex
# Command line: gadict.py INPUT [OUTPUT]
#   INPUT  - dictionary source file, read as UTF-8.
#   OUTPUT - optional result file; standard output is used when omitted.
fgadict = None
fnout = None
if len(sys.argv) >= 2:
    fgadict = sys.argv[1]
if len(sys.argv) >= 3:
    fnout = sys.argv[2]

# Without an input file there is nothing to do; stop with a readable message
# instead of letting io.open(None) fail with an opaque TypeError.
if fgadict is None:
    sys.exit("Usage: {:s} INPUT [OUTPUT]".format(sys.argv[0]))

fin = io.open(fgadict, mode='r', buffering=1, encoding="utf-8")
if fnout is None:
    fout = sys.stdout
else:
    # The output carries non-ASCII text (translations, angle brackets), so the
    # encoding is fixed to UTF-8 rather than left to the platform default.
    fout = io.open(fnout, mode="w", encoding="utf-8")
class ParseException(Exception):
    """Error raised when the dictionary source violates the expected syntax.

    The human-readable description is available as ``msg`` and is also what
    ``str()`` and ``repr()`` return.
    """

    def __init__(self, msg):
        # Initialise the base class explicitly so that ``args`` and ``str()``
        # carry the message no matter how the exception was constructed
        # (positionally or with the ``msg=`` keyword).
        super().__init__(msg)
        self.msg = msg

    def __repr__(self):
        return self.msg
class Parser:
    """Line-oriented parser for the dictionary source format.

    The input consists of a prelude followed by articles.  Every article is
    introduced by a ``__`` delimiter line and an empty line, then lists one or
    more headwords (each optionally followed by indented pronunciation and
    attribute lines), an empty line, and finally the translation senses.

    ``parse()`` returns a list with one ``(words, senses)`` tuple per article:

    * ``words`` maps each headword to ``(pron, attrs)`` where ``pron`` is the
      pronunciation string or ``None`` and ``attrs`` is a set of attribute
      markers;
    * ``senses`` is a list of ``(pos, translations, examples)`` where ``pos``
      is the part-of-speech marker or ``None`` and the other two items are
      lists of ``(lang, text)`` tuples.
    """

    SEPARATOR_RE = regex.compile(r"^__$")
    HEADWORD_RE = regex.compile(r"^(\p{L}.*)$")
    HEADWORD_VAR_RE = regex.compile(r"^ +(s|pl|v[123]|male|female|comp|super)$")
    HEADWORD_PRON_RE = regex.compile(r"^ +\[([\p{L}' ]+)\]$")
    # The alternation is grouped so that "^" and "$" apply to every
    # alternative.  Ungrouped, the anchors bind only to the first and last
    # alternatives and any line that merely starts with "n", "v", "adj", ...
    # would be taken for a part-of-speech marker.
    TRANSL_POS_RE = regex.compile(r"^(n|pron|adj|v|adv|prep|conj|num|int|phr\.v|abbr)$")
    TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(][\p{L}\p{P}~ ]*)$")
    TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> (\p{L}.*)$")
    TRAILING_SPACES_RE = regex.compile(r"\p{Z}+$")

    def __init__(self):
        # Parsing state; everything is reset again at the start of parse().
        self.lineno = 0      # number of lines read so far
        self.stream = None   # text stream being parsed
        self.dom = []        # parsed articles
        self.line = ""       # most recently read line, newline included
        self.eof = False     # True once readline() has hit end of input
        self.words = None    # headwords of the article being parsed
        self.tran = None     # senses of the article being parsed

    def readline(self):
        """Read the next line into ``self.line`` and update ``eof``/``lineno``."""
        self.line = self.stream.readline()
        self.eof = len(self.line) == 0
        if not self.eof:
            self.lineno += 1

    def parse(self, stream):
        """Parse ``stream`` and return the list of articles.

        On a syntax error a diagnostic with the file name (module global
        ``fgadict``) and line number is written to the module global ``fout``
        and a plain ``Exception`` wrapping the ``ParseException`` is raised.
        """
        self.lineno = 0
        self.stream = stream
        self.dom = []
        try:
            self.parse_prelude()
            while not self.eof:
                self.parse_article()
        except ParseException as ex:
            # search(), not match(): the pattern describes the END of the line,
            # so it has to be looked for anywhere, not only at position 0.
            if self.TRAILING_SPACES_RE.search(self.line):
                fout.write("{:s}:{:d}: {:s}".format(fgadict, self.lineno, "Traling spaces detected...\n"))
            fout.write("{:s}:{:d}: {:s}\nLINE: {:s}\n".format(fgadict, self.lineno, str(ex), self.line))
            raise Exception(ex)
        return self.dom

    def parse_prelude(self):
        """Skip the dictionary prelude up to and including the first "__" line.

        Raises ``ParseException`` when the input ends before any delimiter.
        """
        while True:
            self.readline()
            if self.eof:
                raise ParseException("There are no articles...")
            if self.SEPARATOR_RE.match(self.line):
                break

    def parse_article(self):
        """Parse one article and append it to ``self.dom``.

        Expects ``self.line`` to hold the "__" delimiter that opens the
        article.
        """
        self.words = None
        self.tran = None
        self.parse_empty_line()
        self.parse_headlines()
        self.parse_translation()
        self.dom.append((self.words, self.tran))

    def parse_empty_line(self):
        """Consume the mandatory empty line that follows a "__" delimiter."""
        self.readline()
        # An empty line consists of the newline character only.
        if self.eof or len(self.line) != 1:
            raise ParseException('"__" delimiter should followed by empty line...')

    def parse_headlines(self):
        """Parse the headword section of an article into ``self.words``.

        Expects ``self.line`` to hold the empty line after the delimiter.
        Reads headwords together with their indented pronunciation and
        attribute lines until an empty line or end of input.
        """
        self.words = {}
        self.readline()
        if self.eof:
            raise ParseException('There are no definition after "__" delimiter...')
        m = self.HEADWORD_RE.match(self.line)
        if m is None:
            raise ParseException('There are no headword after "__" delimiter...')
        word = m.group(1)
        pron = None
        attrs = set()
        while True:
            self.readline()
            if self.eof or len(self.line) == 1:
                break
            m = self.HEADWORD_RE.match(self.line)
            if m is not None:
                # A new headword starts: store the finished one and reset the
                # per-headword state.
                self.words[word] = (pron, attrs)
                word = m.group(1)
                pron = None
                attrs = set()
                continue
            m = self.HEADWORD_PRON_RE.match(self.line)
            if m is not None:
                if pron is not None:
                    raise ParseException("Pronunciation is redefined...")
                pron = m.group(1)
                continue
            m = self.HEADWORD_VAR_RE.match(self.line)
            if m is not None:
                attrs.add(m.group(1))
                continue
            raise ParseException("Line is not headword or translation or headword attribute...")
        # Store the last headword of the section.
        self.words[word] = (pron, attrs)

    def parse_translation(self):
        """Parse the senses of an article into ``self.tran``.

        Reads until the next "__" delimiter or end of input.  An empty line
        closes the current sense; a sense still open at the end is kept only
        when it has at least one translation.
        """
        senses = []
        pos = None
        tr = []
        ex = []
        while True:
            self.readline()
            if self.eof:
                break
            if self.SEPARATOR_RE.match(self.line) is not None:
                break
            if len(self.line) == 1:
                # Empty line: the current sense is complete.
                senses.append((pos, tr, ex))
                pos = None
                tr = []
                ex = []
                continue
            m = self.TRANSL_POS_RE.match(self.line)
            if m is not None:
                if pos is not None:
                    raise ParseException("Each translation should have only one part of speech marker...")
                pos = m.group(1)
                continue
            m = self.TRANSL_RE.match(self.line)
            if m is not None:
                tr.append((m.group(1), m.group(2)))
                continue
            m = self.TRANSL_EX_RE.match(self.line)
            if m is not None:
                ex.append((m.group(1), m.group(2)))
                continue
            raise ParseException("Uknown syntax...")
        if len(tr) > 0:
            senses.append((pos, tr, ex))
        self.tran = senses
# Parse the whole input; the input stream is closed even when parsing fails.
parser = Parser()
try:
    dom = parser.parse(fin)
finally:
    fin.close()

# Write every parsed article.  The parser stores no prelude entry in the
# result, so iteration starts with the very first element.
for words, senses in dom:
    fout.write("_____\n\n")
    # Article title: all headwords joined together.
    fout.write("; ".join(words))
    fout.write("\n\n")
    for word, (pron, attrs) in words.items():
        fout.write("  ")
        fout.write(word)
        fout.write("\n")
        if pron is not None:
            fout.write("    [")
            fout.write(pron)
            fout.write("]\n")
        if attrs:
            # Attributes are kept in a set; sort them for a stable output.
            fout.write("    ")
            fout.write(", ".join(sorted(attrs)))
            fout.write("\n")
    fout.write("\n")
    for pos, trs, _exs in senses:
        fout.write("  ")
        if pos is not None:
            fout.write("⟨")
            fout.write(pos)
            fout.write("⟩ ")
        # Only the first Russian translation of a sense is written.
        for lang, tr in trs:
            if lang == "ru":
                fout.write(tr)
                break
        fout.write("\n")

# Flush and release the output file; standard output is left open.
if fout is not sys.stdout:
    fout.close()

changeset 393	2756a6deca7e
parent 385	18284ce77c7a
child 394	4d45194c71b6