# Move Python files to a separate directory.
import io
import sys
# import re
import regex
# Command line: <script> INPUT [OUTPUT]
#   INPUT   dictionary source to parse, e.g. "gadict_en-ru+ua.gadict"
#   OUTPUT  file to write the result to; standard output is used when omitted
fgadict = None
fnout = None
if len(sys.argv) >= 2:
    fgadict = sys.argv[1]
if len(sys.argv) >= 3:
    fnout = sys.argv[2]

if fgadict is None:
    # Without an input name io.open(None) would fail with an opaque TypeError;
    # report the expected usage instead.
    sys.exit("Usage: {:s} INPUT [OUTPUT]".format(sys.argv[0]))

# The dictionary source is read line by line as UTF-8 text.
fin = io.open(fgadict, mode='r', buffering=1, encoding="utf-8")
if fnout is None:
    fout = sys.stdout
else:
    # The generated text contains non-ASCII characters, so the encoding is
    # fixed explicitly rather than taken from the current locale.
    fout = io.open(fnout, mode="w", encoding="utf-8")
class ParseException(Exception):
    """Signals that the dictionary source does not follow the expected syntax.

    The description passed to the constructor is stored in ``msg`` and is
    also what ``repr()`` of the exception yields.
    """

    def __init__(self, msg):
        # Keep the human-readable description for later reporting.
        self.msg = msg

    def __repr__(self):
        # The representation is the bare message, without the class name.
        return self.msg
class Parser:
    """Line-oriented parser for the dictionary source format.

    The input consists of a prelude terminated by a line holding only
    ``__``, followed by articles separated by further ``__`` lines.  An
    article is: an empty line, one or more headwords (each optionally
    followed by indented pronunciation and attribute lines), an empty line,
    and then translation senses separated by empty lines.

    ``parse`` returns a list of ``(words, senses)`` tuples where

    * ``words`` maps a headword to ``(pronunciation or None, set of
      attributes)``;
    * ``senses`` is a list of ``(pos or None, translations, examples)``;
      ``translations`` and ``examples`` are lists of ``(lang, text)`` pairs.

    On a syntax error ``parse`` reports to the module-level ``fout`` stream,
    using the module-level ``fgadict`` as the file name in the message.
    """

    SEPARATOR_RE = regex.compile(r"^__$")
    HEADWORD_RE = regex.compile(r"^(\p{L}.*)$")
    HEADWORD_VAR_RE = regex.compile(r"^ +(s|pl|v[123]|male|female|comp|super)$")
    HEADWORD_PRON_RE = regex.compile(r"^ +\[([\p{L}' ]+)\]$")
    # The alternatives are grouped so that "^" and "$" apply to every one of
    # them; ungrouped, any line merely starting with e.g. "n" or "v" would be
    # accepted as a part-of-speech marker.
    TRANSL_POS_RE = regex.compile(r"^(n|pron|adj|v|adv|prep|conj|num|int|phr\.v|abbr)$")
    TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(][\p{L}\p{P}~ ]*)$")
    TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> (\p{L}.*)$")
    TRAILING_SPACES_RE = regex.compile(r"\p{Z}+$")

    def __init__(self):
        # Parsing state; (re)initialised by parse() and readline().
        self.stream = None
        self.line = ""
        self.eof = False
        self.lineno = 0
        self.dom = []
        self.words = None
        self.tran = None

    def readline(self):
        """Read the next line into ``self.line`` and update ``eof``/``lineno``."""
        self.line = self.stream.readline()
        # readline() returns an empty string only at end of input.
        self.eof = len(self.line) == 0
        if not self.eof:
            self.lineno += 1

    def parse(self, stream):
        """Parse ``stream`` and return the list of parsed articles.

        Raises:
            Exception: wrapping the ParseException when the input is
                malformed; a diagnostic with file name, line number and the
                offending line is written to ``fout`` first.
        """
        self.lineno = 0
        self.stream = stream
        self.dom = []
        try:
            self.parse_prelude()
            while not self.eof:
                self.parse_article()
        except ParseException as ex:
            # search(), not match(): the whitespace of interest is at the end
            # of the line, while match() is anchored at its beginning.
            if self.TRAILING_SPACES_RE.search(self.line):
                fout.write("{:s}:{:d}: {:s}".format(fgadict, self.lineno, "Traling spaces detected...\n"))
            fout.write("{:s}:{:d}: {:s}\nLINE: {:s}\n".format(fgadict, self.lineno, str(ex), self.line))
            raise Exception(ex)
        return self.dom

    def parse_prelude(self):
        """Read dictionary prelude until first "__" delimiter."""
        while True:
            self.readline()
            if self.eof:
                raise ParseException("There are no articles...")
            if self.SEPARATOR_RE.match(self.line):
                break

    def parse_article(self):
        """Parse one article and append ``(words, senses)`` to ``self.dom``.

        Assumes that ``self.line`` holds the "__" delimiter.
        """
        self.words = None
        self.tran = None
        self.parse_empty_line()
        self.parse_headlines()
        self.parse_translation()
        self.dom.append((self.words, self.tran))

    def parse_empty_line(self):
        """Consume the line after the "__" delimiter, which must be empty."""
        self.readline()
        # An empty line is one character long: just the line terminator.
        if self.eof or len(self.line) != 1:
            raise ParseException('"__" delimiter should followed by empty line...')

    def parse_headlines(self):
        """Parse headwords with their pronunciation and attribute lines.

        Assumes that ``self.line`` holds the empty line preceding the first
        headword.  Stops at end of input or at the next empty line and stores
        the result in ``self.words``.
        """
        self.words = {}
        self.readline()
        if self.eof:
            raise ParseException('There are no definition after "__" delimiter...')
        m = self.HEADWORD_RE.match(self.line)
        if m is None:
            raise ParseException('There are no headword after "__" delimiter...')
        word = m.group(1)
        pron = None
        attrs = set()
        while True:
            self.readline()
            if self.eof or len(self.line) == 1:
                break
            m = self.HEADWORD_RE.match(self.line)
            if m is not None:
                # A new headword starts: store the one collected so far.
                self.words[word] = (pron, attrs)
                word = m.group(1)
                pron = None
                attrs = set()
                continue
            m = self.HEADWORD_PRON_RE.match(self.line)
            if m is not None:
                if pron is not None:
                    raise ParseException("""Pronunciation is redefined...""")
                pron = m.group(1)
                continue
            m = self.HEADWORD_VAR_RE.match(self.line)
            if m is not None:
                attrs.add(m.group(1))
                continue
            raise ParseException("""Line is not headword or translation or headword attribute...""")
        # Store the last headword of the article.
        self.words[word] = (pron, attrs)

    def parse_translation(self):
        """Parse translation senses up to the next "__" delimiter or end of input.

        An empty line closes the current sense.  The result is stored in
        ``self.tran``.
        """
        senses = []
        pos = None
        tr = []
        ex = []
        while True:
            self.readline()
            if self.eof:
                break
            if self.SEPARATOR_RE.match(self.line) is not None:
                break
            if len(self.line) == 1:
                # Empty line: close the current sense and start a new one.
                senses.append((pos, tr, ex))
                pos = None
                tr = []
                ex = []
                continue
            m = self.TRANSL_POS_RE.match(self.line)
            if m is not None:
                if pos is not None:
                    raise ParseException("""Each translation should have only one part of speech marker...""")
                pos = m.group(1)
                continue
            m = self.TRANSL_RE.match(self.line)
            if m is not None:
                tr.append((m.group(1), m.group(2)))
                continue
            m = self.TRANSL_EX_RE.match(self.line)
            if m is not None:
                ex.append((m.group(1), m.group(2)))
                continue
            raise ParseException("""Uknown syntax...""")
        # A sense not terminated by an empty line is kept only when it has
        # at least one translation.
        if len(tr) > 0:
            senses.append((pos, tr, ex))
        self.tran = senses
# Parse the input and write every article to the output stream.  Both streams
# are released even when parsing or writing fails.
parser = Parser()
try:
    dom = parser.parse(fin)

    # NOTE(review): writing starts at index 1, so the first parsed article is
    # never written. Kept as in the original code — confirm it is intended.
    for words, senses in dom[1:]:
        fout.write("_____\n\n")
        # Article title: all headwords joined together.
        fout.write("; ".join(words.keys()))
        fout.write("\n\n")

        # One entry per headword: the word, then its optional pronunciation
        # and its attributes in sorted order.
        for word, (pron, attrs) in words.items():
            fout.write(" ")
            fout.write(word)
            fout.write("\n")
            if pron is not None:
                fout.write(" [")
                fout.write(pron)
                fout.write("]\n")
            if len(attrs) > 0:
                fout.write(" ")
                fout.write(", ".join(sorted(attrs)))
                fout.write("\n")
        fout.write("\n")

        # One line per sense: optional part-of-speech marker followed by the
        # first "ru" translation, if there is one.
        for pos, trs, _exs in senses:
            fout.write(" ")
            if pos is not None:
                fout.write("⟨")
                fout.write(pos)
                fout.write("⟩ ")
            for lang, tr in trs:
                if lang == "ru":
                    fout.write(tr)
                    break
            fout.write("\n")
finally:
    fin.close()
    # Standard output is not ours to close; a real output file is.
    if fout is not sys.stdout:
        fout.close()