# Added new articles.
# -*- coding: utf-8 -*-
"""
gadict dictionary format parser.
"""
import sys
import re
from gadict_util import ParseException
class Prelude:
    """Dictionary metainfo structure.

    Attributes:
        name: dictionary name, or None when no name was given.
        about: free-form description text; empty string by default.
        urls: list of URL strings.
        authors: list of author strings.
        licences: list of licence/terms strings.
    """

    def __init__(self):
        self.name = None
        self.about = ""
        # The lists are created per instance on purpose: as class-level
        # attributes they would be shared by every Prelude object, so
        # appending to one prelude would silently modify all the others.
        self.urls = []
        self.authors = []
        self.licences = []
class Headword:
    """A single headword of an article together with its optional details.

    Attributes:
        headword: the word (or phrase) itself.
        pron: pronunciation, or None.
        attrs: attributes attached to the headword, or None.
        homo: homophones of the headword, or None.
    """

    def __init__(self, headword, pron = None, attrs = None, homo = None):
        """Store the constructor arguments as-is on the instance."""
        self.headword, self.pron = headword, pron
        self.attrs, self.homo = attrs, homo

    def __str__(self):
        """Return the bare headword."""
        return self.headword

    def __repr__(self):
        """Return a short debugging representation: ``<Headword WORD>``."""
        template = "<Headword {}>"
        return template.format(self.headword)
class Sense:
    """One sense of an article: a part of speech plus optional details.

    Every ``*_list`` attribute is either None (nothing recorded yet) or a
    list that the matching ``add_*`` method appends to.  ``tr_list`` items
    are expected to be ``(lang, text)`` pairs, because ``__str__`` unpacks
    the first one that way.

    Raises:
        ParseException: from the constructor when ``pos`` is empty.
    """

    # Text types accepted by set_countable().  `unicode` exists only on
    # Python 2; referencing it unguarded raises NameError on Python 3.
    try:
        _TEXT_TYPES = (str, unicode)  # noqa: F821
    except NameError:
        _TEXT_TYPES = (str,)

    def __init__(self, pos, tr_list = None, ex_list = None, glos_list = None, ant_list = None, syn_list = None, rel_list = None, topic_list = None, hyper_list = None, hypo_list = None, col_list = None, countable = None):
        if not pos:
            raise ParseException("Part of speech expected...\n")
        self.pos = pos
        self.tr_list = tr_list
        self.ex_list = ex_list
        self.glos_list = glos_list
        self.ant_list = ant_list
        self.syn_list = syn_list
        self.rel_list = rel_list
        self.topic_list = topic_list
        self.hyper_list = hyper_list
        self.hypo_list = hypo_list
        self.col_list = col_list
        self.countable = countable

    def _append(self, attr, item):
        """Append ``item`` to the list stored under ``attr``.

        A missing (None) or empty list is replaced by a fresh one-element
        list, which is exactly what each hand-written ``add_*`` method did.
        """
        current = getattr(self, attr)
        if current:
            current.append(item)
        else:
            setattr(self, attr, [item])

    def add_tr(self, tr):
        """Record a translation."""
        self._append("tr_list", tr)

    def add_ex(self, ex):
        """Record a usage example."""
        self._append("ex_list", ex)

    def add_glos(self, glos):
        """Record a glossary entry (explanation)."""
        self._append("glos_list", glos)

    def add_ant(self, ant):
        """Record an antonym."""
        self._append("ant_list", ant)

    def add_syn(self, syn):
        """Record a synonym."""
        self._append("syn_list", syn)

    def add_rel(self, rel):
        """Record a related term."""
        self._append("rel_list", rel)

    def add_topic(self, topic):
        """Record a topic."""
        self._append("topic_list", topic)

    def add_hyper(self, hyper):
        """Record a hypernym."""
        self._append("hyper_list", hyper)

    def add_hypo(self, hypo):
        """Record a hyponym."""
        self._append("hypo_list", hypo)

    def add_col(self, col):
        """Record a collocation."""
        self._append("col_list", col)

    def set_countable(self, countable):
        """Set the countability flag.

        Args:
            countable: the string ``'yes'`` or ``'no'``, or a bool.

        Raises:
            ParseException: for any other string or any other type.
        """
        if isinstance(countable, self._TEXT_TYPES):
            if countable == 'yes':
                self.countable = True
            elif countable == 'no':
                self.countable = False
            else:
                raise ParseException("Countable can only be yes/no.")
        elif isinstance(countable, bool):
            self.countable = countable
        else:
            raise ParseException("Countable can only be yes/no or bool.")

    def __str__(self):
        """Return ``"lang: text"`` of the first translation, if there is one."""
        # The attribute must be read through `self`; a bare `tr_list` is an
        # undefined name and made every str()/repr() call raise NameError.
        if self.tr_list:
            (lang, text) = self.tr_list[0]
            return "{}: {}".format(lang, text)
        return "<empy sense>"

    def __repr__(self):
        return "<Sense {}>".format(str(self))
class Parser:
    """gadict dictionary format parser.

    ``parse(stream)`` reads a prelude followed by articles separated by
    ``__`` lines and returns a list: the ``Prelude`` object first, then one
    ``(headwords, senses)`` tuple per article.

    The parser is line oriented.  ``readline()`` stores the current line in
    ``self.line`` and every ``parse_*`` method documents which line it
    expects to be current when it is called.
    """

    COMMENT_RE = re.compile("^# ")

    SEPARATOR_RE = re.compile(u"^__$", re.UNICODE)
    HEADWORD_RE = re.compile( u"^(\\w.*)$" )
    HEADWORD_VAR_RE = re.compile(u"^ +(rare|s|pl|v[123]|male|female|baby|comp|super|abbr|Am|Br|Au)$", re.UNICODE)
    HEADWORD_PRON_RE = re.compile(u"^ +\\[([a-zˌˈːəæɛɒʊɪɔɜɑʌɚɐɹʃʧθðɡʒŋɾʔ ]+)\\]$", re.UNICODE)
    HEADWORD_HOMO_RE = re.compile(u"^ +homo: (\\w|\\w[-'\\w ;]*\\w)$", re.UNICODE)
    TRANSL_POS_RE = re.compile(u"^(?:n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\\.v|contr|abbr|prefix)$", re.UNICODE)
    TRANSL_RE = re.compile(u"""^(ru|uk|la|en): ([- .,;/'"?!()0-9²A-Za-zА-Яа-яЄєІіЇїҐґ]+)$""", re.UNICODE)
    TRANSL_EX_RE = re.compile(u"""^(ru|uk|la|en)> ([-'"\\w].*)$""", re.UNICODE)
    TRANSL_GLOS_RE = re.compile(u"^(ru|uk|la|en)= ([-\\w\\d].*)$", re.UNICODE)
    CNT_RE = re.compile(u"^cnt: (yes|no)$", re.UNICODE)
    TOPIC_RE = re.compile(u"^topic: (\\w.*)$", re.UNICODE)
    SYN_RE = re.compile(u"^syn: (\\w.*)$", re.UNICODE)
    ANT_RE = re.compile(u"^ant: (\\w.*)$", re.UNICODE)
    REL_RE = re.compile(u"^rel: (\\w.*)$", re.UNICODE)
    HYPER_RE = re.compile(u"^hyper: (\\w.*)$", re.UNICODE)
    HYPO_RE = re.compile(u"^hypo: (\\w.*)$", re.UNICODE)
    COL_RE = re.compile(u"^col: (\\w.*)$", re.UNICODE)

    CONT_RE = re.compile(u"^ +(.*)", re.UNICODE)

    TRAILING_SPACES_RE = re.compile(u"\\s+$", re.UNICODE)

    PRELUDE_NAME_RE = re.compile(u"^name: (.*)", re.UNICODE)
    PRELUDE_URL_RE = re.compile(u"^url: (.*)", re.UNICODE)
    PRELUDE_AUTHOR_RE = re.compile(u"^by: (.*)", re.UNICODE)
    PRELUDE_LICENSE_RE = re.compile(u"^term: (.*)", re.UNICODE)
    PRELUDE_ABOUT_RE = re.compile(u"^about: ?(.*)", re.UNICODE)

    # ";"-separated list attributes of a sense, tried in this order:
    # (line pattern, name of the method called on the sense, empty-item error).
    _LIST_ATTRS = (
        (TOPIC_RE, "add_topic", "Empty topic..."),
        (SYN_RE, "add_syn", "Empty synonym..."),
        (ANT_RE, "add_ant", "Empty antonym..."),
        (REL_RE, "add_rel", "Empty relation..."),
        (HYPER_RE, "add_hyper", "Empty hypernym..."),
        (HYPO_RE, "add_hypo", "Empty hyponym..."),
        (COL_RE, "add_col", "Empty collocations..."),
    )

    # "<lang><marker> text" attributes of a sense that may span several
    # lines, tried in this order: (line pattern, method called on the sense).
    _TEXT_ATTRS = (
        (TRANSL_RE, "add_tr"),
        (TRANSL_EX_RE, "add_ex"),
        (TRANSL_GLOS_RE, "add_glos"),
    )

    def __init__(self):
        # Give every piece of parser state a defined value so that the
        # object is usable (e.g. repr() works) before parse() is called.
        # parse() resets the values it needs for each run.
        self.stream = None
        self.line = None
        self.lineno = 0
        self.eof = False
        self.dom = []
        self.words = None
        self.tran = None

    def readline(self):
        """Advance to the next non-comment line of the stream.

        Sets ``self.line`` (without the trailing newline), ``self.eof`` and
        increments ``self.lineno`` for every physical line read, comment
        lines included.  At end of input ``self.line`` is the empty string
        and ``self.eof`` is True.

        Raises:
            ParseException: when the line ends with whitespace.
        """
        while True:
            self.line = self.stream.readline()
            self.eof = len(self.line) == 0
            if not self.eof:
                self.lineno += 1
            self.line = self.line.rstrip('\n')
            # Trailing whitespace is rejected even on comment lines,
            # because this check runs before the comment check.
            if self.TRAILING_SPACES_RE.search(self.line):
                raise ParseException("Traling spaces detected...\n")
            if self.COMMENT_RE.search(self.line):
                continue
            break

    def parse(self, stream):
        """Parse a whole dictionary from ``stream``.

        Args:
            stream: object with a ``readline()`` method returning text lines.

        Returns:
            A list with the prelude first, then one ``(headwords, senses)``
            tuple per article.

        Raises:
            ParseException: re-raised with the current line number and line
                text added to the original message.
        """
        self.lineno = 0
        self.stream = stream
        self.dom = []
        try:
            self.parse_prelude()
            while not self.eof:
                self.parse_article()
        except ParseException as ex:
            # Python 2 has no implicit exception chaining, so the original
            # traceback is printed before it is replaced by the new one.
            if sys.version_info.major == 2:
                import traceback
                traceback.print_exc()
            raise ParseException(ex.msg, self.lineno, self.line)
        return self.dom

    def parse_prelude_continuation(self):
        """Collect the indented or empty lines that continue a prelude field.

        Each continuation line contributes a newline plus its text with the
        leading spaces removed; an empty line contributes a bare newline.
        Stops at end of input or at the first other line, which is left in
        ``self.line`` for the caller.
        """
        parts = []
        while True:
            self.readline()
            if self.eof:
                break
            m = self.CONT_RE.match(self.line)
            if m is not None:
                parts.append("\n" + m.group(1))
            elif len(self.line) == 0:
                parts.append("\n")
            else:
                break
        return "".join(parts)

    def parse_prelude(self):
        """Read dictionary prelude until first "__" delimiter."""
        pre = Prelude()
        while True:
            self.readline()
            if self.eof:
                raise ParseException("There are no articles...")
            m = self.PRELUDE_ABOUT_RE.match(self.line)
            if m:
                pre.about += m.group(1) + self.parse_prelude_continuation()
                if self.eof:
                    raise ParseException("There are no articles...")
                # No `continue` here: the continuation reader has already
                # moved to the next line, which is examined just below.
            if self.SEPARATOR_RE.match(self.line):
                break
            m = self.PRELUDE_NAME_RE.match(self.line)
            if m:
                pre.name = m.group(1)
                continue
            m = self.PRELUDE_URL_RE.match(self.line)
            if m:
                pre.urls.append(m.group(1))
                continue
            m = self.PRELUDE_AUTHOR_RE.match(self.line)
            if m:
                pre.authors.append(m.group(1))
                continue
            m = self.PRELUDE_LICENSE_RE.match(self.line)
            if m:
                pre.licences.append(m.group(1))
                continue
            # Any other line is ignored.
        self.dom.append(pre)

    def parse_article(self):
        """Try to match article until next "__" delimiter. Assume that `self.line` point to "__" delimiter."""
        self.words = None
        self.tran = None
        self.parse_empty_line()
        self.parse_headlines()
        self.parse_translation()
        self.dom.append((self.words, self.tran))

    def parse_empty_line(self):
        """Consume the empty line that must follow a "__" delimiter."""
        self.readline()
        if self.eof or len(self.line) != 0:
            raise ParseException(""""__" delimiter should followed by empty line...""")

    def parse_headlines(self):
        """Try to match word variations with attributes. Assume that `self.line` is on the preceding empty line.

        Fills ``self.words`` with one ``Headword`` per unindented line;
        the indented lines after a headword supply its pronunciation,
        attributes and homophones.  Stops at an empty line or end of input.
        """
        self.words = []
        self.readline()
        if self.eof:
            raise ParseException("""There are no definition after "__" delimiter...""")
        m = self.HEADWORD_RE.match(self.line)
        if m is None:
            raise ParseException("""There are no headword after "__" delimiter...""")
        word = m.group(1)
        pron = None
        attrs = set()
        homo = None
        while True:
            self.readline()
            if self.eof or len(self.line) == 0:
                break
            m = self.HEADWORD_RE.match(self.line)
            if m is not None:
                # A new headword starts: store the finished one and reset
                # the per-headword details.
                self.words.append(Headword(word, pron, attrs, homo = homo))
                word = m.group(1)
                pron = None
                attrs = set()
                homo = None
                continue
            m = self.HEADWORD_PRON_RE.match(self.line)
            if m is not None:
                if pron is not None:
                    raise ParseException("""Pronunciation is redefined...""")
                pron = m.group(1)
                continue
            m = self.HEADWORD_VAR_RE.match(self.line)
            if m is not None:
                attrs.add(m.group(1))
                continue
            m = self.HEADWORD_HOMO_RE.match(self.line)
            if m is not None:
                if homo is not None:
                    raise ParseException("""Homophones are redefined...""")
                homo = [s.strip() for s in m.group(1).split(";")]
                continue
            raise ParseException("""Line is not a headword or translation or headword attribute...""")
        self.words.append(Headword(word, pron, attrs, homo))

    def parse_translation_continuation(self):
        """Collect the indented lines that continue a translation-like field.

        Each continuation line contributes a single space plus its text
        with the leading spaces removed.  Stops at end of input or at the
        first non-indented line, which is left in ``self.line``.
        """
        parts = []
        while True:
            self.readline()
            if self.eof:
                break
            m = self.CONT_RE.match(self.line)
            if m is None:
                break
            parts.append(" " + m.group(1))
        return "".join(parts)

    def _parse_list_attr(self, sense):
        """Handle a ";"-separated list attribute found on the current line.

        Returns True when the line matched one of ``_LIST_ATTRS`` and its
        items were added to ``sense``; False when nothing matched.
        Items before an empty one are already added when the error is raised.
        """
        for regex, adder, empty_msg in self._LIST_ATTRS:
            m = regex.match(self.line)
            if m is None:
                continue
            add = getattr(sense, adder)
            for item in m.group(1).split(";"):
                item = item.strip()
                if len(item) == 0:
                    raise ParseException(empty_msg)
                add(item)
            return True
        return False

    def _parse_text_attr(self, sense):
        """Handle a translation, example or glossary line with continuations.

        Returns True when the line matched one of ``_TEXT_ATTRS``.  In that
        case the continuation reader has already fetched the next line, so
        the caller must not read again before examining ``self.line``.
        """
        for regex, adder in self._TEXT_ATTRS:
            m = regex.match(self.line)
            if m is None:
                continue
            text = m.group(2) + self.parse_translation_continuation()
            getattr(sense, adder)((m.group(1), text))
            return True
        return False

    def parse_translation(self):
        """Read the senses of an article up to the next "__" delimiter or end of input.

        An empty line closes the current sense.  Every sense must start
        with a part of speech marker.  The result is stored in ``self.tran``.
        """
        senses = []
        sense = None
        # False when the next line is already in `self.line`.
        read = True
        while True:
            if read:
                self.readline()
            read = True
            if self.eof or self.SEPARATOR_RE.match(self.line) is not None:
                if sense:
                    senses.append(sense)
                break
            if len(self.line) == 0:
                if sense:
                    senses.append(sense)
                sense = None
                continue
            m = self.TRANSL_POS_RE.match(self.line)
            if m is not None:
                if sense is not None:
                    raise ParseException("""Each translation should have only one part of speech marker...""")
                sense = Sense(m.group(0))
                continue
            if not sense:
                raise ParseException("""Missing part of speech marker...""")
            m = self.CNT_RE.match(self.line)
            if m is not None:
                sense.set_countable(m.group(1))
                continue
            if self._parse_list_attr(sense):
                continue
            if self._parse_text_attr(sense):
                read = False
                continue
            raise ParseException("""Uknown syntax...""")
        self.tran = senses

    def __repr__(self):
        return u"<{:s}; line={:s}>".format(type(self).__name__, repr(self.line))