Add dictionary logos. GoldenDict supports the PNG format and by default displays
16x16 images in the "Result navigation pane" and the "Dictionary bar".
"""
gadict dictionary format parser.
"""
import sys
import regex
class Prelude:
    """Dictionary metainfo structure.

    Attributes:
        name: dictionary name, ``None`` until assigned.
        about: free-form description text, empty by default.
        urls: list of URL strings.
        authors: list of author strings.
        licences: list of licence strings.
    """

    def __init__(self):
        self.name = None
        self.about = ""
        # The lists are created per instance.  As class-level attributes they
        # would be shared by every Prelude object, so values appended while
        # handling one dictionary would leak into the next one.
        self.urls = []
        self.authors = []
        self.licences = []
class ParseException(BaseException):
    """Error describing a problem found in the dictionary source.

    Attributes:
        msg: description of the problem.
        lineno: number of the offending line, or ``None`` when not known.
        line: text of the offending line, or ``None`` when not known.
    """

    def __init__(self, msg, lineno=None, line=None):
        # Forward the arguments to the base class so that ``args`` is
        # populated; an exception with empty ``args`` cannot be re-created by
        # copy/pickle because ``msg`` is a required parameter.
        BaseException.__init__(self, msg, lineno, line)
        self.msg = msg
        self.lineno = lineno
        self.line = line

    def __repr__(self):
        """Return the message, prefixed by the line number when it is known."""
        if self.lineno is None:
            return self.msg
        if self.line is None:
            return u":{:d}:{:s}".format(self.lineno, self.msg)
        return u":{:d}: {:s}\nLINE: {:s}".format(self.lineno, self.msg, self.line)

    # str() is what tracebacks and logging display; without this alias the
    # exception would be printed with an empty message.
    __str__ = __repr__
class Headword:
    """A headword together with its optional pronunciation and attributes.

    Attributes:
        headword: the word itself; also the ``str()`` value of the object.
        pron: pronunciation, or ``None`` when not given.
        attrs: attributes attached to the word, or ``None`` when not given.
    """

    def __init__(self, headword, pron = None, attrs = None):
        self.headword, self.pron, self.attrs = headword, pron, attrs

    def __repr__(self):
        template = "<Headword {}>"
        return template.format(self.headword)

    def __str__(self):
        return self.headword
class Sense:
    """One sense of an article: a part of speech plus optional item lists.

    Every ``*_list`` attribute is either ``None`` or a list; the matching
    ``add_*`` method appends to it, creating the list on first use.  Items of
    ``tr_list`` are expected to be ``(lang, text)`` pairs, which is how
    ``__str__`` unpacks them.
    """

    def __init__(self, pos, tr_list = None, ex_list = None, glos_list = None,
                 ant_list = None, syn_list = None, rel_list = None,
                 topic_list = None, hyper_list = None, hypo_list = None):
        """Store the part of speech and the initial lists.

        Raises:
            ParseException: if ``pos`` is empty or ``None``.
        """
        if not pos:
            raise ParseException("Part of speech expected...\n")
        self.pos = pos
        self.tr_list = tr_list
        self.ex_list = ex_list
        self.glos_list = glos_list
        self.ant_list = ant_list
        self.syn_list = syn_list
        self.rel_list = rel_list
        self.topic_list = topic_list
        self.hyper_list = hyper_list
        self.hypo_list = hypo_list

    def _add(self, attr, item):
        """Append ``item`` to the list held in ``attr``.

        A missing (``None``) or empty list is replaced by a fresh one-element
        list, exactly as each ``add_*`` method used to do on its own.
        """
        current = getattr(self, attr)
        if current:
            current.append(item)
        else:
            setattr(self, attr, [item])

    def add_tr(self, tr):
        """Add an item to ``tr_list``."""
        self._add("tr_list", tr)

    def add_ex(self, ex):
        """Add an item to ``ex_list``."""
        self._add("ex_list", ex)

    def add_glos(self, glos):
        """Add an item to ``glos_list``."""
        self._add("glos_list", glos)

    def add_ant(self, ant):
        """Add an item to ``ant_list``."""
        self._add("ant_list", ant)

    def add_syn(self, syn):
        """Add an item to ``syn_list``."""
        self._add("syn_list", syn)

    def add_rel(self, rel):
        """Add an item to ``rel_list``."""
        self._add("rel_list", rel)

    def add_topic(self, topic):
        """Add an item to ``topic_list``."""
        self._add("topic_list", topic)

    def add_hyper(self, hyper):
        """Add an item to ``hyper_list``."""
        self._add("hyper_list", hyper)

    def add_hypo(self, hypo):
        """Add an item to ``hypo_list``."""
        self._add("hypo_list", hypo)

    def __str__(self):
        """Return ``"lang: text"`` of the first ``tr_list`` item, if any."""
        # The attribute must be read through ``self``; a bare ``tr_list`` is
        # an undefined name and made every str()/repr() call fail.
        if self.tr_list:
            (lang, text) = self.tr_list[0]
            return "{}: {}".format(lang, text)
        return "<empty sense>"

    def __repr__(self):
        return "<Sense {}>".format(str(self))
class Parser:
    """gadict dictionary format parser.

    ``parse(stream)`` reads the dictionary line by line and returns a list
    whose first element is a ``Prelude`` object and whose remaining elements
    are ``(headwords, senses)`` tuples, one per article.

    While parsing, the object keeps the current line in ``self.line``, its
    number in ``self.lineno`` and the end-of-input flag in ``self.eof``.
    """

    COMMENT_RE = regex.compile(r"^# ")

    SEPARATOR_RE = regex.compile(u"^__$")
    HEADWORD_RE = regex.compile(u"^(\\p{L}.*)$")
    HEADWORD_VAR_RE = regex.compile(u"^ +(s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$")
    # The backslash before "p" is doubled: "\p" is not a valid string escape
    # and triggers a warning on modern Python.  The pattern text is unchanged.
    HEADWORD_PRON_RE = regex.compile(u"^ +\\[([\\p{L}' ]+)\\]$")
    TRANSL_POS_RE = regex.compile(u"^(?:n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\\.v|contr|abbr|prefix)$")
    TRANSL_RE = regex.compile(u"^(ru|uk|la|en): ([\\p{L}(].*)$")
    TRANSL_EX_RE = regex.compile(u"""^(ru|uk|la|en)> ([-'"\\p{L}].*)$""")
    TRANSL_GLOS_RE = regex.compile(u"^(ru|uk|la|en)= ([-\\p{L}].*)$")
    TOPIC_RE = regex.compile(u"^topic: (\\p{L}.*)$")
    SYN_RE = regex.compile(u"^syn: (\\p{L}.*)$")
    ANT_RE = regex.compile(u"^ant: (\\p{L}.*)$")
    REL_RE = regex.compile(u"^rel: (\\p{L}.*)$")
    HYPER_RE = regex.compile(u"^hyper: (\\p{L}.*)$")
    HYPO_RE = regex.compile(u"^hypo: (\\p{L}.*)$")

    CONT_RE = regex.compile(u"^ +(.*)")

    TRAILING_SPACES_RE = regex.compile(u"\\p{Z}+$")

    PRELUDE_NAME_RE = regex.compile(u"^name: (.*)")
    PRELUDE_URL_RE = regex.compile(u"^url: (.*)")
    PRELUDE_AUTHOR_RE = regex.compile(u"^by: (.*)")
    PRELUDE_LICENSE_RE = regex.compile(u"^term: (.*)")
    PRELUDE_ABOUT_RE = regex.compile(u"^about: ?(.*)")

    # Sense lines holding a ";"-separated list:
    # (pattern attribute, Sense method, error message for an empty item).
    # Patterns are looked up by name so that a subclass may override them.
    _LIST_FIELDS = (
        ("TOPIC_RE", "add_topic", """Empty topic..."""),
        ("SYN_RE", "add_syn", """Empty synonym..."""),
        ("ANT_RE", "add_ant", """Empty antonym..."""),
        ("REL_RE", "add_rel", """Empty relation..."""),
        ("HYPER_RE", "add_hyper", """Empty hypernym..."""),
        ("HYPO_RE", "add_hypo", """Empty hyponym..."""),
    )

    # Sense lines holding "<lang><mark> <text>" that may be continued on the
    # following indented lines: (pattern attribute, Sense method).
    _TEXT_FIELDS = (
        ("TRANSL_RE", "add_tr"),
        ("TRANSL_EX_RE", "add_ex"),
        ("TRANSL_GLOS_RE", "add_glos"),
    )

    def __init__(self):
        # Parsing state.  It is reset by parse(); defining it here keeps
        # repr() usable on an object that has not parsed anything yet.
        self.stream = None
        self.line = None
        self.lineno = 0
        self.eof = False
        self.dom = []
        self.words = None
        self.tran = None

    def readline(self):
        """Read the next line that is not a comment into ``self.line``.

        Sets ``self.eof`` when the stream returns an empty string and counts
        every line read (comments included) in ``self.lineno``.

        Raises:
            ParseException: if the line ends with separator characters.  The
                check runs before the comment test, so it covers comments too.
        """
        while True:
            self.line = self.stream.readline()
            self.eof = len(self.line) == 0
            if not self.eof:
                self.lineno += 1
            if self.TRAILING_SPACES_RE.search(self.line):
                raise ParseException("Traling spaces detected...\n")
            if not self.COMMENT_RE.search(self.line):
                return

    def parse(self, stream):
        """Parse a whole dictionary from ``stream`` and return its DOM.

        Args:
            stream: object with a ``readline()`` method returning text lines
                and an empty string at end of input.

        Returns:
            List with a ``Prelude`` followed by ``(headwords, senses)`` tuples.

        Raises:
            ParseException: on a syntax error; the exception carries the
                number and the text of the line being processed.
        """
        self.lineno = 0
        self.stream = stream
        self.dom = []
        self.line = None
        self.eof = False
        try:
            self.parse_prelude()
            while not self.eof:
                self.parse_article()
        except ParseException as ex:
            if sys.version_info.major == 2:
                import traceback
                traceback.print_exc()
            # Re-raise with the position attached: the inner code raises
            # exceptions that carry only a message.
            raise ParseException(ex.msg, self.lineno, self.line)
        return self.dom

    def parse_prelude_continuation(self):
        """Collect the lines continuing a prelude ``about:`` entry.

        Indented lines contribute their text and empty lines contribute a
        bare line break; each piece is prefixed with a newline.  Reading
        stops at end of input or at the first other line, which is left in
        ``self.line`` for the caller.
        """
        pieces = []
        while True:
            self.readline()
            if self.eof:
                break
            m = self.CONT_RE.match(self.line)
            if m is not None:
                pieces.append("\n" + m.group(1))
            elif len(self.line) == 1:
                pieces.append("\n")
            else:
                break
        return "".join(pieces)

    def parse_prelude(self):
        """Read dictionary prelude until first "__" delimiter.

        Recognised entries are stored in a ``Prelude`` object that is
        appended to ``self.dom``; lines matching no entry are ignored.

        Raises:
            ParseException: if input ends before a "__" delimiter is found.
        """
        pre = Prelude()
        list_fields = (
            (self.PRELUDE_URL_RE, pre.urls),
            (self.PRELUDE_AUTHOR_RE, pre.authors),
            (self.PRELUDE_LICENSE_RE, pre.licences),
        )
        self.readline()
        while True:
            if self.eof:
                raise ParseException("There are no articles...")
            if self.SEPARATOR_RE.match(self.line):
                break
            m = self.PRELUDE_ABOUT_RE.match(self.line)
            if m:
                pre.about += m.group(1) + self.parse_prelude_continuation()
                # The continuation reader has already fetched the next
                # unprocessed line; dispatch it from the top so that no
                # entry (another "about:" included) is skipped.
                continue
            m = self.PRELUDE_NAME_RE.match(self.line)
            if m:
                pre.name = m.group(1)
            else:
                for pattern, values in list_fields:
                    m = pattern.match(self.line)
                    if m:
                        values.append(m.group(1))
                        break
            self.readline()
        self.dom.append(pre)

    def parse_article(self):
        """Try to match article until next "__" delimiter.

        Assume that `self.line` point to "__" delimiter.  The parsed
        ``(headwords, senses)`` tuple is appended to ``self.dom``.
        """
        self.words = None
        self.tran = None
        self.parse_empty_line()
        self.parse_headlines()
        self.parse_translation()
        self.dom.append((self.words, self.tran))

    def parse_empty_line(self):
        """Consume the empty line that must follow a "__" delimiter.

        Raises:
            ParseException: if input ended or the line is not empty.
        """
        self.readline()
        if self.eof or len(self.line) != 1:
            raise ParseException(""""__" delimiter should followed by empty line...""")

    def parse_headlines(self):
        """Try to match word variations with attributed.

        Assume that `self.line` on preceding empty line.  Reads headwords,
        each optionally followed by indented pronunciation and attribute
        lines, until an empty line or end of input, and stores the resulting
        ``Headword`` objects in ``self.words``.

        Raises:
            ParseException: if no headword follows the delimiter, a
                pronunciation is given twice for one headword, or a line has
                an unrecognised form.
        """
        self.words = []
        self.readline()
        if self.eof:
            raise ParseException("""There are no definition after "__" delimiter...""")
        m = self.HEADWORD_RE.match(self.line)
        if m is None:
            raise ParseException("""There are no headword after "__" delimiter...""")
        word = m.group(1)
        pron = None
        attrs = set()
        while True:
            self.readline()
            if self.eof or len(self.line) == 1:
                break
            m = self.HEADWORD_RE.match(self.line)
            if m is not None:
                # A new headword starts: store the one collected so far.
                self.words.append(Headword(word, pron, attrs))
                word = m.group(1)
                pron = None
                attrs = set()
                continue
            m = self.HEADWORD_PRON_RE.match(self.line)
            if m is not None:
                if pron is not None:
                    raise ParseException("""Pronunciation is redefined...""")
                pron = m.group(1)
                continue
            m = self.HEADWORD_VAR_RE.match(self.line)
            if m is not None:
                attrs.add(m.group(1))
                continue
            raise ParseException("""Line is not a headword or translation or headword attribute...""")
        self.words.append(Headword(word, pron, attrs))

    def parse_translation_continuation(self):
        """Collect the indented lines continuing a translation entry.

        Each continuation line contributes its text prefixed with a single
        space.  Reading stops at end of input or at the first line that is
        not indented; that line is left in ``self.line`` for the caller.
        """
        pieces = []
        while True:
            self.readline()
            if self.eof:
                break
            m = self.CONT_RE.match(self.line)
            if m is None:
                break
            pieces.append(" " + m.group(1))
        return "".join(pieces)

    def _parse_list_field(self, sense):
        """Handle a ";"-separated list line; return True if the line was one."""
        for re_name, adder_name, empty_msg in self._LIST_FIELDS:
            m = getattr(self, re_name).match(self.line)
            if m is None:
                continue
            add = getattr(sense, adder_name)
            for item in m.group(1).split(";"):
                item = item.strip()
                if len(item) == 0:
                    raise ParseException(empty_msg)
                add(item)
            return True
        return False

    def _parse_text_field(self, sense):
        """Handle a translation/example/glossary line; return True if matched.

        On a match the continuation lines are consumed as well, so
        ``self.line`` already holds the next unprocessed line.
        """
        for re_name, adder_name in self._TEXT_FIELDS:
            m = getattr(self, re_name).match(self.line)
            if m is not None:
                add = getattr(sense, adder_name)
                add((m.group(1), m.group(2) + self.parse_translation_continuation()))
                return True
        return False

    def parse_translation(self):
        """Parse the senses of an article up to the next "__" or end of input.

        A sense starts with a part-of-speech line and ends at an empty line.
        The list of ``Sense`` objects is stored in ``self.tran``.

        Raises:
            ParseException: on a second part-of-speech line within a sense,
                on content before a part-of-speech line, on an empty list
                item, or on an unrecognised line.
        """
        senses = []
        sense = None
        read = True
        while True:
            # ``read`` is False when the previous step already fetched the
            # line that has to be examined now.
            if read:
                self.readline()
            read = True
            if self.eof or self.SEPARATOR_RE.match(self.line) is not None:
                break
            if len(self.line) == 1:
                if sense is not None:
                    senses.append(sense)
                sense = None
                continue
            m = self.TRANSL_POS_RE.match(self.line)
            if m is not None:
                if sense is not None:
                    raise ParseException("""Each translation should have only one part of speech marker...""")
                sense = Sense(m.group(0))
                continue
            if sense is None:
                raise ParseException("""Missing part of speech marker...""")
            if self._parse_list_field(sense):
                continue
            if self._parse_text_field(sense):
                read = False
                continue
            raise ParseException("""Uknown syntax...""")
        # Flush the sense still being built.  This also covers input that
        # ends without a trailing empty line, where the last sense would
        # otherwise be lost.
        if sense is not None:
            senses.append(sense)
        self.tran = senses

    def __repr__(self):
        return u"<{:s}; line={:s}>".format(type(self).__name__, repr(self.line))