py/gadict.py
author Oleksandr Gavenko <gavenkoa@gmail.com>
Wed, 02 Nov 2016 21:16:39 +0200
changeset 624 934a2a6cbe27
parent 618 6ad7203ac9dc
child 629 6a862ea41c00
permissions -rw-r--r--
Add dictionaries logo. GoldenDict supports the PNG format and by default displays 16x16 images in the "Result navigation pane" and "Dictionary bar".

"""
gadict dictionary format parser.
"""

import sys
import regex


class Prelude:
    """Dictionary metainfo structure.

    Attributes:
        name: dictionary name, or None when not given.
        about: free-form description text ("" when not given).
        urls: list of URL strings.
        authors: list of author strings.
        licences: list of licence/terms strings.
    """

    def __init__(self):
        # Every instance gets its own containers.  Declaring the lists at
        # class level would make all instances share (and accumulate into)
        # the very same list objects.
        self.name = None
        self.about = ""
        self.urls = []
        self.authors = []
        self.licences = []


class ParseException(Exception):
    """Error raised when the dictionary source can not be parsed.

    Derives from ``Exception`` (not ``BaseException``) so that generic
    ``except Exception`` handlers treat it like any other application error.

    Attributes:
        msg: human readable description of the problem.
        lineno: number of the offending line, or None when unknown.
        line: text of the offending line, or None when unknown.
    """

    def __init__(self, msg, lineno=None, line=None):
        # Hand the message to the base class so that ``args`` is populated.
        super(ParseException, self).__init__(msg)
        self.msg = msg
        self.lineno = lineno
        self.line = line

    def __str__(self):
        """Return the message, prefixed with position information when known."""
        if self.lineno is None:
            return self.msg
        if self.line is None:
            return u":{:d}:{:s}".format(self.lineno, self.msg)
        return u":{:d}: {:s}\nLINE: {:s}".format(self.lineno, self.msg, self.line)

    def __repr__(self):
        """Return the same text as ``__str__`` (callers print ``repr(ex)``)."""
        return self.__str__()

class Headword:
    """One headword of an article.

    Attributes:
        headword: the word itself.
        pron: pronunciation, or None when not given.
        attrs: attributes attached to the headword, or None when not given.
    """

    def __init__(self, headword, pron=None, attrs=None):
        self.headword, self.pron, self.attrs = headword, pron, attrs

    def __str__(self):
        """Return the bare headword."""
        return self.headword

    def __repr__(self):
        """Return a short debugging representation."""
        template = "<Headword {}>"
        return template.format(self.headword)

class Sense:
    """One sense (meaning) of an article, tagged with a part of speech.

    Every ``*_list`` attribute is either None (nothing recorded) or a list.
    ``tr_list``, ``ex_list`` and ``glos_list`` are filled by the parser with
    ``(language, text)`` tuples; the remaining lists hold plain strings.
    """

    def __init__(self, pos, tr_list = None, ex_list = None, glos_list = None, ant_list = None, syn_list = None, rel_list = None, topic_list = None, hyper_list = None, hypo_list = None):
        """Create a sense.

        Raises:
            ParseException: when ``pos`` is empty or None.
        """
        if not pos:
            raise ParseException("Part of speech expected...\n")
        self.pos = pos
        self.tr_list = tr_list
        self.ex_list = ex_list
        self.glos_list = glos_list
        self.ant_list = ant_list
        self.syn_list = syn_list
        self.rel_list = rel_list
        self.topic_list = topic_list
        self.hyper_list = hyper_list
        self.hypo_list = hypo_list

    def _add(self, attr, item):
        """Append ``item`` to the list stored in attribute ``attr``.

        A missing (None) or empty list is replaced by a fresh one-item list.
        """
        items = getattr(self, attr)
        if items:
            items.append(item)
        else:
            setattr(self, attr, [item])

    def add_tr(self, tr):
        """Record a translation."""
        self._add("tr_list", tr)

    def add_ex(self, ex):
        """Record a usage example."""
        self._add("ex_list", ex)

    def add_glos(self, glos):
        """Record a glossary entry."""
        self._add("glos_list", glos)

    def add_ant(self, ant):
        """Record an antonym."""
        self._add("ant_list", ant)

    def add_syn(self, syn):
        """Record a synonym."""
        self._add("syn_list", syn)

    def add_rel(self, rel):
        """Record a related word."""
        self._add("rel_list", rel)

    def add_topic(self, topic):
        """Record a topic."""
        self._add("topic_list", topic)

    def add_hyper(self, hyper):
        """Record a hypernym."""
        self._add("hyper_list", hyper)

    def add_hypo(self, hypo):
        """Record a hyponym."""
        self._add("hypo_list", hypo)

    def __str__(self):
        """Return "lang: text" of the first translation, or a placeholder."""
        # The list lives on the instance; a bare ``tr_list`` is an undefined
        # name and would raise NameError.
        if self.tr_list:
            (lang, text) = self.tr_list[0]
            return "{}: {}".format(lang, text)
        return "<empy sense>"

    def __repr__(self):
        """Return a short debugging representation."""
        return "<Sense {}>".format(str(self))

class Parser:
    """gadict dictionary format parser.

    The input is a line oriented stream: a prelude with dictionary metainfo
    followed by articles, each introduced by a "__" delimiter line.  Lines
    starting with "# " are comments and are skipped.

    ``parse`` returns a list whose first item is a ``Prelude`` and whose
    remaining items are ``(headwords, senses)`` tuples, one per article.
    """

    COMMENT_RE = regex.compile(r"^# ")

    SEPARATOR_RE = regex.compile(u"^__$")
    HEADWORD_RE = regex.compile( u"^(\\p{L}.*)$" )
    HEADWORD_VAR_RE = regex.compile(u"^ +(s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$")
    # NOTE: "\\p" (not "\p") - the latter is an invalid string escape.
    HEADWORD_PRON_RE = regex.compile(u"^ +\\[([\\p{L}' ]+)\\]$")
    TRANSL_POS_RE = regex.compile(u"^(?:n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\\.v|contr|abbr|prefix)$")
    TRANSL_RE = regex.compile(u"^(ru|uk|la|en): ([\\p{L}(].*)$")
    TRANSL_EX_RE = regex.compile(u"""^(ru|uk|la|en)> ([-'"\\p{L}].*)$""")
    TRANSL_GLOS_RE = regex.compile(u"^(ru|uk|la|en)= ([-\\p{L}].*)$")
    TOPIC_RE = regex.compile(u"^topic: (\\p{L}.*)$")
    SYN_RE = regex.compile(u"^syn: (\\p{L}.*)$")
    ANT_RE = regex.compile(u"^ant: (\\p{L}.*)$")
    REL_RE = regex.compile(u"^rel: (\\p{L}.*)$")
    HYPER_RE = regex.compile(u"^hyper: (\\p{L}.*)$")
    HYPO_RE = regex.compile(u"^hypo: (\\p{L}.*)$")

    CONT_RE = regex.compile(u"^ +(.*)")

    TRAILING_SPACES_RE = regex.compile(u"\\p{Z}+$")

    PRELUDE_NAME_RE = regex.compile(u"^name: (.*)")
    PRELUDE_URL_RE = regex.compile(u"^url: (.*)")
    PRELUDE_AUTHOR_RE = regex.compile(u"^by: (.*)")
    PRELUDE_LICENSE_RE = regex.compile(u"^term: (.*)")
    PRELUDE_ABOUT_RE = regex.compile(u"^about: ?(.*)")

    # Sense fields holding a ";"-separated list of items, tried in this order:
    # (name of the pattern attribute, name of the Sense method that stores
    # one item, error message for an empty item).
    _LIST_FIELDS = (
        ("TOPIC_RE", "add_topic", """Empty topic..."""),
        ("SYN_RE", "add_syn", """Empty synonym..."""),
        ("ANT_RE", "add_ant", """Empty antonym..."""),
        ("REL_RE", "add_rel", """Empty relation..."""),
        ("HYPER_RE", "add_hyper", """Empty hypernym..."""),
        ("HYPO_RE", "add_hypo", """Empty hyponym..."""),
    )

    # Sense fields holding a (language, text) pair whose text may continue on
    # following indented lines, tried in this order:
    # (name of the pattern attribute, name of the Sense method).
    _TEXT_FIELDS = (
        ("TRANSL_RE", "add_tr"),
        ("TRANSL_EX_RE", "add_ex"),
        ("TRANSL_GLOS_RE", "add_glos"),
    )

    def __init__(self):
        # Define all parsing state up front so that the object is usable
        # (e.g. by ``repr``) before the first call to ``parse``.
        self.stream = None
        self.line = None
        self.lineno = 0
        self.eof = False
        self.dom = []
        self.words = None
        self.tran = None

    def readline(self):
        """Read the next non-comment line into ``self.line``.

        Updates ``self.eof`` (True when the stream is exhausted, in which
        case ``self.line`` is empty) and ``self.lineno``.

        Raises:
            ParseException: when a line (including a comment line) ends
                with space characters.
        """
        while True:
            self.line = self.stream.readline()
            self.eof = len(self.line) == 0
            if not self.eof:
                self.lineno += 1
            if self.TRAILING_SPACES_RE.search(self.line):
                raise ParseException("Traling spaces detected...\n")
            if not self.COMMENT_RE.search(self.line):
                break

    def parse(self, stream):
        """Parse a whole dictionary from ``stream``.

        Args:
            stream: object with a ``readline()`` method returning text lines
                and an empty string at end of input.

        Returns:
            List with the ``Prelude`` first, followed by one
            ``(headwords, senses)`` tuple per article.

        Raises:
            ParseException: on malformed input; the exception carries the
                current line number and line text.
        """
        self.lineno = 0
        self.line = None
        self.eof = False
        self.stream = stream
        self.dom = []
        try:
            self.parse_prelude()
            while not self.eof:
                self.parse_article()
        except ParseException as ex:
            if sys.version_info.major == 2:
                import traceback
                traceback.print_exc()
            # Re-raise with position information attached.
            raise ParseException(ex.msg, self.lineno, self.line)
        return self.dom

    def parse_prelude_continuation(self):
        """Collect continuation lines of a multi-line prelude value.

        Indented lines are appended (without their indent), each preceded by
        a newline; empty lines contribute a bare newline.  On return
        ``self.line`` holds the first line that is not part of the value
        (or ``self.eof`` is set).
        """
        string = ""
        while True:
            self.readline()
            if self.eof:
                return string
            m = self.CONT_RE.match(self.line)
            if m is not None:
                string += "\n" + m.group(1)
            elif len(self.line) == 1:
                string += "\n"
            else:
                return string

    def parse_prelude(self):
        """Read dictionary prelude until first "__" delimiter.

        Appends the resulting ``Prelude`` to ``self.dom``.  Lines that match
        none of the known prelude fields are ignored.

        Raises:
            ParseException: when input ends before any "__" delimiter.
        """
        pre = Prelude()
        # Bind fresh lists on the instance so that results never accumulate
        # across parses (guards against list attributes shared at class level).
        pre.urls, pre.authors, pre.licences = [], [], []
        while True:
            self.readline()
            if self.eof:
                raise ParseException("There are no articles...")
            # Reading an "about" value consumes the line following it, and
            # that line may itself start another "about" value - hence a loop.
            m = self.PRELUDE_ABOUT_RE.match(self.line)
            while m:
                pre.about += m.group(1) + self.parse_prelude_continuation()
                if self.eof:
                    raise ParseException("There are no articles...")
                m = self.PRELUDE_ABOUT_RE.match(self.line)
            if self.SEPARATOR_RE.match(self.line):
                break
            m = self.PRELUDE_NAME_RE.match(self.line)
            if m:
                pre.name = m.group(1)
                continue
            m = self.PRELUDE_URL_RE.match(self.line)
            if m:
                pre.urls.append(m.group(1))
                continue
            m = self.PRELUDE_AUTHOR_RE.match(self.line)
            if m:
                pre.authors.append(m.group(1))
                continue
            m = self.PRELUDE_LICENSE_RE.match(self.line)
            if m:
                pre.licences.append(m.group(1))
                continue
        self.dom.append(pre)

    def parse_article(self):
        """Try to match article until next "__" delimiter. Assume that `self.line` point to "__" delimiter."""
        self.words = None
        self.tran = None
        self.parse_empty_line()
        self.parse_headlines()
        self.parse_translation()
        self.dom.append((self.words, self.tran))

    def parse_empty_line(self):
        """Consume the empty line that must follow a "__" delimiter.

        Raises:
            ParseException: when the next line is missing or not empty.
        """
        self.readline()
        if self.eof or len(self.line) != 1:
            raise ParseException(""""__" delimiter should followed by empty line...""")

    def parse_headlines(self):
        """Try to match word variations with attributed. Assume that `self.line` on preceding empty line.

        Stores the list of ``Headword`` objects in ``self.words``.  Reading
        stops at the first empty line or at end of input.

        Raises:
            ParseException: when no headword follows, a pronunciation is
                given twice for one headword, or a line is not recognised.
        """
        self.words = []
        self.readline()
        if self.eof:
            raise ParseException("""There are no definition after "__" delimiter...""")
        m = self.HEADWORD_RE.match(self.line)
        if m is None:
            raise ParseException("""There are no headword after "__" delimiter...""")
        word = m.group(1)
        pron = None
        attrs = set()
        while True:
            self.readline()
            if self.eof or len(self.line) == 1:
                break
            m = self.HEADWORD_RE.match(self.line)
            if m is not None:
                # A new headword starts: flush the one collected so far.
                self.words.append(Headword(word, pron, attrs))
                word = m.group(1)
                pron = None
                attrs = set()
                continue
            m = self.HEADWORD_PRON_RE.match(self.line)
            if m is not None:
                if pron is not None:
                    raise ParseException("""Pronunciation is redefined...""")
                pron = m.group(1)
                continue
            m = self.HEADWORD_VAR_RE.match(self.line)
            if m is not None:
                attrs.add(m.group(1))
                continue
            raise ParseException("""Line is not a headword or translation or headword attribute...""")
        self.words.append(Headword(word, pron, attrs))

    def parse_translation_continuation(self):
        """Collect indented continuation lines of a translation text.

        Each continuation is appended with a single separating space.  On
        return ``self.line`` holds the first line that is not a continuation
        (or ``self.eof`` is set), so the caller must not read again before
        examining it.
        """
        string = ""
        while True:
            self.readline()
            if self.eof:
                return string
            m = self.CONT_RE.match(self.line)
            if m is not None:
                string += " " + m.group(1)
            else:
                return string

    def _parse_list_field(self, sense):
        """Store the current line in ``sense`` if it is a ";"-separated list field.

        Returns True when the line was consumed, False when it is not a
        list field.  Raises ParseException on an empty list item.
        """
        for pattern_name, adder_name, error in self._LIST_FIELDS:
            m = getattr(self, pattern_name).match(self.line)
            if m is None:
                continue
            add = getattr(sense, adder_name)
            for item in m.group(1).split(";"):
                item = item.strip()
                if len(item) == 0:
                    raise ParseException(error)
                add(item)
            return True
        return False

    def _parse_text_field(self, sense):
        """Store the current line in ``sense`` if it is a (language, text) field.

        Returns True when the line (plus its continuation lines) was
        consumed; in that case ``self.line`` already holds the next
        unprocessed line.  Returns False otherwise.
        """
        for pattern_name, adder_name in self._TEXT_FIELDS:
            m = getattr(self, pattern_name).match(self.line)
            if m is None:
                continue
            text = m.group(2) + self.parse_translation_continuation()
            getattr(sense, adder_name)((m.group(1), text))
            return True
        return False

    def parse_translation(self):
        """Parse the senses of an article up to the next "__" delimiter or end of input.

        Senses are separated by empty lines and each starts with a part of
        speech marker.  The resulting list of ``Sense`` objects is stored in
        ``self.tran``.

        Raises:
            ParseException: on a duplicated or missing part of speech
                marker, an empty list item, or an unrecognised line.
        """
        senses = []
        sense = None
        read = True
        while True:
            # ``read`` is False when the previous step already fetched the
            # next line while looking for continuation lines.
            if read:
                self.readline()
            read = True
            if self.eof or self.SEPARATOR_RE.match(self.line) is not None:
                # End of article: keep the sense being built.  This also
                # covers input that ends without a trailing empty line.
                if sense:
                    senses.append(sense)
                break
            if len(self.line) == 1:
                # Empty line closes the current sense.
                if sense:
                    senses.append(sense)
                sense = None
                continue
            m = self.TRANSL_POS_RE.match(self.line)
            if m is not None:
                if sense is not None:
                    raise ParseException("""Each translation should have only one part of speech marker...""")
                sense = Sense(m.group(0))
                continue
            if not sense:
                raise ParseException("""Missing part of speech marker...""")
            if self._parse_list_field(sense):
                continue
            if self._parse_text_field(sense):
                read = False
                continue
            raise ParseException("""Uknown syntax...""")
        self.tran = senses

    def __repr__(self):
        """Return a debugging representation showing the current line."""
        # ``getattr`` keeps this safe for subclasses that skip ``__init__``.
        return u"<{:s}; line={:s}>".format(type(self).__name__, repr(getattr(self, "line", None)))