py/gadict.py
author Oleksandr Gavenko <gavenkoa@gmail.com>
Tue, 08 May 2018 20:35:36 +0300
changeset 1055 fa9c3391e40a
parent 1011 fdf5640f221a
child 1148 263e9e066981
permissions -rw-r--r--
Added new articles.

# -*- coding: utf-8 -*-
"""
gadict dictionary format parser.
"""

import sys
import re


class Prelude:
    """Dictionary metainfo structure.

    Attributes:
        name: dictionary name, or None when not set.
        about: free-form description text ("" when not set).
        urls: list of URL strings.
        authors: list of author strings.
        licences: list of licence/term strings.
    """

    def __init__(self):
        self.name = None
        self.about = ""
        # The list fields are created per instance: a list declared at class
        # level would be shared, so appending through one Prelude would show
        # up in every other Prelude as well.
        self.urls = []
        self.authors = []
        self.licences = []


class ParseException(BaseException):
    """Error raised when the dictionary source is malformed.

    NOTE: the class derives from BaseException, so ``except Exception``
    handlers do not catch it; catch ParseException explicitly.

    Attributes:
        msg: human readable description of the problem.
        lineno: line number the problem was detected at, or None if unknown.
        line: text of the offending line, or None if unknown.
    """

    def __init__(self, msg, lineno=None, line=None):
        BaseException.__init__(self)
        self.msg = msg
        self.lineno = lineno
        self.line = line

    def __repr__(self):
        """Return the message, prefixed with the location when it is known."""
        if self.lineno is None:
            return self.msg
        elif self.line is None:
            return u":{:d}:{:s}".format(self.lineno, self.msg)
        else:
            return u":{:d}: {:s}\nLINE: {:s}".format(self.lineno, self.msg, self.line)

    def __str__(self):
        """Return the same text as ``repr()``.

        BaseException.__init__ is called without arguments, so the inherited
        ``__str__`` would produce an empty string and tracebacks would show
        the exception without any message.
        """
        return self.__repr__()

class Headword:
    """One spelling of an article's headword together with its annotations.

    Attributes:
        headword: the word text itself.
        pron: pronunciation, or None.
        attrs: collection of variant markers attached to the word, or None.
        homo: list of homophones, or None.
    """

    def __init__(self, headword, pron=None, attrs=None, homo=None):
        self.headword, self.pron = headword, pron
        self.attrs, self.homo = attrs, homo

    def __str__(self):
        """Return the bare headword text."""
        return self.headword

    def __repr__(self):
        """Return a short debugging form that shows the headword."""
        template = "<Headword {}>"
        return template.format(self.headword)

class Sense:
    """One meaning of an article: a part of speech plus its annotations.

    Every ``*_list`` attribute is either None (nothing recorded yet) or a
    list; the ``add_*`` methods create the list on first use.

    Attributes:
        pos: part of speech marker (required, must be truthy).
        tr_list: translations as ``(lang, text)`` tuples.
        ex_list: examples as ``(lang, text)`` tuples.
        glos_list: glosses as ``(lang, text)`` tuples.
        ant_list, syn_list, rel_list, topic_list, hyper_list, hypo_list,
        col_list: antonyms, synonyms, related words, topics, hypernyms,
            hyponyms and collocations.
        countable: True, False, or None when not specified.
    """

    def __init__(self, pos, tr_list=None, ex_list=None, glos_list=None,
                 ant_list=None, syn_list=None, rel_list=None,
                 topic_list=None, hyper_list=None, hypo_list=None,
                 col_list=None, countable=None):
        """Create a sense; raise ParseException when ``pos`` is empty."""
        if not pos:
            raise ParseException("Part of speech expected...\n")
        self.pos = pos
        self.tr_list = tr_list
        self.ex_list = ex_list
        self.glos_list = glos_list
        self.ant_list = ant_list
        self.syn_list = syn_list
        self.rel_list = rel_list
        self.topic_list = topic_list
        self.hyper_list = hyper_list
        self.hypo_list = hypo_list
        self.col_list = col_list
        self.countable = countable

    def _append(self, attr, item):
        """Append ``item`` to the list stored in attribute ``attr``.

        A missing (None) or empty list is replaced by a fresh one-element
        list rather than being extended in place, so an empty list passed to
        the constructor is never mutated.
        """
        current = getattr(self, attr)
        if current:
            current.append(item)
        else:
            setattr(self, attr, [item])

    def add_tr(self, tr):
        """Record a translation."""
        self._append("tr_list", tr)

    def add_ex(self, ex):
        """Record an example."""
        self._append("ex_list", ex)

    def add_glos(self, glos):
        """Record a gloss."""
        self._append("glos_list", glos)

    def add_ant(self, ant):
        """Record an antonym."""
        self._append("ant_list", ant)

    def add_syn(self, syn):
        """Record a synonym."""
        self._append("syn_list", syn)

    def add_rel(self, rel):
        """Record a related word."""
        self._append("rel_list", rel)

    def add_topic(self, topic):
        """Record a topic."""
        self._append("topic_list", topic)

    def add_hyper(self, hyper):
        """Record a hypernym."""
        self._append("hyper_list", hyper)

    def add_hypo(self, hypo):
        """Record a hyponym."""
        self._append("hypo_list", hypo)

    def add_col(self, col):
        """Record a collocation."""
        self._append("col_list", col)

    def set_countable(self, countable):
        """Set the countable flag from ``'yes'``/``'no'`` or from a bool.

        Raises:
            ParseException: for any other text or any other type.
        """
        # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so
        # text taken from a unicode regex match is accepted on both.
        text_types = (str, type(u""))
        if isinstance(countable, bool):
            self.countable = countable
        elif isinstance(countable, text_types):
            if countable == 'yes':
                self.countable = True
            elif countable == 'no':
                self.countable = False
            else:
                raise ParseException("Countable can only be yes/no.")
        else:
            raise ParseException("Countable can only be yes/no or bool.")

    def __str__(self):
        """Return ``"lang: text"`` of the first translation, else a placeholder."""
        # The list lives on the instance; reading a bare ``tr_list`` name
        # here raises NameError.
        if self.tr_list:
            (lang, text) = self.tr_list[0]
            return "{}: {}".format(lang, text)
        # Placeholder text is kept byte-for-byte, including its spelling.
        return "<empy sense>"

    def __repr__(self):
        """Return a debugging form built from ``str(self)``."""
        return "<Sense {}>".format(str(self))

class Parser:
    """gadict dictionary format parser.

    Input layout, as accepted by the methods below:

    * a prelude of ``name:``, ``url:``, ``by:``, ``term:`` and ``about:``
      lines, terminated by the first ``__`` separator line;
    * any number of articles; each one is a ``__`` line, an empty line, one
      or more headwords with indented attributes, an empty line, and then
      senses separated by empty lines.

    Lines starting with ``"# "`` are comments and are skipped everywhere.
    Trailing whitespace on any line is an error.

    ``parse()`` returns a list whose first item is a Prelude and whose
    remaining items are ``(headwords, senses)`` tuples.
    """

    COMMENT_RE = re.compile("^# ")

    SEPARATOR_RE = re.compile(u"^__$", re.UNICODE)
    # re.UNICODE added for consistency with every other pattern here (it only
    # makes a difference on Python 2, where \w is ASCII-only without it).
    HEADWORD_RE = re.compile(u"^(\\w.*)$", re.UNICODE)
    HEADWORD_VAR_RE = re.compile(u"^ +(rare|s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$", re.UNICODE)
    HEADWORD_PRON_RE = re.compile(u"^ +\\[([a-zˌˈːəæɛɒʊɪɔɜɑʌɚɐɹʃʧθðɡʒŋɾʔ ]+)\\]$", re.UNICODE)
    HEADWORD_HOMO_RE = re.compile(u"^ +homo: (\\w|\\w[-'\\w ;]*\\w)$", re.UNICODE)
    TRANSL_POS_RE = re.compile(u"^(?:n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\\.v|contr|abbr|prefix)$", re.UNICODE)
    TRANSL_RE = re.compile(u"^(ru|uk|la|en): ([.\\w(].*)$", re.UNICODE)
    TRANSL_EX_RE = re.compile(u"""^(ru|uk|la|en)> ([-'"\\w].*)$""", re.UNICODE)
    TRANSL_GLOS_RE = re.compile(u"^(ru|uk|la|en)= ([-\\w\\d].*)$", re.UNICODE)
    CNT_RE = re.compile(u"^cnt: (yes|no)$", re.UNICODE)
    TOPIC_RE = re.compile(u"^topic: (\\w.*)$", re.UNICODE)
    SYN_RE = re.compile(u"^syn: (\\w.*)$", re.UNICODE)
    ANT_RE = re.compile(u"^ant: (\\w.*)$", re.UNICODE)
    REL_RE = re.compile(u"^rel: (\\w.*)$", re.UNICODE)
    HYPER_RE = re.compile(u"^hyper: (\\w.*)$", re.UNICODE)
    HYPO_RE = re.compile(u"^hypo: (\\w.*)$", re.UNICODE)
    COL_RE = re.compile(u"^col: (\\w.*)$", re.UNICODE)

    CONT_RE = re.compile(u"^ +(.*)", re.UNICODE)

    TRAILING_SPACES_RE = re.compile(u"\\s+$", re.UNICODE)

    PRELUDE_NAME_RE = re.compile(u"^name: (.*)", re.UNICODE)
    PRELUDE_URL_RE = re.compile(u"^url: (.*)", re.UNICODE)
    PRELUDE_AUTHOR_RE = re.compile(u"^by: (.*)", re.UNICODE)
    PRELUDE_LICENSE_RE = re.compile(u"^term: (.*)", re.UNICODE)
    PRELUDE_ABOUT_RE = re.compile(u"^about: ?(.*)", re.UNICODE)

    # Sense lines holding a ";"-separated list, tried in this order:
    # (pattern, name of the Sense method storing one item, error for an
    # empty item).
    _LIST_FIELDS = (
        (TOPIC_RE, "add_topic", """Empty topic..."""),
        (SYN_RE, "add_syn", """Empty synonym..."""),
        (ANT_RE, "add_ant", """Empty antonym..."""),
        (REL_RE, "add_rel", """Empty relation..."""),
        (HYPER_RE, "add_hyper", """Empty hypernym..."""),
        (HYPO_RE, "add_hypo", """Empty hyponym..."""),
        (COL_RE, "add_col", """Empty collocations..."""),
    )

    # Sense lines of the form "<lang><mark> text" that may continue on
    # following indented lines, tried in this order:
    # (pattern, name of the Sense method storing the (lang, text) pair).
    _TEXT_FIELDS = (
        (TRANSL_RE, "add_tr"),
        (TRANSL_EX_RE, "add_ex"),
        (TRANSL_GLOS_RE, "add_glos"),
    )

    def __init__(self):
        # Give every piece of parser state a defined value so the object can
        # be inspected (e.g. repr()) before parse() has been called.
        self.stream = None
        self.line = None
        self.lineno = 0
        self.eof = False
        self.dom = []
        self.words = None
        self.tran = None

    def readline(self):
        """Load the next non-comment line into ``self.line``.

        Sets ``self.eof`` when the stream is exhausted (``self.line`` is then
        the empty string) and counts every physical line, comments included,
        in ``self.lineno``.

        Raises:
            ParseException: if the line ends with whitespace.
        """
        while True:
            raw = self.stream.readline()
            self.eof = len(raw) == 0
            if not self.eof:
                self.lineno += 1
            self.line = raw.rstrip('\n')
            if self.TRAILING_SPACES_RE.search(self.line):
                raise ParseException("Traling spaces detected...\n")
            if not self.COMMENT_RE.search(self.line):
                return

    def parse(self, stream):
        """Parse a whole dictionary from ``stream`` and return the DOM list.

        ``stream`` only needs a ``readline()`` method returning text.

        Raises:
            ParseException: on malformed input; the exception carries the
                number and text of the line being processed.
        """
        self.lineno = 0
        self.stream = stream
        self.dom = []
        try:
            self.parse_prelude()
            while not self.eof:
                self.parse_article()
        except ParseException as ex:
            if sys.version_info.major == 2:
                # Python 2 has no exception chaining, so print the original
                # traceback before it is replaced by the located exception.
                import traceback
                traceback.print_exc()
            raise ParseException(ex.msg, self.lineno, self.line)
        return self.dom

    def parse_prelude_continuation(self):
        """Collect the indented/empty lines that continue an ``about:`` field.

        Each continuation line contributes a newline plus its text without
        the leading spaces; an empty line contributes a bare newline.  On
        return ``self.line`` holds the first line that is not part of the
        continuation (or ``self.eof`` is set).
        """
        pieces = []
        while True:
            self.readline()
            if self.eof:
                break
            m = self.CONT_RE.match(self.line)
            if m is not None:
                pieces.append(m.group(1))
            elif len(self.line) == 0:
                pieces.append("")
            else:
                break
        return "".join("\n" + piece for piece in pieces)

    def parse_prelude(self):
        """Read dictionary prelude until first "__" delimiter.

        Lines that match none of the prelude patterns are ignored.  The
        resulting Prelude is appended to ``self.dom``.

        Raises:
            ParseException: if the stream ends before any "__" delimiter.
        """
        pre = Prelude()
        # Use lists owned by this prelude, so repeated parses never append
        # to a list shared between Prelude objects.
        pre.urls = []
        pre.authors = []
        pre.licences = []
        while True:
            self.readline()
            if self.eof:
                raise ParseException("There are no articles...")
            m = self.PRELUDE_ABOUT_RE.match(self.line)
            if m:
                pre.about += m.group(1) + self.parse_prelude_continuation()
                if self.eof:
                    raise ParseException("There are no articles...")
                # No ``continue``: self.line now holds the line that ended
                # the continuation and it still has to be examined below.
            if self.SEPARATOR_RE.match(self.line):
                break
            m = self.PRELUDE_NAME_RE.match(self.line)
            if m:
                pre.name = m.group(1)
                continue
            m = self.PRELUDE_URL_RE.match(self.line)
            if m:
                pre.urls.append(m.group(1))
                continue
            m = self.PRELUDE_AUTHOR_RE.match(self.line)
            if m:
                pre.authors.append(m.group(1))
                continue
            m = self.PRELUDE_LICENSE_RE.match(self.line)
            if m:
                pre.licences.append(m.group(1))
                continue
        self.dom.append(pre)

    def parse_article(self):
        """Try to match article until next "__" delimiter. Assume that `self.line` point to "__" delimiter."""
        self.words = None
        self.tran = None
        self.parse_empty_line()
        self.parse_headlines()
        self.parse_translation()
        self.dom.append((self.words, self.tran))

    def parse_empty_line(self):
        """Consume the mandatory empty line that follows a "__" delimiter."""
        self.readline()
        if self.eof or len(self.line) != 0:
            raise ParseException(""""__" delimiter should followed by empty line...""")

    def parse_headlines(self):
        """Try to match word variations with attributed. Assume that `self.line` on preceding empty line.

        Reads headword lines and their indented attribute lines
        (pronunciation, variant markers, homophones) up to the next empty
        line or end of input, and stores the Headword list in ``self.words``.

        Raises:
            ParseException: on a missing headword, an unrecognised line, or
                a repeated pronunciation / homophone line for one headword.
        """
        self.words = []
        self.readline()
        if self.eof:
            raise ParseException("""There are no definition after "__" delimiter...""")
        m = self.HEADWORD_RE.match(self.line)
        if m is None:
            raise ParseException("""There are no headword after "__" delimiter...""")
        word = m.group(1)
        pron = None
        attrs = set()
        homo = None
        while True:
            self.readline()
            if self.eof or len(self.line) == 0:
                break
            m = self.HEADWORD_RE.match(self.line)
            if m is not None:
                # A new headword starts: flush the one collected so far.
                self.words.append(Headword(word, pron, attrs, homo))
                word = m.group(1)
                pron = None
                attrs = set()
                homo = None
                continue
            m = self.HEADWORD_PRON_RE.match(self.line)
            if m is not None:
                if pron is not None:
                    raise ParseException("""Pronunciation is redefined...""")
                pron = m.group(1)
                continue
            m = self.HEADWORD_VAR_RE.match(self.line)
            if m is not None:
                attrs.add(m.group(1))
                continue
            m = self.HEADWORD_HOMO_RE.match(self.line)
            if m is not None:
                if homo is not None:
                    raise ParseException("""Homophones are redefined...""")
                homo = [s.strip() for s in m.group(1).split(";")]
                continue
            raise ParseException("""Line is not a headword or translation or headword attribute...""")
        self.words.append(Headword(word, pron, attrs, homo))

    def parse_translation_continuation(self):
        """Collect the indented lines that continue a translation-like field.

        Each continuation line contributes a single space plus its text
        without the leading spaces.  On return ``self.line`` holds the first
        line that is not a continuation (or ``self.eof`` is set).
        """
        pieces = []
        while True:
            self.readline()
            if self.eof:
                break
            m = self.CONT_RE.match(self.line)
            if m is None:
                break
            pieces.append(m.group(1))
        return "".join(" " + piece for piece in pieces)

    def _parse_list_field(self, sense):
        """Store a ";"-separated list line (topic, syn, ...) into ``sense``.

        Returns True when ``self.line`` matched one of ``_LIST_FIELDS``,
        False otherwise.  Raises ParseException on an empty list item.
        """
        for regex, adder, empty_msg in self._LIST_FIELDS:
            m = regex.match(self.line)
            if m is None:
                continue
            add = getattr(sense, adder)
            for item in m.group(1).split(";"):
                item = item.strip()
                if len(item) == 0:
                    raise ParseException(empty_msg)
                add(item)
            return True
        return False

    def _parse_text_field(self, sense):
        """Store a translation / example / gloss line into ``sense``.

        Returns True when ``self.line`` matched one of ``_TEXT_FIELDS``.  In
        that case continuation lines were consumed too and ``self.line``
        already holds the next unprocessed line.
        """
        for regex, adder in self._TEXT_FIELDS:
            m = regex.match(self.line)
            if m is None:
                continue
            text = m.group(2) + self.parse_translation_continuation()
            getattr(sense, adder)((m.group(1), text))
            return True
        return False

    def parse_translation(self):
        """Read the senses of one article into ``self.tran``.

        Reads up to the next "__" delimiter or end of input.  Senses are
        separated by empty lines and each must start with a part of speech
        marker.

        Raises:
            ParseException: on a missing or repeated part of speech marker,
                an empty list item, or an unrecognised line.
        """
        senses = []
        sense = None
        read = True
        while True:
            if read:
                self.readline()
            read = True
            if self.eof or self.SEPARATOR_RE.match(self.line) is not None:
                if sense is not None:
                    senses.append(sense)
                break
            if len(self.line) == 0:
                # An empty line closes the sense being collected.
                if sense is not None:
                    senses.append(sense)
                sense = None
                continue
            m = self.TRANSL_POS_RE.match(self.line)
            if m is not None:
                if sense is not None:
                    raise ParseException("""Each translation should have only one part of speech marker...""")
                sense = Sense(m.group(0))
                continue
            if sense is None:
                raise ParseException("""Missing part of speech marker...""")
            m = self.CNT_RE.match(self.line)
            if m is not None:
                sense.set_countable(m.group(1))
                continue
            if self._parse_list_field(sense):
                continue
            if self._parse_text_field(sense):
                # The continuation reader has already fetched the next line.
                read = False
                continue
            raise ParseException("""Uknown syntax...""")
        self.tran = senses

    def __repr__(self):
        """Return the class name and the line currently being processed."""
        # getattr keeps repr() usable even if a subclass skipped __init__.
        return u"<{:s}; line={:s}>".format(type(self).__name__, repr(getattr(self, "line", None)))