py/gadict.py
author Oleksandr Gavenko <gavenkoa@gmail.com>
Thu, 15 Sep 2016 20:13:18 +0300
changeset 558 53fd793e345d
parent 554 59714b9033bc
child 565 ac68f2680ea0
permissions -rw-r--r--
Add shortcut to deploy to HG repos.
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
406
f0ac87e10d9a Fix warnings from pylint/pyflakes.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 402
diff changeset
     1
"""
f0ac87e10d9a Fix warnings from pylint/pyflakes.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 402
diff changeset
     2
gadict dictionary format parser.
f0ac87e10d9a Fix warnings from pylint/pyflakes.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 402
diff changeset
     3
"""
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     4
552
7398bc1829d6 Port to python2 because Anki library written for this version.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 542
diff changeset
     5
import sys
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     6
import regex
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     7
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     8
402
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
     9
class Prelude:
    """Dictionary metainfo structure.

    Attributes:
        name: value of the ``name:`` prelude line, or None when absent.
        about: text collected from the ``about:`` prelude section.
        urls: values of the ``url:`` prelude lines.
        authors: values of the ``by:`` prelude lines.
        licences: values of the ``term:`` prelude lines.
    """

    def __init__(self):
        self.name = None
        self.about = ""
        # The lists are created per instance.  As class attributes they were
        # shared, so appending through one Prelude leaked into every other
        # Prelude created in the same process.
        self.urls = []
        self.authors = []
        self.licences = []
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
    16
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
    17
399
a6a7036f3c6f File name is not available in parser. Move error printing to writer.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 394
diff changeset
    18
class ParseException(Exception):
    """Error raised when dictionary source can not be parsed.

    Attributes:
        msg: description of the problem.
        lineno: number of the offending line, or None when unknown.
        line: text of the offending line, or None when unknown.
    """

    def __init__(self, msg, lineno=None, line=None):
        # Pass the message to the base class so that str(exc) and printed
        # tracebacks are not empty.
        Exception.__init__(self, msg)
        self.msg = msg
        self.lineno = lineno
        self.line = line

    @staticmethod
    def _native(text):
        """Return ``text`` as the native ``str`` type of the interpreter.

        Only Python 2 ``unicode`` objects are encoded (as UTF-8); encoding
        under Python 3 would produce ``bytes``, which ``str.format`` rejects
        for the ``{:s}`` specifier.
        """
        if sys.version_info[0] == 2 and not isinstance(text, str):
            return text.encode('utf-8')
        return text

    def __repr__(self):
        """Return the message prefixed by the line number when it is known."""
        if self.lineno is None:
            return self.msg
        msg = self._native(self.msg)
        if self.line is None:
            return ":{:d}:{:s}".format(self.lineno, msg)
        return ":{:d}: {:s}\nLINE: {:s}".format(self.lineno, msg, self._native(self.line))
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    33
554
59714b9033bc Store headword structure as class. Store headwords in list to preserve order
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 553
diff changeset
    34
class Headword:
    """Headword of an article together with its optional details.

    Attributes:
        headword: the headword itself; also the result of ``str()``.
        pron: value passed as ``pron`` (None by default).
        attrs: value passed as ``attrs`` (None by default).
    """

    def __init__(self, headword, pron = None, attrs = None):
        self.headword, self.pron, self.attrs = headword, pron, attrs

    def __repr__(self):
        return "<Headword {0}>".format(self.headword)

    def __str__(self):
        return self.headword
59714b9033bc Store headword structure as class. Store headwords in list to preserve order
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 553
diff changeset
    45
530
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
    46
class Sense:
    """One sense of an article: part of speech plus attached data.

    Attributes:
        pos: part of speech marker (required, must be truthy).
        tr_list: list of translations; ``__str__`` expects every item to be
            a ``(lang, text)`` pair.
        ex_list: examples, or None until the first one is added.
        syn_list: synonyms, or None until the first one is added.
        ant_list: antonyms, or None until the first one is added.
        topic_list: topics, or None until the first one is added.
    """

    def __init__(self, pos, tr_list = None, ex_list = None, syn_list = None, ant_list = None, topic_list = None):
        """Store the sense data.

        Raises:
            ParseException: if ``pos`` is empty or None.
        """
        if not pos:
            raise ParseException("Part of speech expected...\n")
        self.pos = pos
        # Unlike the other lists, tr_list is never None: add_tr() appends
        # to it unconditionally.
        self.tr_list = tr_list
        if not tr_list:
            self.tr_list = []
        self.ex_list = ex_list
        self.syn_list = syn_list
        self.ant_list = ant_list
        self.topic_list = topic_list

    def add_tr(self, tr):
        """Append a translation."""
        self.tr_list.append(tr)

    def add_ex(self, ex):
        """Append an example, creating the list on first use."""
        if not self.ex_list:
            self.ex_list = [ex]
        else:
            self.ex_list.append(ex)

    def add_syn(self, syn):
        """Append a synonym, creating the list on first use."""
        if not self.syn_list:
            self.syn_list = [syn]
        else:
            self.syn_list.append(syn)

    def add_ant(self, ant):
        """Append an antonym, creating the list on first use."""
        if not self.ant_list:
            self.ant_list = [ant]
        else:
            self.ant_list.append(ant)

    def add_topic(self, topic):
        """Append a topic, creating the list on first use."""
        if not self.topic_list:
            self.topic_list = [topic]
        else:
            self.topic_list.append(topic)

    def __str__(self):
        """Return the first translation as "lang: text", or a placeholder."""
        # The attribute has to be read through self; the bare name
        # ``tr_list`` does not exist in this scope and raised NameError.
        if self.tr_list:
            (lang, text) = self.tr_list[0]
            return "{}: {}".format(lang, text)
        return "<empy sence>"

    def __repr__(self):
        return "<Sence {}>".format(str(self))
59714b9033bc Store headword structure as class. Store headwords in list to preserve order
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 553
diff changeset
    94
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    95
class Parser:
406
f0ac87e10d9a Fix warnings from pylint/pyflakes.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 402
diff changeset
    96
    """gadict dictionary format parser."""
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    97
432
b3a78fc20b31 Add parsing comment syntax.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 422
diff changeset
    98
    # Lines matching COMMENT_RE are skipped by readline().
    COMMENT_RE = regex.compile(r"^# ")

    # "__" line: delimiter between the prelude and the articles.
    SEPARATOR_RE = regex.compile(u"^__$")
    HEADWORD_RE = regex.compile(u"^(\\p{L}.*)$")
    HEADWORD_VAR_RE = regex.compile(u"^ +(s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$")
    # "\\p" is escaped like in the sibling patterns: a bare "\p" in a non-raw
    # literal is an invalid escape sequence for Python itself.
    HEADWORD_PRON_RE = regex.compile(u"^ +\\[([\\p{L}' ]+)\\]$")
    # The alternatives are grouped so that "^" and "$" apply to each of them.
    # Ungrouped, "^" bound only to "n" and "$" only to "prefix", so e.g. "num"
    # matched as "n" and "phr.v" matched as "phr".
    TRANSL_POS_RE = regex.compile(u"^(?:n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\\.v|contr|abbr|prefix)$")
    TRANSL_RE = regex.compile(u"^(ru|uk|la|en): ([\\p{L}(].*)$")
    TRANSL_EX_RE = regex.compile(u"^(ru|uk|la|en)> ([-\\p{L}].*)$")
    TOPIC_RE = regex.compile(u"^topic: (\\p{L}.*)$")
    SYN_RE = regex.compile(u"^syn: (\\p{L}.*)$")
    ANT_RE = regex.compile(u"^ant: (\\p{L}.*)$")

    # Continuation line: leading spaces followed by the continued text.
    CONT_RE = regex.compile(u"^ +(.*)")

    # readline() rejects any line this pattern is found in.
    TRAILING_SPACES_RE = regex.compile(u"\\p{Z}+$")

    # Prelude (dictionary metainfo) fields.
    PRELUDE_NAME_RE = regex.compile(u"^name: (.*)")
    PRELUDE_URL_RE = regex.compile(u"^url: (.*)")
    PRELUDE_AUTHOR_RE = regex.compile(u"^by: (.*)")
    PRELUDE_LICENSE_RE = regex.compile(u"^term: (.*)")
    PRELUDE_ABOUT_RE = regex.compile(u"^about: ?(.*)")
402
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
   120
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   121
    def __init__(self):
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   122
        pass
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   123
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   124
    def readline(self):
432
b3a78fc20b31 Add parsing comment syntax.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 422
diff changeset
   125
        while True:
b3a78fc20b31 Add parsing comment syntax.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 422
diff changeset
   126
            self.line = self.stream.readline()
b3a78fc20b31 Add parsing comment syntax.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 422
diff changeset
   127
            self.eof = len(self.line) == 0
b3a78fc20b31 Add parsing comment syntax.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 422
diff changeset
   128
            if not self.eof:
b3a78fc20b31 Add parsing comment syntax.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 422
diff changeset
   129
                self.lineno += 1
b3a78fc20b31 Add parsing comment syntax.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 422
diff changeset
   130
            if self.TRAILING_SPACES_RE.search(self.line):
b3a78fc20b31 Add parsing comment syntax.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 422
diff changeset
   131
                raise ParseException("Traling spaces detected...\n")
b3a78fc20b31 Add parsing comment syntax.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 422
diff changeset
   132
            if self.COMMENT_RE.search(self.line):
b3a78fc20b31 Add parsing comment syntax.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 422
diff changeset
   133
                continue
b3a78fc20b31 Add parsing comment syntax.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 422
diff changeset
   134
            break
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   135
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   136
    def parse(self, stream):
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   137
        self.lineno = 0
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   138
        self.stream = stream
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   139
        self.dom = []
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   140
        try:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   141
            self.parse_prelude()
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   142
            while not self.eof:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   143
                self.parse_article()
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   144
        except ParseException as ex:
552
7398bc1829d6 Port to python2 because Anki library written for this version.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 542
diff changeset
   145
            if sys.version_info.major == 2:
7398bc1829d6 Port to python2 because Anki library written for this version.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 542
diff changeset
   146
                import traceback
7398bc1829d6 Port to python2 because Anki library written for this version.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 542
diff changeset
   147
                traceback.print_exc()
406
f0ac87e10d9a Fix warnings from pylint/pyflakes.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 402
diff changeset
   148
            raise ParseException(ex.msg, self.lineno, self.line)
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   149
        return self.dom
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   150
412
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   151
    def parse_prelude_continuation(self):
402
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
   152
        string = ""
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
   153
        while True:
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
   154
            self.readline()
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
   155
            if self.eof:
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
   156
                return string
406
f0ac87e10d9a Fix warnings from pylint/pyflakes.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 402
diff changeset
   157
            m = self.CONT_RE.match(self.line)
402
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
   158
            if m is not None:
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
   159
                string += "\n" + m.group(1)
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
   160
            elif len(self.line) == 1:
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
   161
                string += "\n"
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
   162
            else:
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
   163
                return string
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
   164
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   165
    def parse_prelude(self):
        """Read dictionary prelude until first "__" delimiter.

        The collected Prelude is appended to ``self.dom``.  Lines that match
        none of the prelude patterns are ignored.

        Raises:
            ParseException: if input ends before the delimiter is found.
        """
        meta = Prelude()
        # Single-line fields: pattern and the consumer of its captured value.
        handlers = (
            (self.PRELUDE_NAME_RE, lambda value: setattr(meta, "name", value)),
            (self.PRELUDE_URL_RE, meta.urls.append),
            (self.PRELUDE_AUTHOR_RE, meta.authors.append),
            (self.PRELUDE_LICENSE_RE, meta.licences.append),
        )
        while True:
            self.readline()
            if self.eof:
                raise ParseException("There are no articles...")
            about = self.PRELUDE_ABOUT_RE.match(self.line)
            if about:
                meta.about += about.group(1) + self.parse_prelude_continuation()
                if self.eof:
                    raise ParseException("There are no articles...")
                # self.line now holds the line that ended the continuation;
                # it is examined by the checks below.
            if self.SEPARATOR_RE.match(self.line):
                break
            for pattern, consume in handlers:
                hit = pattern.match(self.line)
                if hit:
                    consume(hit.group(1))
                    break
        self.dom.append(meta)
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   196
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   197
    def parse_article(self):
        """Parse a single article and append it to ``self.dom``.

        Expects ``self.line`` to hold the "__" delimiter that opens the
        article.  Runs the empty-line, headline and translation sub-parsers
        in that order and stores the result as a ``(words, tran)`` tuple.
        """
        # Reset per-article state; the sub-parsers below fill it in.
        self.words = self.tran = None
        stages = (self.parse_empty_line, self.parse_headlines, self.parse_translation)
        for stage in stages:
            stage()
        article = (self.words, self.tran)
        self.dom.append(article)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   205
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   206
    def parse_empty_line(self):
        """Read the next line and require it to be an empty one.

        A line counts as empty when it is exactly one character long
        (presumably just the line terminator).  Raises ``ParseException`` at
        end of input or when the line has any other length.
        """
        self.readline()
        if not self.eof and len(self.line) == 1:
            return
        raise ParseException(""""__" delimiter should followed by empty line...""")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   210
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   211
    def parse_headlines(self):
        """Collect the headwords of the current article into ``self.words``.

        Expects ``self.line`` to be the empty line that follows the "__"
        delimiter.  The next line must match ``HEADWORD_RE``.  Every line
        after it, up to a one-character line or end of input, must be
        another headword, a pronunciation (``HEADWORD_PRON_RE``) or an
        attribute (``HEADWORD_VAR_RE``); pronunciation and attributes are
        attached to the most recently seen headword.

        Raises ``ParseException`` when input ends before any headword, when
        the first line is not a headword, when a headword gets a second
        pronunciation, or when a line matches none of the three patterns.
        """
        self.words = []
        self.readline()
        if self.eof:
            raise ParseException("""There are no definition after "__" delimiter...""")
        found = self.HEADWORD_RE.match(self.line)
        if found is None:
            raise ParseException("""There are no headword after "__" delimiter...""")
        headword, pronunciation, variants = found.group(1), None, set()
        while True:
            self.readline()
            if self.eof or len(self.line) == 1:
                break
            found = self.HEADWORD_RE.match(self.line)
            if found is not None:
                if headword is None:
                    raise ParseException("""Didn't match previous headword...""")
                # A new headword starts: flush the one gathered so far.
                self.words.append(Headword(headword, pronunciation, variants))
                headword, pronunciation, variants = found.group(1), None, set()
                continue
            found = self.HEADWORD_PRON_RE.match(self.line)
            if found is not None:
                if pronunciation is not None:
                    raise ParseException("""Pronunciation is redefined...""")
                pronunciation = found.group(1)
                continue
            found = self.HEADWORD_VAR_RE.match(self.line)
            if found is None:
                raise ParseException("""Line is not a headword or translation or headword attribute...""")
            variants.add(found.group(1))
        # The last headword is still pending when the loop ends.
        self.words.append(Headword(headword, pronunciation, variants))
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   248
412
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   249
    def parse_translation_continuation(self):
        """Consume continuation lines and return their accumulated text.

        Keeps reading lines while they match ``CONT_RE``; each match adds a
        newline followed by the pattern's first group.  Stops at end of
        input or at the first non-matching line, which stays in
        ``self.line`` for the caller to process.  Returns an empty string
        when there is no continuation at all.
        """
        pieces = []
        self.readline()
        while not self.eof:
            found = self.CONT_RE.match(self.line)
            if found is None:
                break
            pieces.append("\n" + found.group(1))
            self.readline()
        return "".join(pieces)
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   260
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   261
    def parse_translation(self):
        """Parse the senses of the current article into ``self.tran``.

        Reads lines until end of input or a line matching ``SEPARATOR_RE``.
        A sense is opened by a part-of-speech line (``TRANSL_POS_RE``) and
        closed by a one-character (empty) line, by the separator, or by end
        of input.  Inside a sense a line may list topics, synonyms or
        antonyms (";"-separated), or hold a translation or an example; the
        latter two may span several lines via
        ``parse_translation_continuation``.

        The result stored in ``self.tran`` is the list of ``Sense`` objects
        in the order they were encountered.

        Raises ``ParseException`` on a second part-of-speech marker inside
        one sense, on content before any part-of-speech marker, on an empty
        topic/synonym/antonym item, and on a line matching no known syntax.
        """
        senses = []
        sense = None
        # The continuation parser reads one line ahead; when it has run, the
        # lookahead line is already in self.line and must not be skipped.
        read = True
        while True:
            if read:
                self.readline()
            read = True
            if self.eof or self.SEPARATOR_RE.match(self.line) is not None:
                break
            if len(self.line) == 1:
                # An empty line closes the current sense, if any.
                if sense:
                    senses.append(sense)
                sense = None
                continue
            m = self.TRANSL_POS_RE.match(self.line)
            if m is not None:
                if sense is not None:
                    raise ParseException("""Each translation should have only one part of speech marker...""")
                sense = Sense(m.group(0))
                continue
            if not sense:
                raise ParseException("""Missing part of speech marker...""")
            # Topic, synonym and antonym lines share one ";"-separated shape;
            # the patterns are tried in this fixed order.
            list_fields = (
                (self.TOPIC_RE, sense.add_topic, """Empty topic..."""),
                (self.SYN_RE, sense.add_syn, """Empty synonym..."""),
                (self.ANT_RE, sense.add_ant, """Empty antonym..."""),
            )
            handled = False
            for pattern, add_item, empty_message in list_fields:
                m = pattern.match(self.line)
                if m is None:
                    continue
                for item in m.group(1).split(";"):
                    item = item.strip()
                    if not item:
                        raise ParseException(empty_message)
                    add_item(item)
                handled = True
                break
            if handled:
                continue
            m = self.TRANSL_RE.match(self.line)
            if m is not None:
                sense.add_tr((m.group(1), m.group(2) + self.parse_translation_continuation()))
                read = False
                continue
            m = self.TRANSL_EX_RE.match(self.line)
            if m is not None:
                sense.add_ex((m.group(1), m.group(2) + self.parse_translation_continuation()))
                read = False
                continue
            raise ParseException("""Uknown syntax...""")
        # Flush a sense that is still open when the loop ends.  This covers
        # both the separator and end of input, so the final sense is kept
        # even when the input does not end with an empty line.
        if sense:
            senses.append(sense)
        self.tran = senses