py/gadict.py
author Oleksandr Gavenko <gavenkoa@gmail.com>
Sat, 24 Mar 2018 21:51:31 +0200
changeset 1028 570faef6aaef
parent 1011 fdf5640f221a
child 1148 263e9e066981
permissions -rw-r--r--
Added some professions.
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
631
f6d706812a0f Fix: SyntaxError: Non-ASCII character '\xcb' in file, but no encoding
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 629
diff changeset
     1
# -*- coding: utf-8 -*-
406
f0ac87e10d9a Fix warnings from pylint/pyflakes.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 402
diff changeset
     2
"""
f0ac87e10d9a Fix warnings from pylint/pyflakes.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 402
diff changeset
     3
gadict dictionary format parser.
f0ac87e10d9a Fix warnings from pylint/pyflakes.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 402
diff changeset
     4
"""
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     5
552
7398bc1829d6 Port to python2 because Anki library written for this version.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 542
diff changeset
     6
import sys
757
5417f2102dc5 Switch to built-in `re` Python module over `regex`.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 740
diff changeset
     7
import re
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     8
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     9
402
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
    10
class Prelude:
    """Dictionary metainfo structure.

    Attributes:
        name: dictionary name, ``None`` until assigned.
        about: free-form description text, empty by default.
        urls: list of URL entries.
        authors: list of author entries.
        licences: list of licence entries.
    """

    # Immutable defaults may safely stay on the class.
    name = None
    about = ""

    def __init__(self):
        # The list attributes are created per instance: as class attributes
        # they would be a single shared object, so entries appended while
        # handling one dictionary would leak into every other Prelude.
        self.urls = []
        self.authors = []
        self.licences = []
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
    17
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
    18
399
a6a7036f3c6f File name is not available in parser. Move error printing to writer.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 394
diff changeset
    19
class ParseException(BaseException):
    """Error raised when the dictionary source cannot be parsed.

    Attributes:
        msg: description of the problem.
        lineno: line number the problem refers to, or ``None`` if unknown.
        line: text of the offending line, or ``None`` if unknown.

    NOTE(review): the class derives from ``BaseException``, so handlers
    written as ``except Exception`` do not catch it.  The base class is kept
    unchanged for compatibility with existing callers.
    """

    def __init__(self, msg, lineno=None, line=None):
        # Hand the message to the base class so that str(exc) and tracebacks
        # show it instead of an empty string.
        BaseException.__init__(self, msg)
        self.msg = msg
        self.lineno = lineno
        self.line = line

    def __repr__(self):
        """Return the message, prefixed with location info when available."""
        if self.lineno is None:
            return self.msg
        if self.line is None:
            return u":{:d}:{:s}".format(self.lineno, self.msg)
        return u":{:d}: {:s}\nLINE: {:s}".format(self.lineno, self.msg, self.line)
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    34
554
59714b9033bc Store headword structure as class. Store headwords in list to preserve order
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 553
diff changeset
    35
class Headword:
    """One headword of a dictionary article plus its optional extras.

    Attributes:
        headword: the headword text itself.
        pron: optional pronunciation.
        attrs: optional attributes attached to the headword.
        homo: optional homophones information.
    """

    def __init__(self, headword, pron = None, attrs = None, homo = None):
        self.headword, self.pron = headword, pron
        self.attrs, self.homo = attrs, homo

    def __str__(self):
        word = self.headword
        return word

    def __repr__(self):
        template = "<Headword {}>"
        return template.format(self.headword)
59714b9033bc Store headword structure as class. Store headwords in list to preserve order
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 553
diff changeset
    47
530
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
    48
class Sense:
    """One sense of a dictionary article, tied to a part of speech.

    Every ``*_list`` attribute is either ``None`` or a list; the ``add_*``
    methods create the list on first use.  ``tr_list`` items are
    ``(lang, text)`` pairs (see ``__str__``).  ``countable`` is ``None``,
    ``True`` or ``False``.

    Raises:
        ParseException: from the constructor when ``pos`` is empty.
    """

    def __init__(self, pos, tr_list = None, ex_list = None, glos_list = None,
                 ant_list = None, syn_list = None, rel_list = None,
                 topic_list = None, hyper_list = None, hypo_list = None,
                 col_list = None, countable = None):
        if not pos:
            raise ParseException("Part of speech expected...\n")
        self.pos = pos
        self.tr_list = tr_list
        self.ex_list = ex_list
        self.glos_list = glos_list
        self.ant_list = ant_list
        self.syn_list = syn_list
        self.rel_list = rel_list
        self.topic_list = topic_list
        self.hyper_list = hyper_list
        self.hypo_list = hypo_list
        self.col_list = col_list
        self.countable = countable

    def _append(self, attr, item):
        """Append ``item`` to the list stored in attribute ``attr``.

        A missing (``None``) or empty list is replaced by a new one-element
        list, exactly as each ``add_*`` method did individually before.
        """
        current = getattr(self, attr)
        if current:
            current.append(item)
        else:
            setattr(self, attr, [item])

    def add_tr(self, tr):
        """Add a translation."""
        self._append("tr_list", tr)

    def add_ex(self, ex):
        """Add an entry to ``ex_list``."""
        self._append("ex_list", ex)

    def add_glos(self, glos):
        """Add a glossary/explanation entry."""
        self._append("glos_list", glos)

    def add_ant(self, ant):
        """Add an antonym."""
        self._append("ant_list", ant)

    def add_syn(self, syn):
        """Add a synonym."""
        self._append("syn_list", syn)

    def add_rel(self, rel):
        """Add a related word."""
        self._append("rel_list", rel)

    def add_topic(self, topic):
        """Add a topic."""
        self._append("topic_list", topic)

    def add_hyper(self, hyper):
        """Add a hypernym."""
        self._append("hyper_list", hyper)

    def add_hypo(self, hypo):
        """Add a hyponym."""
        self._append("hypo_list", hypo)

    def add_col(self, col):
        """Add a collocation reference."""
        self._append("col_list", col)

    def set_countable(self, countable):
        """Set the countable flag from ``'yes'``/``'no'`` or a bool.

        Raises:
            ParseException: for any other string or any other type.
        """
        # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so
        # text values are accepted on both interpreters.
        if isinstance(countable, (str, type(u""))):
            if countable == 'yes':
                self.countable = True
            elif countable == 'no':
                self.countable = False
            else:
                raise ParseException("Countable can only be yes/no.")
        elif isinstance(countable, bool):
            self.countable = countable
        else:
            raise ParseException("Countable can only be yes/no or bool.")

    def __str__(self):
        """Return ``"lang: text"`` for the first translation, if any."""
        # Must read the instance attribute; a bare ``tr_list`` is an
        # undefined name here and raised NameError on every call.
        if self.tr_list:
            (lang, text) = self.tr_list[0]
            return "{}: {}".format(lang, text)
        # NOTE: placeholder spelling is kept as-is; callers may match on it.
        return "<empy sense>"

    def __repr__(self):
        return "<Sense {}>".format(str(self))
554
59714b9033bc Store headword structure as class. Store headwords in list to preserve order
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 553
diff changeset
   146
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   147
class Parser:
406
f0ac87e10d9a Fix warnings from pylint/pyflakes.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 402
diff changeset
   148
    """gadict dictionary format parser."""
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   149
757
5417f2102dc5 Switch to built-in `re` Python module over `regex`.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 740
diff changeset
   150
    # Comment line: starts with a hash followed by a space (skipped by readline).
    COMMENT_RE = re.compile("^# ")

    # A line consisting of exactly two underscores.
    SEPARATOR_RE = re.compile(u"^__$", re.UNICODE)
    # Unindented line starting with a word character; group 1 is the whole line.
    HEADWORD_RE = re.compile( u"^(\\w.*)$" )
    # Indented line holding exactly one of the listed markers (group 1).
    HEADWORD_VAR_RE = re.compile(u"^ +(rare|s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$", re.UNICODE)
    # Indented "[...]" line; group 1 is the bracket content, limited to the
    # listed lowercase letters, IPA signs and spaces.
    HEADWORD_PRON_RE = re.compile(u"^ +\\[([a-zˌˈːəæɛɒʊɪɔɜɑʌɚɐɹʃʧθðɡʒŋɾʔ ]+)\\]$", re.UNICODE)
    # Indented "homo: ..." line; group 1 must start and end with a word character.
    HEADWORD_HOMO_RE = re.compile(u"^ +homo: (\\w|\\w[-'\\w ;]*\\w)$", re.UNICODE)
    # A whole line equal to one of the listed part-of-speech markers (no capture group).
    TRANSL_POS_RE = re.compile(u"^(?:n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\\.v|contr|abbr|prefix)$", re.UNICODE)
    # "<lang>: text" line; group 1 is the language code, group 2 the text,
    # which must start with a dot, a word character or "(".
    TRANSL_RE = re.compile(u"^(ru|uk|la|en): ([.\\w(].*)$", re.UNICODE)
    # "<lang>> text" line; group 2 must start with "-", a quote or a word character.
    TRANSL_EX_RE = re.compile(u"""^(ru|uk|la|en)> ([-'"\\w].*)$""", re.UNICODE)
    # "<lang>= text" line; group 2 must start with "-" or a word character.
    TRANSL_GLOS_RE = re.compile(u"^(ru|uk|la|en)= ([-\\w\\d].*)$", re.UNICODE)
    # "cnt: yes" or "cnt: no"; group 1 is the value.
    CNT_RE = re.compile(u"^cnt: (yes|no)$", re.UNICODE)
    # "<keyword>: value" lines; group 1 is the value and must start with a
    # word character.
    TOPIC_RE = re.compile(u"^topic: (\\w.*)$", re.UNICODE)
    SYN_RE = re.compile(u"^syn: (\\w.*)$", re.UNICODE)
    ANT_RE = re.compile(u"^ant: (\\w.*)$", re.UNICODE)
    REL_RE = re.compile(u"^rel: (\\w.*)$", re.UNICODE)
    HYPER_RE = re.compile(u"^hyper: (\\w.*)$", re.UNICODE)
    HYPO_RE = re.compile(u"^hypo: (\\w.*)$", re.UNICODE)
    COL_RE = re.compile(u"^col: (\\w.*)$", re.UNICODE)

    # Line starting with one or more spaces; group 1 is the rest of the line.
    CONT_RE = re.compile(u"^ +(.*)", re.UNICODE)

    # Whitespace at the end of a line (treated as an error by readline).
    TRAILING_SPACES_RE = re.compile(u"\\s+$", re.UNICODE)

    # Prelude (metainfo) lines of the form "<key>: value"; group 1 is the value.
    PRELUDE_NAME_RE = re.compile(u"^name: (.*)", re.UNICODE)
    PRELUDE_URL_RE = re.compile(u"^url: (.*)", re.UNICODE)
    PRELUDE_AUTHOR_RE = re.compile(u"^by: (.*)", re.UNICODE)
    PRELUDE_LICENSE_RE = re.compile(u"^term: (.*)", re.UNICODE)
    # The space after "about:" is optional, so the value may be empty.
    PRELUDE_ABOUT_RE = re.compile(u"^about: ?(.*)", re.UNICODE)
402
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
   179
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   180
    def __init__(self):
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   181
        pass
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   182
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   183
    def readline(self):
        """Advance to the next non-comment line of ``self.stream``.

        After the call ``self.line`` holds that line without its trailing
        newline and ``self.eof`` tells whether the stream is exhausted (in
        which case ``self.line`` is empty).  ``self.lineno`` is incremented
        for every physical line read, skipped comment lines included.

        Raises ParseException when a line matches ``TRAILING_SPACES_RE``;
        that check runs before the comment check.
        """
        while True:
            raw = self.stream.readline()
            # An empty read (not even a newline) is how the stream signals EOF.
            self.eof = not raw
            if raw:
                self.lineno += 1
            self.line = raw.rstrip('\n')
            if self.TRAILING_SPACES_RE.search(self.line):
                raise ParseException("Traling spaces detected...\n")
            if not self.COMMENT_RE.search(self.line):
                return
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   195
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   196
    def parse(self, stream):
        """Parse a whole dictionary from ``stream`` and return its DOM.

        The returned list is built by ``parse_prelude()`` (which appends the
        prelude object) followed by one ``parse_article()`` call per article
        until end of input.

        Any ParseException raised while parsing is re-raised as a new
        ParseException carrying the original message plus the current line
        number and line text.  On Python 2 the original traceback is printed
        first.
        """
        self.stream = stream
        self.lineno = 0
        self.dom = []
        try:
            self.parse_prelude()
            while not self.eof:
                self.parse_article()
        except ParseException as err:
            running_py2 = sys.version_info[0] == 2
            if running_py2:
                # NOTE(review): presumably printed because Python 2 has no
                # implicit exception chaining -- confirm.
                import traceback
                traceback.print_exc()
            raise ParseException(err.msg, self.lineno, self.line)
        else:
            return self.dom
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   210
412
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   211
    def parse_prelude_continuation(self):
        """Collect the continuation lines that follow a prelude field.

        Reads lines until end of input or until a line that is neither empty
        nor matched by ``CONT_RE``.  Each continuation line contributes a
        newline plus its captured text, each empty line a bare newline.  The
        terminating line is left in ``self.line`` for the caller.

        Returns the collected text ("" when there is no continuation).
        """
        pieces = []
        while True:
            self.readline()
            if self.eof:
                break
            cont = self.CONT_RE.match(self.line)
            if cont is not None:
                pieces.append("\n" + cont.group(1))
                continue
            if len(self.line) != 0:
                # First line of whatever comes next: stop here.
                break
            pieces.append("\n")
        return "".join(pieces)
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
   224
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   225
    def parse_prelude(self):
        """Consume the dictionary prelude up to the first "__" separator.

        Recognised lines fill a ``Prelude`` object: the "about" text (with its
        continuation lines), the name, and the url / author / licence lists.
        Lines that match none of the patterns are skipped.  The finished
        prelude is appended to ``self.dom`` and ``self.line`` is left on the
        separator line.

        Raises ParseException if the input ends before a separator is seen.
        """
        prelude = Prelude()
        # Patterns whose captured text is appended to a list attribute of the
        # prelude; they are tried in this order, after the name pattern.
        list_fields = (
            (self.PRELUDE_URL_RE, "urls"),
            (self.PRELUDE_AUTHOR_RE, "authors"),
            (self.PRELUDE_LICENSE_RE, "licences"),
        )
        while True:
            self.readline()
            if self.eof:
                raise ParseException("There are no articles...")
            about = self.PRELUDE_ABOUT_RE.match(self.line)
            if about:
                prelude.about += about.group(1) + self.parse_prelude_continuation()
                if self.eof:
                    raise ParseException("There are no articles...")
            # Deliberately no `continue` after the "about" branch: the
            # continuation reader stops on the first line that does not belong
            # to the "about" text, and that line still has to be classified.
            if self.SEPARATOR_RE.match(self.line):
                self.dom.append(prelude)
                return
            named = self.PRELUDE_NAME_RE.match(self.line)
            if named:
                prelude.name = named.group(1)
                continue
            for pattern, field in list_fields:
                hit = pattern.match(self.line)
                if hit:
                    getattr(prelude, field).append(hit.group(1))
                    break
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   256
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   257
    def parse_article(self):
        """Parse one article and append it to ``self.dom``.

        Expects ``self.line`` to be on the "__" separator.  Runs the three
        article stages in order (mandatory empty line, headwords, translation)
        and stores the result as a ``(words, tran)`` tuple.
        """
        # Drop whatever the previous article left behind.
        self.words = None
        self.tran = None
        for stage in (self.parse_empty_line, self.parse_headlines, self.parse_translation):
            stage()
        article = (self.words, self.tran)
        self.dom.append(article)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   265
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   266
    def parse_empty_line(self):
        """Read the next line and require it to be an empty line.

        Raises ParseException when the input has ended or the line has any
        content.
        """
        self.readline()
        is_blank = not self.eof and len(self.line) == 0
        if is_blank:
            return
        raise ParseException(""""__" delimiter should followed by empty line...""")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   270
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   271
    def parse_headlines(self):
        """Parse the headword section of an article into ``self.words``.

        Expects ``self.line`` to be on the empty line that precedes the first
        headword.  The first line read must be a headword; the following lines
        up to an empty line (or end of input) are either further headwords or
        attributes of the most recent one: a pronunciation, a variant
        attribute or a ";"-separated list of homophones.  Every headword is
        stored as a ``Headword`` object, in input order.

        Raises ParseException when the section is missing, does not start with
        a headword, repeats a pronunciation or homophone list for the same
        headword, or contains a line that matches none of the patterns.
        """
        self.words = []
        self.readline()
        if self.eof:
            raise ParseException("""There are no definition after "__" delimiter...""")
        first = self.HEADWORD_RE.match(self.line)
        if first is None:
            raise ParseException("""There are no headword after "__" delimiter...""")
        # State of the headword currently being collected.
        word, pron, attrs, homo = first.group(1), None, set(), None
        while True:
            self.readline()
            if self.eof or len(self.line) == 0:
                break
            text = self.line

            found = self.HEADWORD_RE.match(text)
            if found is not None:
                # A new headword closes the one collected so far.
                if word is None:
                    raise ParseException("""Didn't match previous headword...""")
                self.words.append(Headword(word, pron, attrs, homo = homo))
                word, pron, attrs, homo = found.group(1), None, set(), None
                continue

            found = self.HEADWORD_PRON_RE.match(text)
            if found is not None:
                if pron is not None:
                    raise ParseException("""Pronunciation is redefined...""")
                pron = found.group(1)
                continue

            found = self.HEADWORD_VAR_RE.match(text)
            if found is not None:
                attrs.add(found.group(1))
                continue

            found = self.HEADWORD_HOMO_RE.match(text)
            if found is not None:
                if homo is not None:
                    raise ParseException("""Homophones are redefined...""")
                homo = [item.strip() for item in found.group(1).split(";")]
                continue

            raise ParseException("""Line is not a headword or translation or headword attribute...""")
        # Flush the last headword of the section.
        self.words.append(Headword(word, pron, attrs, homo))
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   316
412
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   317
    def parse_translation_continuation(self):
        """Collect the continuation lines that follow a translation line.

        Reads lines until end of input or the first line not matched by
        ``CONT_RE``; that line is left in ``self.line`` for the caller.  Each
        continuation contributes a single space plus its captured text, so the
        continuation is folded onto one line.

        Returns the collected text ("" when there is no continuation).
        """
        pieces = []
        while True:
            self.readline()
            cont = None if self.eof else self.CONT_RE.match(self.line)
            if cont is None:
                return "".join(pieces)
            pieces.append(" " + cont.group(1))
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   328
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   329
    def parse_translation(self):
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   330
        senses = []
530
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   331
        sense = None
412
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   332
        read = True
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   333
        while True:
412
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   334
            if read:
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   335
                self.readline()
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   336
            read = True
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   337
            if self.eof:
659
3d4ea0a5928f Fix: last sense in last article is skipped.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 647
diff changeset
   338
                if sense:
3d4ea0a5928f Fix: last sense in last article is skipped.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 647
diff changeset
   339
                    senses.append(sense)
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   340
                break
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   341
            m = self.SEPARATOR_RE.match(self.line)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   342
            if m is not None:
530
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   343
                if sense:
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   344
                    senses.append(sense)
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   345
                break
757
5417f2102dc5 Switch to built-in `re` Python module over `regex`.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 740
diff changeset
   346
            if len(self.line) == 0:
530
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   347
                if sense:
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   348
                    senses.append(sense)
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   349
                sense = None
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   350
                continue
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   351
            m = self.TRANSL_POS_RE.match(self.line)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   352
            if m is not None:
530
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   353
                if sense is not None:
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   354
                    raise ParseException("""Each translation should have only one part of speech marker...""")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   355
                pos = m.group(0)
530
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   356
                sense = Sense(pos)
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   357
                continue
530
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   358
            if not sense:
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   359
                raise ParseException("""Missing part of speech marker...""")
1006
b1f11eff7c70 Added support for countable.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 984
diff changeset
   360
            m = self.CNT_RE.match(self.line)
b1f11eff7c70 Added support for countable.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 984
diff changeset
   361
            if m is not None:
b1f11eff7c70 Added support for countable.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 984
diff changeset
   362
                sense.set_countable(m.group(1))
b1f11eff7c70 Added support for countable.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 984
diff changeset
   363
                continue
412
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   364
            m = self.TOPIC_RE.match(self.line)
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   365
            if m is not None:
530
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   366
                topics = m.group(1).split(";")
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   367
                for topic in topics:
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   368
                    topic = topic.strip()
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   369
                    if len(topic) == 0:
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   370
                        raise ParseException("""Empty topic...""")
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   371
                    sense.add_topic(topic)
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   372
                continue
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   373
            m = self.SYN_RE.match(self.line)
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   374
            if m is not None:
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   375
                syns = m.group(1).split(";")
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   376
                for syn in syns:
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   377
                    syn = syn.strip()
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   378
                    if len(syn) == 0:
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   379
                        raise ParseException("""Empty synonym...""")
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   380
                    sense.add_syn(syn)
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   381
                continue
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   382
            m = self.ANT_RE.match(self.line)
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   383
            if m is not None:
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   384
                ants = m.group(1).split(";")
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   385
                for ant in ants:
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   386
                    ant = ant.strip()
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   387
                    if len(ant) == 0:
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   388
                        raise ParseException("""Empty antonym...""")
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   389
                    sense.add_ant(ant)
412
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   390
                continue
565
ac68f2680ea0 Add syntax to add related words. Add separators between ant/syn/rel in
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 554
diff changeset
   391
            m = self.REL_RE.match(self.line)
ac68f2680ea0 Add syntax to add related words. Add separators between ant/syn/rel in
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 554
diff changeset
   392
            if m is not None:
ac68f2680ea0 Add syntax to add related words. Add separators between ant/syn/rel in
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 554
diff changeset
   393
                rels = m.group(1).split(";")
ac68f2680ea0 Add syntax to add related words. Add separators between ant/syn/rel in
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 554
diff changeset
   394
                for rel in rels:
ac68f2680ea0 Add syntax to add related words. Add separators between ant/syn/rel in
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 554
diff changeset
   395
                    rel = rel.strip()
ac68f2680ea0 Add syntax to add related words. Add separators between ant/syn/rel in
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 554
diff changeset
   396
                    if len(rel) == 0:
ac68f2680ea0 Add syntax to add related words. Add separators between ant/syn/rel in
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 554
diff changeset
   397
                        raise ParseException("""Empty relation...""")
ac68f2680ea0 Add syntax to add related words. Add separators between ant/syn/rel in
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 554
diff changeset
   398
                    sense.add_rel(rel)
ac68f2680ea0 Add syntax to add related words. Add separators between ant/syn/rel in
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 554
diff changeset
   399
                continue
618
6ad7203ac9dc Add support for hypernyms and hyponyms.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 594
diff changeset
   400
            m = self.HYPER_RE.match(self.line)
6ad7203ac9dc Add support for hypernyms and hyponyms.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 594
diff changeset
   401
            if m is not None:
6ad7203ac9dc Add support for hypernyms and hyponyms.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 594
diff changeset
   402
                hypers = m.group(1).split(";")
6ad7203ac9dc Add support for hypernyms and hyponyms.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 594
diff changeset
   403
                for hyper in hypers:
6ad7203ac9dc Add support for hypernyms and hyponyms.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 594
diff changeset
   404
                    hyper = hyper.strip()
6ad7203ac9dc Add support for hypernyms and hyponyms.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 594
diff changeset
   405
                    if len(hyper) == 0:
6ad7203ac9dc Add support for hypernyms and hyponyms.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 594
diff changeset
   406
                        raise ParseException("""Empty hypernym...""")
6ad7203ac9dc Add support for hypernyms and hyponyms.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 594
diff changeset
   407
                    sense.add_hyper(hyper)
6ad7203ac9dc Add support for hypernyms and hyponyms.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 594
diff changeset
   408
                continue
6ad7203ac9dc Add support for hypernyms and hyponyms.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 594
diff changeset
   409
            m = self.HYPO_RE.match(self.line)
6ad7203ac9dc Add support for hypernyms and hyponyms.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 594
diff changeset
   410
            if m is not None:
6ad7203ac9dc Add support for hypernyms and hyponyms.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 594
diff changeset
   411
                hypos = m.group(1).split(";")
6ad7203ac9dc Add support for hypernyms and hyponyms.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 594
diff changeset
   412
                for hypo in hypos:
6ad7203ac9dc Add support for hypernyms and hyponyms.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 594
diff changeset
   413
                    hypo = hypo.strip()
6ad7203ac9dc Add support for hypernyms and hyponyms.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 594
diff changeset
   414
                    if len(hypo) == 0:
6ad7203ac9dc Add support for hypernyms and hyponyms.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 594
diff changeset
   415
                        raise ParseException("""Empty hyponym...""")
6ad7203ac9dc Add support for hypernyms and hyponyms.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 594
diff changeset
   416
                    sense.add_hypo(hypo)
6ad7203ac9dc Add support for hypernyms and hyponyms.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 594
diff changeset
   417
                continue
984
73d6e2631338 Added support for collocations' reference.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 937
diff changeset
   418
            m = self.COL_RE.match(self.line)
73d6e2631338 Added support for collocations' reference.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 937
diff changeset
   419
            if m is not None:
73d6e2631338 Added support for collocations' reference.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 937
diff changeset
   420
                cols = m.group(1).split(";")
73d6e2631338 Added support for collocations' reference.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 937
diff changeset
   421
                for col in cols:
73d6e2631338 Added support for collocations' reference.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 937
diff changeset
   422
                    col = col.strip()
73d6e2631338 Added support for collocations' reference.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 937
diff changeset
   423
                    if len(col) == 0:
73d6e2631338 Added support for collocations' reference.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 937
diff changeset
   424
                        raise ParseException("""Empty collocations...""")
73d6e2631338 Added support for collocations' reference.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 937
diff changeset
   425
                    sense.add_col(col)
73d6e2631338 Added support for collocations' reference.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 937
diff changeset
   426
                continue
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   427
            m = self.TRANSL_RE.match(self.line)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   428
            if m is not None:
530
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   429
                sense.add_tr((m.group(1), m.group(2) + self.parse_translation_continuation()))
412
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   430
                read = False
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   431
                continue
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   432
            m = self.TRANSL_EX_RE.match(self.line)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   433
            if m is not None:
530
91771594bc8b Make storage for topics, antonyms and synonyms. Require pos marker.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 527
diff changeset
   434
                sense.add_ex((m.group(1), m.group(2) + self.parse_translation_continuation()))
412
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   435
                read = False
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   436
                continue
566
0bba61492c37 Add syntax for glossary/explanation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 565
diff changeset
   437
            m = self.TRANSL_GLOS_RE.match(self.line)
0bba61492c37 Add syntax for glossary/explanation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 565
diff changeset
   438
            if m is not None:
0bba61492c37 Add syntax for glossary/explanation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 565
diff changeset
   439
                sense.add_glos((m.group(1), m.group(2) + self.parse_translation_continuation()))
0bba61492c37 Add syntax for glossary/explanation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 565
diff changeset
   440
                read = False
0bba61492c37 Add syntax for glossary/explanation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 565
diff changeset
   441
                continue
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   442
            raise ParseException("""Uknown syntax...""")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   443
        self.tran = senses
594
910efcf51ac0 Fix: regex doesn't match whole line. Add Parser representation to simplify debugging.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 593
diff changeset
   444
910efcf51ac0 Fix: regex doesn't match whole line. Add Parser representation to simplify debugging.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 593
diff changeset
   445
    # Debugging aid: render the object as "<ClassName; line=...>", where
    # ClassName is the runtime type's name and the "line" part is
    # repr(self.line), i.e. the line currently held by this instance.
    def __repr__(self):
910efcf51ac0 Fix: regex doesn't match whole line. Add Parser representation to simplify debugging.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 593
diff changeset
   446
        return u"<{:s}; line={:s}>".format(type(self).__name__, repr(self.line))