py/gadict.py
author Oleksandr Gavenko <gavenkoa@gmail.com>
Fri, 15 Apr 2016 18:25:18 +0300
changeset 442 50c70c5dbce3
parent 432 b3a78fc20b31
child 446 b628292c6d48
permissions -rw-r--r--
Add "abbr" word variance.
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
406
f0ac87e10d9a Fix warnings from pylint/pyflakes.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 402
diff changeset
     1
"""
f0ac87e10d9a Fix warnings from pylint/pyflakes.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 402
diff changeset
     2
gadict dictionary format parser.
f0ac87e10d9a Fix warnings from pylint/pyflakes.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 402
diff changeset
     3
"""
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     4
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     5
import regex
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     6
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     7
402
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
     8
class Prelude:
    """Dictionary metainfo structure.

    Attributes:
        name: dictionary name, or None when the prelude does not define one.
        about: free-form description text ("" when absent).
        urls: list of URL strings.
        authors: list of author strings.
        licences: list of licence/terms strings.
    """

    def __init__(self):
        self.name = None
        self.about = ""
        # The lists must be created per instance: as class attributes they
        # would be shared, so appending to one prelude would silently change
        # every other prelude (including ones from later parses).
        self.urls = []
        self.authors = []
        self.licences = []
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
    15
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
    16
399
a6a7036f3c6f File name is not available in parser. Move error printing to writer.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 394
diff changeset
    17
class ParseException(Exception):
    """Error raised when the input does not follow the gadict format.

    Derives from ``Exception`` (not ``BaseException``) so that it behaves
    like an ordinary application error and is not confused with
    interpreter-exit signals such as ``KeyboardInterrupt``.

    Attributes:
        msg: description of the problem.
        lineno: number of the offending line, or None when unknown.
        line: text of the offending line, or None when unknown.
    """

    def __init__(self, msg, lineno=None, line=None):
        # Hand the message to the base class so ``args`` and the default
        # traceback output carry it.
        super().__init__(msg)
        self.msg = msg
        self.lineno = lineno
        self.line = line

    def __repr__(self):
        """Return the message, prefixed with ":<lineno>:" when it is known.

        No file name is included; the caller is expected to prepend it.
        """
        if self.lineno is None:
            return self.msg
        elif self.line is None:
            return ":{:d}:{:s}".format(self.lineno, self.msg)
        else:
            return ":{:d}: {:s}\nLINE: {:s}".format(self.lineno, self.msg, self.line)

    def __str__(self):
        """Return the same text as ``repr()`` so ``str(ex)`` is never empty."""
        return self.__repr__()
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    32
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    33
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    34
class Parser:
406
f0ac87e10d9a Fix warnings from pylint/pyflakes.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 402
diff changeset
    35
    """gadict dictionary format parser."""
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    36
432
b3a78fc20b31 Add parsing comment syntax.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 422
diff changeset
    37
    # Comment line: "# " at the very start of the line.
    COMMENT_RE = regex.compile(r"^# ")

    # Article delimiter: a line consisting of exactly "__".
    SEPARATOR_RE = regex.compile(r"^__$")
    # Headword: an unindented line starting with a letter.
    HEADWORD_RE = regex.compile(r"^(\p{L}.*)$")
    # Indented headword attribute (word variance marker).
    HEADWORD_VAR_RE = regex.compile(r"^ +(s|pl|v[123]|male|female|comp|super|abbr)$")
    # Indented pronunciation in square brackets.
    HEADWORD_PRON_RE = regex.compile(r"^ +\[([\p{L}' ]+)\]$")
    # Part-of-speech marker occupying the whole line.  The alternatives are
    # grouped so that "^" and "$" apply to every one of them; without the
    # group any line merely starting with "n", "v", ... matched and "phr.v"
    # was reported as "phr".
    TRANSL_POS_RE = regex.compile(r"^(?:n|pron|adj|v|adv|prep|conj|num|int|phr|phr\.v|abbr|prefix)$")
    # "<lang>: text" line.
    TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(].*)$")
    # "<lang>> text" line.
    TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> (\p{L}.*)$")
    # "topic: text" line.
    TOPIC_RE = regex.compile(r"^topic: (\p{L}.*)$")

    # Continuation line: one or more leading spaces followed by the payload.
    CONT_RE = regex.compile(r"^ +(.*)")

    # Unicode separator characters at the end of a line.
    TRAILING_SPACES_RE = regex.compile(r"\p{Z}+$")

    # Prelude (metainfo) fields.
    PRELUDE_NAME_RE = regex.compile(r"^name: (.*)")
    PRELUDE_URL_RE = regex.compile(r"^url: (.*)")
    PRELUDE_AUTHOR_RE = regex.compile(r"^by: (.*)")
    PRELUDE_LICENSE_RE = regex.compile(r"^term: (.*)")
    PRELUDE_ABOUT_RE = regex.compile(r"^about: ?(.*)")
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
    57
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    58
    def __init__(self):
        """Create a parser with empty state; input is supplied to `parse()`."""
        # Declare every attribute the other methods read or write, so the
        # object has a well-defined state even before `parse()` is called.
        self.stream = None   # input object, set by parse()
        self.line = ""       # most recently read raw line
        self.lineno = 0      # count of lines read so far
        self.eof = False     # True once readline() hit end of input
        self.dom = []        # parse result being built
        self.words = None    # headwords of the article in progress
        self.tran = None     # translation slot of the article in progress
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    60
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    61
    def readline(self):
        """Advance to the next line of `self.stream` that is not a comment.

        The raw line (with its line terminator, if any) is stored in
        `self.line`.  `self.eof` becomes True when the stream returns an
        empty string.  `self.lineno` is incremented for every line actually
        read, skipped comment lines included.

        Raises:
            ParseException: if the line ends with separator characters
                (trailing whitespace).  The check runs before the comment
                check, so it applies to comment lines as well.
        """
        while True:
            self.line = self.stream.readline()
            self.eof = not self.line
            if not self.eof:
                self.lineno += 1
            if self.TRAILING_SPACES_RE.search(self.line):
                raise ParseException("Trailing spaces detected...\n")
            if not self.COMMENT_RE.search(self.line):
                break
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    72
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    73
    def parse(self, stream):
        """Parse a complete gadict document.

        Args:
            stream: object providing ``readline()``; presumably a text-mode
                file, since lines are matched against string patterns.

        Returns:
            The list `self.dom`: the prelude object followed by one
            ``(words, tran)`` tuple per article.

        Raises:
            ParseException: on malformed input.  The exception carries the
                current line number and line text; the original error is
                attached as its ``__cause__``.
        """
        self.lineno = 0
        self.stream = stream
        self.dom = []
        try:
            self.parse_prelude()
            while not self.eof:
                self.parse_article()
        except ParseException as ex:
            # Re-raise with position information.  Explicit chaining keeps the
            # traceback from claiming a second, unrelated failure happened
            # while the first was being handled.
            raise ParseException(ex.msg, self.lineno, self.line) from ex
        return self.dom
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    84
412
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
    85
    def parse_prelude_continuation(self):
        """Gather the continuation lines that follow a prelude field.

        Reads lines until end of input or until a line that is neither
        indented nor empty.  Each indented line contributes a newline plus
        its text without the leading spaces; each empty line contributes a
        bare newline.  The line that stopped the scan stays in `self.line`.

        Returns:
            The accumulated text ("" when there is no continuation).
        """
        chunks = []
        while True:
            self.readline()
            if self.eof:
                break
            indented = self.CONT_RE.match(self.line)
            if indented is not None:
                chunks.append("\n" + indented.group(1))
                continue
            if len(self.line) != 1:
                # Neither a continuation nor an empty line: stop here.
                break
            chunks.append("\n")
        return "".join(chunks)
b47698d5ccab Parse dictionary metainfo.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 400
diff changeset
    98
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    99
    def parse_prelude(self):
        """Read the dictionary prelude up to the first "__" delimiter.

        Recognised fields are stored in a new `Prelude`, which is appended to
        `self.dom`; lines matching no field are ignored.

        Raises:
            ParseException: if the input ends before a "__" delimiter.
        """
        prelude = Prelude()
        # Fields that may occur several times, paired with their target list.
        repeatable = (
            (self.PRELUDE_URL_RE, prelude.urls),
            (self.PRELUDE_AUTHOR_RE, prelude.authors),
            (self.PRELUDE_LICENSE_RE, prelude.licences),
        )
        while True:
            self.readline()
            if self.eof:
                raise ParseException("There are no articles...")
            about = self.PRELUDE_ABOUT_RE.match(self.line)
            if about:
                prelude.about += about.group(1) + self.parse_prelude_continuation()
                if self.eof:
                    raise ParseException("There are no articles...")
                # The continuation scan has already fetched the next line, so
                # fall through and examine it below.
            if self.SEPARATOR_RE.match(self.line):
                break
            name = self.PRELUDE_NAME_RE.match(self.line)
            if name:
                prelude.name = name.group(1)
                continue
            for pattern, bucket in repeatable:
                found = pattern.match(self.line)
                if found:
                    bucket.append(found.group(1))
                    break
        self.dom.append(prelude)
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   130
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   131
    def parse_article(self):
        """Parse one article and append ``(self.words, self.tran)`` to `self.dom`.

        Expects `self.line` to hold the "__" delimiter that opens the article.
        """
        self.words = self.tran = None
        stages = (self.parse_empty_line, self.parse_headlines, self.parse_translation)
        for stage in stages:
            stage()
        self.dom.append((self.words, self.tran))
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   139
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   140
    def parse_empty_line(self):
        """Read the next line and require it to be empty.

        Raises:
            ParseException: at end of input, or if the line holds anything
                besides a single character (the line terminator).
        """
        self.readline()
        is_blank = not self.eof and len(self.line) == 1
        if not is_blank:
            raise ParseException('"__" delimiter should followed by empty line...')
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   144
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   145
    def parse_headlines(self):
        """Read the headword section of an article into `self.words`.

        Expects `self.line` to hold the empty line preceding the first
        headword.  Reading stops at the next empty line or at end of input.
        `self.words` maps each headword to a ``(pron, attrs)`` tuple, where
        `pron` is the pronunciation string or None and `attrs` is a set of
        attribute markers.

        Raises:
            ParseException: if the section does not start with a headword,
                if a headword gets a second pronunciation, or if a line is
                not a headword, pronunciation or attribute.
        """
        self.words = {}
        self.readline()
        if self.eof:
            raise ParseException("""There are no definition after "__" delimiter...""")
        m = self.HEADWORD_RE.match(self.line)
        if m is None:
            raise ParseException("""There are no headword after "__" delimiter...""")
        word = m.group(1)
        pron = None
        attrs = set()
        while True:
            self.readline()
            if self.eof or len(self.line) == 1:
                break
            m = self.HEADWORD_RE.match(self.line)
            if m is not None:
                # A new headword starts: store the finished one first.
                # (`word` always holds a matched string at this point, so no
                # None check is needed.)
                self.words[word] = (pron, attrs)
                word = m.group(1)
                pron = None
                attrs = set()
                continue
            m = self.HEADWORD_PRON_RE.match(self.line)
            if m is not None:
                if pron is not None:
                    raise ParseException("""Pronunciation is redefined...""")
                pron = m.group(1)
                continue
            m = self.HEADWORD_VAR_RE.match(self.line)
            if m is not None:
                attrs.add(m.group(1))
                continue
            raise ParseException("""Line is not a headword or translation or headword attribute...""")
        # Store the last headword of the section.
        self.words[word] = (pron, attrs)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   182
412
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   183
    def parse_translation_continuation(self):
        """Gather the indented continuation lines that follow the current line.

        Reads lines until end of input or the first line that is not
        indented; that line stays in `self.line`.  Each indented line
        contributes a newline plus its text without the leading spaces.

        Returns:
            The accumulated text ("" when there is no continuation).
        """
        pieces = []
        while True:
            self.readline()
            if self.eof:
                break
            indented = self.CONT_RE.match(self.line)
            if indented is None:
                break
            pieces.append("\n" + indented.group(1))
        return "".join(pieces)
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   194
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   195
    def parse_translation(self):
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   196
        senses = []
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   197
        pos = None
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   198
        tr = []
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   199
        ex = []
412
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   200
        read = True
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   201
        while True:
412
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   202
            if read:
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   203
                self.readline()
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   204
            read = True
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   205
            if self.eof:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   206
                break
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   207
            m = self.SEPARATOR_RE.match(self.line)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   208
            if m is not None:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   209
                break
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   210
            if len(self.line) == 1:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   211
                senses.append((pos, tr, ex))
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   212
                pos = None
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   213
                tr = []
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   214
                ex = []
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   215
                continue
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   216
            m = self.TRANSL_POS_RE.match(self.line)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   217
            if m is not None:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   218
                if pos is not None:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   219
                    raise ParseException("""Each translation should have only one part of speech marker...""")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   220
                pos = m.group(0)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   221
                continue
412
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   222
            m = self.TOPIC_RE.match(self.line)
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   223
            if m is not None:
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   224
                # TODO
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   225
                continue
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   226
            m = self.TRANSL_RE.match(self.line)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   227
            if m is not None:
412
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   228
                tr.append((m.group(1), m.group(2) + self.parse_translation_continuation()))
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   229
                read = False
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   230
                continue
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   231
            m = self.TRANSL_EX_RE.match(self.line)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   232
            if m is not None:
412
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   233
                ex.append((m.group(1), m.group(2) + self.parse_translation_continuation()))
ece60575a96a Adopt to parse VOA dictionary: add topics support and translation continuation.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 406
diff changeset
   234
                read = False
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   235
                continue
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   236
            raise ParseException("""Uknown syntax...""")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   237
        if len(tr) > 0:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   238
            senses.append((pos, tr, ex))
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   239
        self.tran = senses