gadict.py
author Oleksandr Gavenko <gavenkoa@gmail.com>
Thu, 24 Mar 2016 01:11:47 +0200
changeset 385 18284ce77c7a
permissions -rw-r--r--
gadict format parser.
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
385
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     1
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     2
import io
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     3
import sys
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     4
# import re
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     5
import regex
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     6
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     7
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     8
# fgadict = "gadict_en-ru+ua.gadict"
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     9
fgadict = None
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    10
fnout = None
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    11
if len(sys.argv) >= 2:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    12
    fgadict = sys.argv[1]
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    13
if len(sys.argv) >= 3:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    14
    fnout = sys.argv[2]
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    15
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    16
fin = io.open(fgadict, mode='r', buffering=1, encoding="utf-8")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    17
if fnout is None:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    18
    fout = sys.stdout
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    19
else:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    20
    fout = open(fnout, "w")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    21
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    22
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    23
class ParseException(Exception):
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    24
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    25
    def __init__(self, msg):
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    26
        self.msg = msg
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    27
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    28
    def __repr__(self):
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    29
        return self.msg
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    30
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    31
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    32
class Parser:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    33
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    34
    SEPARATOR_RE = regex.compile(r"^__$")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    35
    HEADWORD_RE = regex.compile(r"^(\p{L}.*)$")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    36
    HEADWORD_VAR_RE = regex.compile(r"^ +(s|pl|v[123]|male|female|comp|super)$")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    37
    HEADWORD_PRON_RE = regex.compile(r"^ +\[([\p{L}' ]+)\]$")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    38
    TRANSL_POS_RE = regex.compile(r"^n|pron|adj|v|adv|prep|conj|num|int|phr\.v|abbr$")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    39
    TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(][\p{L}\p{P}~ ]*)$")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    40
    TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> (\p{L}.*)$")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    41
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    42
    TRAILING_SPACES_RE = regex.compile(r"\p{Z}+$")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    43
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    44
    def __init__(self):
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    45
        pass
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    46
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    47
    def readline(self):
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    48
        self.line = self.stream.readline()
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    49
        self.eof = len(self.line) == 0
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    50
        if not self.eof:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    51
            self.lineno += 1
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    52
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    53
    def parse(self, stream):
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    54
        self.lineno = 0
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    55
        self.stream = stream
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    56
        self.dom = []
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    57
        try:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    58
            self.parse_prelude()
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    59
            while not self.eof:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    60
                self.parse_article()
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    61
        except ParseException as ex:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    62
            if self.TRAILING_SPACES_RE.match(self.line):
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    63
                fout.write("{:s}:{:d}: {:s}".format(fgadict, self.lineno, "Traling spaces detected...\n"))
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    64
            fout.write("{:s}:{:d}: {:s}\nLINE: {:s}\n".format(fgadict, self.lineno, str(ex), self.line))
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    65
            raise Exception(ex)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    66
        return self.dom
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    67
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    68
    def parse_prelude(self):
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    69
        """Read dictionary prelude until first "__" delimiter."""
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    70
        while True:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    71
            self.readline()
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    72
            if self.eof:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    73
                raise ParseException("There are no articles...")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    74
            if self.SEPARATOR_RE.match(self.line):
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    75
                break
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    76
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    77
    def parse_article(self):
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    78
        """Try to match article until next "__" delimiter. Assume that `self.line` point to "__" delimiter."""
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    79
        self.words = None
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    80
        self.tran = None
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    81
        self.parse_empty_line()
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    82
        self.parse_headlines()
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    83
        self.parse_translation()
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    84
        self.dom.append((self.words, self.tran))
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    85
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    86
    def parse_empty_line(self):
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    87
        self.readline()
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    88
        if self.eof or len(self.line) != 1:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    89
            raise ParseException(""""__" delimiter should followed by empty line...""")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    90
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    91
    def parse_headlines(self):
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    92
        """Try to match word variations with attributed. Assume that `self.line` on preceding empty line."""
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    93
        self.words = {}
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    94
        self.readline()
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    95
        if self.eof:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    96
            raise ParseException("""There are no definition after "__" delimiter...""")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    97
        m = self.HEADWORD_RE.match(self.line)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    98
        if m is None:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    99
            raise ParseException("""There are no headword after "__" delimiter...""")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   100
        word = m.group(1)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   101
        pron = None
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   102
        attrs = set()
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   103
        while True:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   104
            self.readline()
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   105
            if self.eof or len(self.line) == 1:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   106
                break
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   107
            m = self.HEADWORD_RE.match(self.line)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   108
            if m is not None:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   109
                if word is None:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   110
                    raise ParseException("""Didn't match previous headword...""")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   111
                self.words[word] = (pron, attrs)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   112
                word = m.group(1)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   113
                pron = None
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   114
                attrs = set()
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   115
                continue
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   116
            m = self.HEADWORD_PRON_RE.match(self.line)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   117
            if m is not None:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   118
                if pron is not None:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   119
                    raise ParseException("""Pronunciation is redefined...""")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   120
                pron = m.group(1)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   121
                continue
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   122
            m = self.HEADWORD_VAR_RE.match(self.line)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   123
            if m is not None:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   124
                attrs.add(m.group(1))
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   125
                continue
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   126
            raise ParseException("""Line is not headword or translation or headword attribute...""")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   127
        self.words[word] = (pron, attrs)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   128
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   129
    def parse_translation(self):
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   130
        senses = []
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   131
        pos = None
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   132
        tr = []
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   133
        ex = []
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   134
        while True:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   135
            self.readline()
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   136
            if self.eof:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   137
                break
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   138
            m = self.SEPARATOR_RE.match(self.line)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   139
            if m is not None:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   140
                break
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   141
            if len(self.line) == 1:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   142
                senses.append((pos, tr, ex))
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   143
                pos = None
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   144
                tr = []
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   145
                ex = []
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   146
                continue
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   147
            m = self.TRANSL_POS_RE.match(self.line)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   148
            if m is not None:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   149
                if pos is not None:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   150
                    raise ParseException("""Each translation should have only one part of speech marker...""")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   151
                pos = m.group(0)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   152
                continue
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   153
            m = self.TRANSL_RE.match(self.line)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   154
            if m is not None:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   155
                tr.append((m.group(1), m.group(2)))
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   156
                continue
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   157
            m = self.TRANSL_EX_RE.match(self.line)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   158
            if m is not None:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   159
                ex.append((m.group(1), m.group(2)))
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   160
                continue
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   161
            raise ParseException("""Uknown syntax...""")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   162
        if len(tr) > 0:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   163
            senses.append((pos, tr, ex))
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   164
        self.tran = senses
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   165
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   166
parser = Parser()
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   167
dom = parser.parse(fin)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   168
fin.close()
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   169
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   170
for idx in range(1, len(dom)):
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   171
    article = dom[idx]
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   172
    fout.write("_____\n\n")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   173
    title = "; ".join(article[0].keys())
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   174
    fout.write(title)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   175
    fout.write("\n\n")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   176
    for (word, (pron, attrs)) in article[0].items():
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   177
        if word == "approach":
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   178
            fout.write(str(article[0]))
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   179
        fout.write("  ")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   180
        fout.write(word)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   181
        fout.write("\n")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   182
        if pron is not None:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   183
            fout.write("    [")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   184
            fout.write(pron)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   185
            fout.write("]\n")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   186
        if len(attrs) > 0:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   187
            fout.write("    ")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   188
            l = list(attrs)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   189
            l.sort()
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   190
            fout.write(", ".join(l))
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   191
            fout.write("\n")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   192
    fout.write("\n")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   193
    for (pos, trs, exs) in article[1]:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   194
        fout.write("  ")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   195
        if pos is not None:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   196
            fout.write("⟨")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   197
            fout.write(pos)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   198
            fout.write("⟩ ")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   199
        for (lang, tr) in trs:
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   200
            if lang == "ru":
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   201
                fout.write(tr)
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   202
                break
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   203
        fout.write("\n")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   204
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   205
    # fout.write(str(article[0])+"\n")
18284ce77c7a gadict format parser.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
   206