py/gadict.py
changeset 393 2756a6deca7e
parent 385 18284ce77c7a
child 394 4d45194c71b6
equal deleted inserted replaced
392:4657b44ad9af 393:2756a6deca7e
       
     1 
       
     2 import io
       
     3 import sys
       
     4 # import re
       
     5 import regex
       
     6 
       
     7 
       
     8 # fgadict = "gadict_en-ru+ua.gadict"
       
     9 fgadict = None
       
    10 fnout = None
       
    11 if len(sys.argv) >= 2:
       
    12     fgadict = sys.argv[1]
       
    13 if len(sys.argv) >= 3:
       
    14     fnout = sys.argv[2]
       
    15 
       
    16 fin = io.open(fgadict, mode='r', buffering=1, encoding="utf-8")
       
    17 if fnout is None:
       
    18     fout = sys.stdout
       
    19 else:
       
    20     fout = open(fnout, "w")
       
    21 
       
    22 
       
    23 class ParseException(Exception):
       
    24 
       
    25     def __init__(self, msg):
       
    26         self.msg = msg
       
    27 
       
    28     def __repr__(self):
       
    29         return self.msg
       
    30 
       
    31 
       
    32 class Parser:
       
    33 
       
    34     SEPARATOR_RE = regex.compile(r"^__$")
       
    35     HEADWORD_RE = regex.compile(r"^(\p{L}.*)$")
       
    36     HEADWORD_VAR_RE = regex.compile(r"^ +(s|pl|v[123]|male|female|comp|super)$")
       
    37     HEADWORD_PRON_RE = regex.compile(r"^ +\[([\p{L}' ]+)\]$")
       
    38     TRANSL_POS_RE = regex.compile(r"^n|pron|adj|v|adv|prep|conj|num|int|phr\.v|abbr$")
       
    39     TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(][\p{L}\p{P}~ ]*)$")
       
    40     TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> (\p{L}.*)$")
       
    41 
       
    42     TRAILING_SPACES_RE = regex.compile(r"\p{Z}+$")
       
    43 
       
    44     def __init__(self):
       
    45         pass
       
    46 
       
    47     def readline(self):
       
    48         self.line = self.stream.readline()
       
    49         self.eof = len(self.line) == 0
       
    50         if not self.eof:
       
    51             self.lineno += 1
       
    52 
       
    53     def parse(self, stream):
       
    54         self.lineno = 0
       
    55         self.stream = stream
       
    56         self.dom = []
       
    57         try:
       
    58             self.parse_prelude()
       
    59             while not self.eof:
       
    60                 self.parse_article()
       
    61         except ParseException as ex:
       
    62             if self.TRAILING_SPACES_RE.match(self.line):
       
    63                 fout.write("{:s}:{:d}: {:s}".format(fgadict, self.lineno, "Traling spaces detected...\n"))
       
    64             fout.write("{:s}:{:d}: {:s}\nLINE: {:s}\n".format(fgadict, self.lineno, str(ex), self.line))
       
    65             raise Exception(ex)
       
    66         return self.dom
       
    67 
       
    68     def parse_prelude(self):
       
    69         """Read dictionary prelude until first "__" delimiter."""
       
    70         while True:
       
    71             self.readline()
       
    72             if self.eof:
       
    73                 raise ParseException("There are no articles...")
       
    74             if self.SEPARATOR_RE.match(self.line):
       
    75                 break
       
    76 
       
    77     def parse_article(self):
       
    78         """Try to match article until next "__" delimiter. Assume that `self.line` point to "__" delimiter."""
       
    79         self.words = None
       
    80         self.tran = None
       
    81         self.parse_empty_line()
       
    82         self.parse_headlines()
       
    83         self.parse_translation()
       
    84         self.dom.append((self.words, self.tran))
       
    85 
       
    86     def parse_empty_line(self):
       
    87         self.readline()
       
    88         if self.eof or len(self.line) != 1:
       
    89             raise ParseException(""""__" delimiter should followed by empty line...""")
       
    90 
       
    91     def parse_headlines(self):
       
    92         """Try to match word variations with attributed. Assume that `self.line` on preceding empty line."""
       
    93         self.words = {}
       
    94         self.readline()
       
    95         if self.eof:
       
    96             raise ParseException("""There are no definition after "__" delimiter...""")
       
    97         m = self.HEADWORD_RE.match(self.line)
       
    98         if m is None:
       
    99             raise ParseException("""There are no headword after "__" delimiter...""")
       
   100         word = m.group(1)
       
   101         pron = None
       
   102         attrs = set()
       
   103         while True:
       
   104             self.readline()
       
   105             if self.eof or len(self.line) == 1:
       
   106                 break
       
   107             m = self.HEADWORD_RE.match(self.line)
       
   108             if m is not None:
       
   109                 if word is None:
       
   110                     raise ParseException("""Didn't match previous headword...""")
       
   111                 self.words[word] = (pron, attrs)
       
   112                 word = m.group(1)
       
   113                 pron = None
       
   114                 attrs = set()
       
   115                 continue
       
   116             m = self.HEADWORD_PRON_RE.match(self.line)
       
   117             if m is not None:
       
   118                 if pron is not None:
       
   119                     raise ParseException("""Pronunciation is redefined...""")
       
   120                 pron = m.group(1)
       
   121                 continue
       
   122             m = self.HEADWORD_VAR_RE.match(self.line)
       
   123             if m is not None:
       
   124                 attrs.add(m.group(1))
       
   125                 continue
       
   126             raise ParseException("""Line is not headword or translation or headword attribute...""")
       
   127         self.words[word] = (pron, attrs)
       
   128 
       
   129     def parse_translation(self):
       
   130         senses = []
       
   131         pos = None
       
   132         tr = []
       
   133         ex = []
       
   134         while True:
       
   135             self.readline()
       
   136             if self.eof:
       
   137                 break
       
   138             m = self.SEPARATOR_RE.match(self.line)
       
   139             if m is not None:
       
   140                 break
       
   141             if len(self.line) == 1:
       
   142                 senses.append((pos, tr, ex))
       
   143                 pos = None
       
   144                 tr = []
       
   145                 ex = []
       
   146                 continue
       
   147             m = self.TRANSL_POS_RE.match(self.line)
       
   148             if m is not None:
       
   149                 if pos is not None:
       
   150                     raise ParseException("""Each translation should have only one part of speech marker...""")
       
   151                 pos = m.group(0)
       
   152                 continue
       
   153             m = self.TRANSL_RE.match(self.line)
       
   154             if m is not None:
       
   155                 tr.append((m.group(1), m.group(2)))
       
   156                 continue
       
   157             m = self.TRANSL_EX_RE.match(self.line)
       
   158             if m is not None:
       
   159                 ex.append((m.group(1), m.group(2)))
       
   160                 continue
       
   161             raise ParseException("""Uknown syntax...""")
       
   162         if len(tr) > 0:
       
   163             senses.append((pos, tr, ex))
       
   164         self.tran = senses
       
   165 
       
   166 parser = Parser()
       
   167 dom = parser.parse(fin)
       
   168 fin.close()
       
   169 
       
   170 for idx in range(1, len(dom)):
       
   171     article = dom[idx]
       
   172     fout.write("_____\n\n")
       
   173     title = "; ".join(article[0].keys())
       
   174     fout.write(title)
       
   175     fout.write("\n\n")
       
   176     for (word, (pron, attrs)) in article[0].items():
       
   177         if word == "approach":
       
   178             fout.write(str(article[0]))
       
   179         fout.write("  ")
       
   180         fout.write(word)
       
   181         fout.write("\n")
       
   182         if pron is not None:
       
   183             fout.write("    [")
       
   184             fout.write(pron)
       
   185             fout.write("]\n")
       
   186         if len(attrs) > 0:
       
   187             fout.write("    ")
       
   188             l = list(attrs)
       
   189             l.sort()
       
   190             fout.write(", ".join(l))
       
   191             fout.write("\n")
       
   192     fout.write("\n")
       
   193     for (pos, trs, exs) in article[1]:
       
   194         fout.write("  ")
       
   195         if pos is not None:
       
   196             fout.write("⟨")
       
   197             fout.write(pos)
       
   198             fout.write("⟩ ")
       
   199         for (lang, tr) in trs:
       
   200             if lang == "ru":
       
   201                 fout.write(tr)
       
   202                 break
       
   203         fout.write("\n")
       
   204 
       
   205     # fout.write(str(article[0])+"\n")
       
   206