obsolete/conv-c5-to-gadict.py
changeset 319 75430afe1a43
equal deleted inserted replaced
318:d4767f21ca59 319:75430afe1a43
       
     1 """
       
     2 Put the *.dict-c5 files into the same directory as this script and run:
       
     3 
       
     4   $ python conv-c5-to-gadict.py
       
     5 
       
     6 The output is saved to the `conv.gadict` file.
       
     7 """
       
     8 
       
     9 import re
       
    10 
       
    11 ARTICLE_SEP = """
       
    12 _____
       
    13 
       
    14 """
       
    15 
       
    16 
       
    17 class Variance:
       
    18     def __init__(self, pron=None, attrs=None):
       
    19         self.pron = pron
       
    20         self.attrs = set()
       
    21         self.addAttrs(attrs)
       
    22 
       
    23     def addAttrs(self, attrs):
       
    24         if attrs is None:
       
    25             return
       
    26         if isinstance(attrs, str):
       
    27             self.attrs.add(attrs)
       
    28         elif isinstance(attrs, set):
       
    29             self.attrs.update(attrs)
       
    30         else:
       
    31             raise TypeError("Should be str or set...", type(attrs))
       
    32 
       
    33     def __repr__(self):
       
    34         return "<pron: " + self.pron + ">"
       
    35 
       
    36 
       
    37 class Variances:
       
    38     def __init__(self):
       
    39         self.store = {}
       
    40 
       
    41     def add(self, word, pron=None, attrs=None):
       
    42         """`word`is `str`, `pron` is a `str`, `atts` is a `list` of `str`."""
       
    43         if word not in self.store:
       
    44             self.store[word] = Variance()
       
    45         var = self.store[word]
       
    46         if pron is not None:
       
    47             if var.pron is not None:
       
    48                 print("two pronunciations detected!!", pron, var.pron)
       
    49                 # raise Exception("two pronunciations detected!!", pron, var.pron)
       
    50             var.pron = pron
       
    51         var.addAttrs(attrs)
       
    52 
       
    53     def __repr__(self):
       
    54         return repr(self.store)
       
    55 
       
    56 
       
    57 class Translation:
       
    58     def __init__(self, pos, tran):
       
    59         """`pos` is `str` like 'n' or 'v', `tran` is `str`."""
       
    60         self.pos = pos
       
    61         self.tran = tran
       
    62 
       
    63     def __repr__(self):
       
    64         return "<pos: " + self.pos + ", tran: " + self.tran + ">"
       
    65 
       
    66 
       
    67 class Article:
       
    68     def __init__(self):
       
    69         self.vars = Variances()
       
    70         self.trans = []
       
    71 
       
    72     def addVar(self, word, pron=None, attrs=None):
       
    73         """`var` is `list`"""
       
    74         self.vars.add(word, pron, attrs)
       
    75 
       
    76     def addTransl(self, pos, tran):
       
    77         """`trans` is `Translation`."""
       
    78         self.trans.append(Translation(pos, tran))
       
    79 
       
    80 
       
    81 DICT = {}
       
    82 
       
    83 f = open('gadict-irregular-verbs-en-ru.dict-c5')
       
    84 content = f.read()
       
    85 content = content.split(ARTICLE_SEP)
       
    86 content = iter(content)
       
    87 next(content)
       
    88 
       
    89 PRON_LINE_RE = re.compile(r'^  \[')
       
    90 PRON_RE = re.compile(r'\[([^]]+)]')
       
    91 
       
    92 V1_RE = re.compile(r"^  (?:inf\. )?([^/]+)/?(.*)?")
       
    93 V2_RE = re.compile(r"^  (?:p. )?([^/ ]+)/?(.*)?")
       
    94 V3_RE = re.compile(r"^  (?:p\.p\. )?([^/ ]+)/?(.*)?")
       
    95 V3_RE = re.compile(r"^  (?:p\.p\. )?([^/ ]+)/?(.*)?")
       
    96 RU_RE = re.compile(r"^  (.+)")
       
    97 
       
    98 for piece in content:
       
    99     article = Article()
       
   100     lines = piece.split("\n")
       
   101     headwords = lines[0].split("; ")
       
   102     for title in headwords:
       
   103         article.addVar(title)
       
   104     assert lines[1] == ""
       
   105     curr = 2
       
   106     line = lines[curr]
       
   107     if PRON_LINE_RE.match(line):
       
   108         curr += 1
       
   109         prons = PRON_RE.findall(line)
       
   110         if len(prons) != len(headwords):
       
   111             raise Exception("some prononsiation missing for", headwords, prons)
       
   112         for i in range(len(headwords)):
       
   113             article.addVar(headwords[i], pron=prons[i])
       
   114     line = lines[curr]
       
   115     m = V1_RE.match(line)
       
   116     if m:
       
   117         article.addVar(word=m.group(1), attrs="v1")
       
   118         if len(m.group(2)) > 0:
       
   119             article.addVar(word=m.group(2), attrs="v1")
       
   120     else:
       
   121         raise Exception("Can't match", line)
       
   122     curr += 1
       
   123     line = lines[curr]
       
   124     m = V2_RE.match(line)
       
   125     if m:
       
   126         article.addVar(m.group(1), attrs="v2")
       
   127         if len(m.group(2)) > 0:
       
   128             article.addVar(m.group(2), attrs="v2")
       
   129     else:
       
   130         raise Exception("Can't match", line)
       
   131     curr += 1
       
   132     line = lines[curr]
       
   133     m = V3_RE.match(line)
       
   134     if m:
       
   135         article.addVar(m.group(1), attrs="v3")
       
   136         if len(m.group(2)) > 0:
       
   137             article.addVar(m.group(2), attrs="v3")
       
   138     else:
       
   139         raise Exception("Can't match", line)
       
   140     try:
       
   141         curr += 1
       
   142         line = lines[curr]
       
   143     except IndexError:
       
   144         raise IndexError("No translation after", lines[curr-1], lines)
       
   145     m = RU_RE.match(line)
       
   146     if m:
       
   147         article.addTransl(pos="v", tran=m.group(1))
       
   148     else:
       
   149         raise Exception("Can't match", line)
       
   150     if len(lines) != curr+1:
       
   151         raise Exception("Unknown line", line, lines, len(lines), curr)
       
   152     DICT[headwords[0]] = article
       
   153 
       
   154 f.close()
       
   155 
       
   156 HEADWORD_RE = re.compile(r"^([a-zA-Z][a-z -]*)$")
       
   157 PRON_RE = re.compile(r"^  \[([^]]+)\]$")
       
   158 TRANSL_RE = re.compile(r"^  (?:[1-9]\) )?(.+)$")
       
   159 EMPTY_PRON_RE = re.compile(r"^  \[\]$")
       
   160 
       
   161 
       
   162 def parse_entry(entry, pos):
       
   163     lines = entry.split("\n")
       
   164     m = HEADWORD_RE.match(lines[0])
       
   165     if m is None:
       
   166         raise Exception("Fail to parse headword", lines)
       
   167     if len(lines[1]) != 0:
       
   168         raise Exception("Headword is not separated from article", lines)
       
   169     headword = m.group(1)
       
   170     if headword in DICT:
       
   171         article = DICT[headword]
       
   172     else:
       
   173         article = Article()
       
   174         DICT[headword] = article
       
   175     curr = 2
       
   176     line = lines[curr]
       
   177     m = PRON_RE.match(line)
       
   178     pron = None
       
   179     if m is not None:
       
   180         curr += 1
       
   181         pron = m.group(1)
       
   182         if len(pron) == 0:
       
   183             pron = None
       
   184     article.addVar(headword, pron=pron)
       
   185     while curr < len(lines):
       
   186         line = lines[curr]
       
   187         curr += 1
       
   188         if EMPTY_PRON_RE.match(line):
       
   189             continue
       
   190         m = TRANSL_RE.match(line)
       
   191         if m:
       
   192             article.addTransl(pos=pos, tran=m.group(1))
       
   193 
       
   194 
       
   195 def parse_file(fn, pos):
       
   196     f = open(fn)
       
   197     content = f.read()
       
   198     content = content.split(ARTICLE_SEP)
       
   199     content = iter(content)
       
   200     next(content)
       
   201     for entry in content:
       
   202         try:
       
   203             parse_entry(entry, pos)
       
   204         except Exception as e:
       
   205             raise Exception(e, fn, entry)
       
   206     f.close()
       
   207 
       
   208 
       
   209 parse_file('gadict-regular-verbs-en-ru.dict-c5', 'v')
       
   210 parse_file('gadict-adjective-en-ru.dict-c5', 'adj')
       
   211 parse_file('gadict-adverb-en-ru.dict-c5', 'adv')
       
   212 parse_file('gadict-conjunction-en-ru.dict-c5', 'conj')
       
   213 parse_file('gadict-en-ru.dict-c5', 'n')
       
   214 parse_file('gadict-numeral-en-ru.dict-c5', 'num')
       
   215 parse_file('gadict-phrasal-verbs-en-ru.dict-c5', 'phr.v')
       
   216 parse_file('gadict-preposition-en-ru.dict-c5', 'prep')
       
   217 parse_file('gadict-pronoun-en-ru.dict-c5', 'pron')
       
   218 # parse_file('', '')
       
   219 # parse_file('', '')
       
   220 # parse_file('', '')
       
   221 # parse_file('', '')
       
   222 
       
   223 f = open("conv.gadict", "w")
       
   224 
       
   225 for baseword in sorted(DICT.keys()):
       
   226     art = DICT[baseword]
       
   227     f.write('__\n\n')
       
   228     for (headword, var) in art.vars.store.items():
       
   229         f.write(headword)
       
   230         f.write('\n')
       
   231         if var.pron is not None:
       
   232             f.write(' [')
       
   233             f.write(var.pron)
       
   234             f.write(']\n')
       
   235         for attr in var.attrs:
       
   236             f.write(' ')
       
   237             f.write(attr)
       
   238             f.write('\n')
       
   239     for tran in art.trans:
       
   240         f.write('\n')
       
   241         f.write(tran.pos)
       
   242         f.write('\nru: ')
       
   243         f.write(tran.tran)
       
   244         f.write('\n')
       
   245 
       
   246 f.close()