# HG changeset patch # User Oleksandr Gavenko # Date 1473950900 -10800 # Node ID 59714b9033bc319305ba7bda13da1851a96e33e6 # Parent 45a3138c9b4d7254cc0940d43aba0a73aca1235b Store headword structure as class. Store headwords in list to preserve order like in source file. diff -r 45a3138c9b4d -r 59714b9033bc py/gadict.py --- a/py/gadict.py Thu Sep 15 15:42:52 2016 +0300 +++ b/py/gadict.py Thu Sep 15 17:48:20 2016 +0300 @@ -31,6 +31,18 @@ else: return ":{:d}: {:s}\nLINE: {:s}".format(self.lineno, self.msg.encode('utf-8'), self.line.encode('utf-8')) +class Headword: + + def __init__(self, headword, pron = None, attrs = None): + self.headword = headword + self.pron = pron + self.attrs = attrs + + def __str__(self): + return self.headword + def __repr__(self): + return "".format(self.headword) + class Sense: def __init__(self, pos, tr_list = None, ex_list = None, syn_list = None, ant_list = None, topic_list = None): @@ -72,6 +84,14 @@ else: self.topic_list.append(topic) + def __str__(self): + if tr_list: + (lang, text) = self.tr_list[0] + return "{}: {}".format(lang, text) + return "" + def __repr__(self): + return "".format(str(self)) + class Parser: """gadict dictionary format parser.""" @@ -190,7 +210,7 @@ def parse_headlines(self): """Try to match word variations with attributed. Assume that `self.line` on preceding empty line.""" - self.words = {} + self.words = [] self.readline() if self.eof: raise ParseException("""There are no definition after "__" delimiter...""") @@ -208,7 +228,7 @@ if m is not None: if word is None: raise ParseException("""Didn't match previous headword...""") - self.words[word] = (pron, attrs) + self.words.append(Headword(word, pron, attrs)) word = m.group(1) pron = None attrs = set() @@ -224,7 +244,7 @@ attrs.add(m.group(1)) continue raise ParseException("""Line is not a headword or translation or headword attribute...""") - self.words[word] = (pron, attrs) + self.words.append(Headword(word, pron, attrs)) def parse_translation_continuation(self): string = "" diff -r 45a3138c9b4d -r 59714b9033bc py/gadict_c5.py --- a/py/gadict_c5.py Thu Sep 15 15:42:52 2016 +0300 +++ b/py/gadict_c5.py Thu Sep 15 17:48:20 2016 +0300 @@ -65,54 +65,28 @@ FOUT.write("\n") -def attr_key(item): - (word, (pron, attrs)) = item - if not attrs: - return "zzz" - best_vattr = None - for attr in attrs: - if attr in ["v1", "v2", "v3"]: - if not best_vattr or (best_vattr and best_vattr > attr): - best_vattr = attr - if best_vattr: - return best_vattr - for attr in attrs: # single/plural - if attr in ["s"]: - return attr - for attr in attrs: # comparative/superlative - if attr in ["comp"]: - return attr - for attr in attrs: # Am/Br/Au - if attr in ["Am"]: - return attr - return "zzz" - - -for idx in range(1, len(DOM)): - article = DOM[idx] +for (headwords, translations) in DOM[1:]: FOUT.write("_____\n\n") - title = "; ".join(article[0].keys()) + title = "; ".join([h.headword for h in headwords]) FOUT.write(title) FOUT.write("\n\n") - defs = article[0].items() - defs = sorted(defs, key = attr_key) - for (word, (pron, attrs)) in defs: + for hw in headwords: FOUT.write(" ") - FOUT.write(word) - if pron is not None: + FOUT.write(hw.headword) + if hw.pron is not None: FOUT.write(" [") - FOUT.write(pron) + FOUT.write(hw.pron) FOUT.write("]") - if len(attrs) > 0: + if len(hw.attrs) > 0: FOUT.write(" ") - l = ["«"+x+"»" for x in attrs] + l = ["«"+x+"»" for x in hw.attrs] l.sort() FOUT.write(", ".join(l)) FOUT.write("\n") FOUT.write("\n") - for sense in article[1]: + for sense in translations: if not sense: - raise Exception("""Empty sense for article: """ + article[0].__iter__().__next__()) + raise Exception("""Empty sense for article: """ + headwords.__iter__().__next__()) FOUT.write(" ") if sense.pos: FOUT.write("«") diff -r 45a3138c9b4d -r 59714b9033bc py/gadict_srs_tab.py --- a/py/gadict_srs_tab.py Thu Sep 15 15:42:52 2016 +0300 +++ b/py/gadict_srs_tab.py Thu Sep 15 17:48:20 2016 +0300 @@ -38,46 +38,21 @@ else: FOUT = codecs.open(FONAME, "w", "utf-8") -def attr_key(item): - (word, (pron, attrs)) = item - if not attrs: - return "zzz" - best_vattr = None - for attr in attrs: - if attr in ["v1", "v2", "v3"]: - if not best_vattr or (best_vattr and best_vattr > attr): - best_vattr = attr - if best_vattr: - return best_vattr - for attr in attrs: # single/plural - if attr in ["s"]: - return attr - for attr in attrs: # comparative/superlative - if attr in ["comp"]: - return attr - for attr in attrs: # Am/Br/Au - if attr in ["Am"]: - return attr - return "zzz" - -for idx in range(1, len(DOM)): - article = DOM[idx] - defs = article[0].items() - defs = sorted(defs, key = attr_key) +for (headwords, translations) in DOM[1:]: lines = [] - for (word, (pron, attrs)) in defs: - line = ""+word+"" - if pron: - line += " ["+pron+"]" - if len(attrs) > 0: - attrs = [" «"+x+"»" for x in attrs] + for hw in headwords: + line = ""+hw.headword+"" + if hw.pron: + line += " ["+hw.pron+"]" + if len(hw.attrs) > 0: + attrs = [" «"+x+"»" for x in hw.attrs] attrs.sort() line += ",".join(attrs) lines.append(line) question = "
".join(lines) FOUT.write(question) FOUT.write("\t") - for sense in article[1]: + for sense in translations: if not sense: raise Exception("""Empty sense for article: """ + article[0].__iter__().__next__()) if sense.pos: