# HG changeset patch # User Oleksandr Gavenko # Date 1472074345 -10800 # Node ID 91771594bc8b4d1a20123301d6a99d55608b154c # Parent ed54a93aa8d79be3be0570c1cbed60ea7a662052 Make storage for topics, antonyms and synonyms. Require pos marker. diff -r ed54a93aa8d7 -r 91771594bc8b py/gadict.py --- a/py/gadict.py Thu Aug 25 00:29:40 2016 +0300 +++ b/py/gadict.py Thu Aug 25 00:32:25 2016 +0300 @@ -30,6 +30,47 @@ else: return ":{:d}: {:s}\nLINE: {:s}".format(self.lineno, self.msg, self.line) +class Sense: + + def __init__(self, pos, tr_list = None, ex_list = None, syn_list = None, ant_list = None, topic_list = None): + if not pos: + raise ParseException("Part of speech expected...\n") + self.pos = pos + if tr_list: + self.tr_list = tr_list + else: + self.tr_list = [] + self.ex_list = ex_list + self.syn_list = syn_list + self.ant_list = ant_list + self.topic_list = topic_list + + def add_tr(self, tr): + self.tr_list.append(tr) + + def add_ex(self, ex): + if not self.ex_list: + self.ex_list = [ex] + else: + self.ex_list.append(ex) + + def add_syn(self, syn): + if not self.syn_list: + self.syn_list = [syn] + else: + self.syn_list.append(syn) + + def add_ant(self, ant): + if not self.ant_list: + self.ant_list = [ant] + else: + self.ant_list.append(ant) + + def add_topic(self, topic): + if not self.topic_list: + self.topic_list = [topic] + else: + self.topic_list.append(topic) class Parser: """gadict dictionary format parser.""" @@ -40,10 +81,12 @@ HEADWORD_RE = regex.compile(r"^(\p{L}.*)$") HEADWORD_VAR_RE = regex.compile(r"^ +(s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$") HEADWORD_PRON_RE = regex.compile(r"^ +\[([\p{L}' ]+)\]$") - TRANSL_POS_RE = regex.compile(r"^n|pron|adj|v|adv|prep|conj|num|int|phr|phr\.v|abbr|prefix$") + TRANSL_POS_RE = regex.compile(r"^n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\.v|abbr|prefix$") TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(].*)$") TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> ([-\p{L}].*)$") - TOPIC_RE = regex.compile(r"^(topic|ant|syn): (\p{L}.*)$") + TOPIC_RE = regex.compile(r"^topic: (\p{L}.*)$") + SYN_RE = regex.compile(r"^syn: (\p{L}.*)$") + ANT_RE = regex.compile(r"^ant: (\p{L}.*)$") CONT_RE = regex.compile(r"^ +(.*)") @@ -194,9 +237,7 @@ def parse_translation(self): senses = [] - pos = None - tr = [] - ex = [] + sense = None read = True while True: if read: @@ -206,34 +247,59 @@ break m = self.SEPARATOR_RE.match(self.line) if m is not None: + if sense: + senses.append(sense) break if len(self.line) == 1: - senses.append((pos, tr, ex)) - pos = None - tr = [] - ex = [] + if sense: + senses.append(sense) + sense = None continue m = self.TRANSL_POS_RE.match(self.line) if m is not None: - if pos is not None: + if sense is not None: raise ParseException("""Each translation should have only one part of speech marker...""") pos = m.group(0) + sense = Sense(pos) continue + if not sense: + raise ParseException("""Missing part of speech marker...""") m = self.TOPIC_RE.match(self.line) if m is not None: - # TODO + topics = m.group(1).split(";") + for topic in topics: + topic = topic.strip() + if len(topic) == 0: + raise ParseException("""Empty topic...""") + sense.add_topic(topic) + continue + m = self.SYN_RE.match(self.line) + if m is not None: + syns = m.group(1).split(";") + for syn in syns: + syn = syn.strip() + if len(syn) == 0: + raise ParseException("""Empty synonym...""") + sense.add_syn(syn) + continue + m = self.ANT_RE.match(self.line) + if m is not None: + ants = m.group(1).split(";") + for ant in ants: + ant = ant.strip() + if len(ant) == 0: + raise ParseException("""Empty antonym...""") + sense.add_ant(ant) continue m = self.TRANSL_RE.match(self.line) if m is not None: - tr.append((m.group(1), m.group(2) + self.parse_translation_continuation())) + sense.add_tr((m.group(1), m.group(2) + self.parse_translation_continuation())) read = False continue m = self.TRANSL_EX_RE.match(self.line) if m is not None: - ex.append((m.group(1), m.group(2) + self.parse_translation_continuation())) + sense.add_ex((m.group(1), m.group(2) + self.parse_translation_continuation())) read = False continue raise ParseException("""Uknown syntax...""") - if len(tr) > 0: - senses.append((pos, tr, ex)) self.tran = senses diff -r ed54a93aa8d7 -r 91771594bc8b py/gadict_c5.py --- a/py/gadict_c5.py Thu Aug 25 00:29:40 2016 +0300 +++ b/py/gadict_c5.py Thu Aug 25 00:32:25 2016 +0300 @@ -84,14 +84,16 @@ FOUT.write(", ".join(l)) FOUT.write("\n") FOUT.write("\n") - for (pos, trs, exs) in article[1]: + for sense in article[1]: + if not sense: + raise Exception("""Empty sense for article: """ + article[0].__iter__().__next__()) FOUT.write(" ") - if pos is not None: + if sense.pos: FOUT.write("«") - FOUT.write(pos) + FOUT.write(sense.pos) FOUT.write("» ") FOUT.write("\n") - for (lang, tr) in trs: + for (lang, tr) in sense.tr_list: FOUT.write(" ") if LANGS is None: FOUT.write(lang)