py/gadict.py
changeset 530 91771594bc8b
parent 527 0a31299fad70
child 542 b5197c70972c
equal deleted inserted replaced
529:ed54a93aa8d7 530:91771594bc8b
    28         elif self.line is None:
    28         elif self.line is None:
    29             return ":{:d}:{:s}".format(self.lineno, self.msg)
    29             return ":{:d}:{:s}".format(self.lineno, self.msg)
    30         else:
    30         else:
    31             return ":{:d}: {:s}\nLINE: {:s}".format(self.lineno, self.msg, self.line)
    31             return ":{:d}: {:s}\nLINE: {:s}".format(self.lineno, self.msg, self.line)
    32 
    32 
       
    33 class Sense:
       
    34 
       
    35     def __init__(self, pos, tr_list = None, ex_list = None, syn_list = None, ant_list = None, topic_list = None):
       
    36         if not pos:
       
    37             raise ParseException("Part of speech expected...\n")
       
    38         self.pos = pos
       
    39         if tr_list:
       
    40             self.tr_list = tr_list
       
    41         else:
       
    42             self.tr_list = []
       
    43         self.ex_list = ex_list
       
    44         self.syn_list = syn_list
       
    45         self.ant_list = ant_list
       
    46         self.topic_list = topic_list
       
    47 
       
    48     def add_tr(self, tr):
       
    49         self.tr_list.append(tr)
       
    50 
       
    51     def add_ex(self, ex):
       
    52         if not self.ex_list:
       
    53             self.ex_list = [ex]
       
    54         else:
       
    55             self.ex_list.append(ex)
       
    56 
       
    57     def add_syn(self, syn):
       
    58         if not self.syn_list:
       
    59             self.syn_list = [syn]
       
    60         else:
       
    61             self.syn_list.append(syn)
       
    62 
       
    63     def add_ant(self, ant):
       
    64         if not self.ant_list:
       
    65             self.ant_list = [ant]
       
    66         else:
       
    67             self.ant_list.append(ant)
       
    68 
       
    69     def add_topic(self, topic):
       
    70         if not self.topic_list:
       
    71             self.topic_list = [topic]
       
    72         else:
       
    73             self.topic_list.append(topic)
    33 
    74 
    34 class Parser:
    75 class Parser:
    35     """gadict dictionary format parser."""
    76     """gadict dictionary format parser."""
    36 
    77 
    37     COMMENT_RE = regex.compile(r"^# ")
    78     COMMENT_RE = regex.compile(r"^# ")
    38 
    79 
    39     SEPARATOR_RE = regex.compile(r"^__$")
    80     SEPARATOR_RE = regex.compile(r"^__$")
    40     HEADWORD_RE = regex.compile(r"^(\p{L}.*)$")
    81     HEADWORD_RE = regex.compile(r"^(\p{L}.*)$")
    41     HEADWORD_VAR_RE = regex.compile(r"^ +(s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$")
    82     HEADWORD_VAR_RE = regex.compile(r"^ +(s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$")
    42     HEADWORD_PRON_RE = regex.compile(r"^ +\[([\p{L}' ]+)\]$")
    83     HEADWORD_PRON_RE = regex.compile(r"^ +\[([\p{L}' ]+)\]$")
    # The alternatives are grouped so that ^ and $ anchor every one of them.
    # Ungrouped, ^ applied only to "n" and $ only to "prefix", so "num" was
    # recognised as "n" and "phr.v" as "phr".
    TRANSL_POS_RE = regex.compile(r"^(?:n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\.v|abbr|prefix)$")
    44     TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(].*)$")
    85     TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(].*)$")
    45     TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> ([-\p{L}].*)$")
    86     TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> ([-\p{L}].*)$")
    46     TOPIC_RE = regex.compile(r"^(topic|ant|syn): (\p{L}.*)$")
    87     TOPIC_RE = regex.compile(r"^topic: (\p{L}.*)$")
       
    88     SYN_RE = regex.compile(r"^syn: (\p{L}.*)$")
       
    89     ANT_RE = regex.compile(r"^ant: (\p{L}.*)$")
    47 
    90 
    48     CONT_RE = regex.compile(r"^ +(.*)")
    91     CONT_RE = regex.compile(r"^ +(.*)")
    49 
    92 
    50     TRAILING_SPACES_RE = regex.compile(r"\p{Z}+$")
    93     TRAILING_SPACES_RE = regex.compile(r"\p{Z}+$")
    51 
    94 
   192             else:
   235             else:
   193                 return string
   236                 return string
   194 
   237 
   195     def parse_translation(self):
   238     def parse_translation(self):
   196         senses = []
   239         senses = []
   197         pos = None
   240         sense = None
   198         tr = []
       
   199         ex = []
       
   200         read = True
   241         read = True
   201         while True:
   242         while True:
   202             if read:
   243             if read:
   203                 self.readline()
   244                 self.readline()
   204             read = True
   245             read = True
   205             if self.eof:
   246             if self.eof:
   206                 break
   247                 break
   207             m = self.SEPARATOR_RE.match(self.line)
   248             m = self.SEPARATOR_RE.match(self.line)
   208             if m is not None:
   249             if m is not None:
       
   250                 if sense:
       
   251                     senses.append(sense)
   209                 break
   252                 break
   210             if len(self.line) == 1:
   253             if len(self.line) == 1:
   211                 senses.append((pos, tr, ex))
   254                 if sense:
   212                 pos = None
   255                     senses.append(sense)
   213                 tr = []
   256                 sense = None
   214                 ex = []
       
   215                 continue
   257                 continue
   216             m = self.TRANSL_POS_RE.match(self.line)
   258             m = self.TRANSL_POS_RE.match(self.line)
   217             if m is not None:
   259             if m is not None:
   218                 if pos is not None:
   260                 if sense is not None:
   219                     raise ParseException("""Each translation should have only one part of speech marker...""")
   261                     raise ParseException("""Each translation should have only one part of speech marker...""")
   220                 pos = m.group(0)
   262                 pos = m.group(0)
   221                 continue
   263                 sense = Sense(pos)
       
   264                 continue
       
   265             if not sense:
       
   266                 raise ParseException("""Missing part of speech marker...""")
   222             m = self.TOPIC_RE.match(self.line)
   267             m = self.TOPIC_RE.match(self.line)
   223             if m is not None:
   268             if m is not None:
   224                 # TODO
   269                 topics = m.group(1).split(";")
       
   270                 for topic in topics:
       
   271                     topic = topic.strip()
       
   272                     if len(topic) == 0:
       
   273                         raise ParseException("""Empty topic...""")
       
   274                     sense.add_topic(topic)
       
   275                 continue
       
   276             m = self.SYN_RE.match(self.line)
       
   277             if m is not None:
       
   278                 syns = m.group(1).split(";")
       
   279                 for syn in syns:
       
   280                     syn = syn.strip()
       
   281                     if len(syn) == 0:
       
   282                         raise ParseException("""Empty synonym...""")
       
   283                     sense.add_syn(syn)
       
   284                 continue
       
   285             m = self.ANT_RE.match(self.line)
       
   286             if m is not None:
       
   287                 ants = m.group(1).split(";")
       
   288                 for ant in ants:
       
   289                     ant = ant.strip()
       
   290                     if len(ant) == 0:
       
   291                         raise ParseException("""Empty antonym...""")
       
   292                     sense.add_ant(ant)
   225                 continue
   293                 continue
   226             m = self.TRANSL_RE.match(self.line)
   294             m = self.TRANSL_RE.match(self.line)
   227             if m is not None:
   295             if m is not None:
   228                 tr.append((m.group(1), m.group(2) + self.parse_translation_continuation()))
   296                 sense.add_tr((m.group(1), m.group(2) + self.parse_translation_continuation()))
   229                 read = False
   297                 read = False
   230                 continue
   298                 continue
   231             m = self.TRANSL_EX_RE.match(self.line)
   299             m = self.TRANSL_EX_RE.match(self.line)
   232             if m is not None:
   300             if m is not None:
   233                 ex.append((m.group(1), m.group(2) + self.parse_translation_continuation()))
   301                 sense.add_ex((m.group(1), m.group(2) + self.parse_translation_continuation()))
   234                 read = False
   302                 read = False
   235                 continue
   303                 continue
   236             raise ParseException("""Uknown syntax...""")
   304             raise ParseException("""Uknown syntax...""")
   237         if len(tr) > 0:
       
   238             senses.append((pos, tr, ex))
       
   239         self.tran = senses
   305         self.tran = senses