py/gadict.py
changeset 931 9a5f97027ee7
parent 892 0c298fe6739e
child 937 981839c72b64
equal deleted inserted replaced
930:2989d9b90b14 931:9a5f97027ee7
    32         else:
    32         else:
    33             return u":{:d}: {:s}\nLINE: {:s}".format(self.lineno, self.msg, self.line)
    33             return u":{:d}: {:s}\nLINE: {:s}".format(self.lineno, self.msg, self.line)
    34 
    34 
    35 class Headword:
    35 class Headword:
    36 
    36 
    37     def __init__(self, headword, pron = None, attrs = None):
    37     def __init__(self, headword, pron = None, attrs = None, homo = None):
    38         self.headword = headword
    38         self.headword = headword
    39         self.pron = pron
    39         self.pron = pron
    40         self.attrs = attrs
    40         self.attrs = attrs
       
    41         self.homo = homo
    41 
    42 
    42     def __str__(self):
    43     def __str__(self):
    43         return self.headword
    44         return self.headword
    44     def __repr__(self):
    45     def __repr__(self):
    45         return "<Headword {}>".format(self.headword)
    46         return "<Headword {}>".format(self.headword)
   129 
   130 
   130     SEPARATOR_RE = re.compile(u"^__$", re.UNICODE)
   131     SEPARATOR_RE = re.compile(u"^__$", re.UNICODE)
   131     HEADWORD_RE = re.compile( u"^(\\w.*)$" )
   132     HEADWORD_RE = re.compile( u"^(\\w.*)$" )
   132     HEADWORD_VAR_RE = re.compile(u"^ +(rare|s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$", re.UNICODE)
   133     HEADWORD_VAR_RE = re.compile(u"^ +(rare|s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$", re.UNICODE)
   133     HEADWORD_PRON_RE = re.compile(u"^ +\\[([a-zˌˈːəæɛɒʊɪɔɜɑʌɚɐɹʃʧθðɡʒŋɾʔ ]+)\\]$", re.UNICODE)
   134     HEADWORD_PRON_RE = re.compile(u"^ +\\[([a-zˌˈːəæɛɒʊɪɔɜɑʌɚɐɹʃʧθðɡʒŋɾʔ ]+)\\]$", re.UNICODE)
       
   135     HEADWORD_HOMO_RE = re.compile(u"^ +homo: (\\w+)$", re.UNICODE)
   134     TRANSL_POS_RE = re.compile(u"^(?:n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\\.v|contr|abbr|prefix)$", re.UNICODE)
   136     TRANSL_POS_RE = re.compile(u"^(?:n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\\.v|contr|abbr|prefix)$", re.UNICODE)
   135     TRANSL_RE = re.compile(u"^(ru|uk|la|en): ([\\w(].*)$", re.UNICODE)
   137     TRANSL_RE = re.compile(u"^(ru|uk|la|en): ([\\w(].*)$", re.UNICODE)
   136     TRANSL_EX_RE = re.compile(u"""^(ru|uk|la|en)> ([-'"\\w].*)$""", re.UNICODE)
   138     TRANSL_EX_RE = re.compile(u"""^(ru|uk|la|en)> ([-'"\\w].*)$""", re.UNICODE)
   137     TRANSL_GLOS_RE = re.compile(u"^(ru|uk|la|en)= ([-\\w\\d].*)$", re.UNICODE)
   139     TRANSL_GLOS_RE = re.compile(u"^(ru|uk|la|en)= ([-\\w\\d].*)$", re.UNICODE)
   138     TOPIC_RE = re.compile(u"^topic: (\\w.*)$", re.UNICODE)
   140     TOPIC_RE = re.compile(u"^topic: (\\w.*)$", re.UNICODE)
   253         if m is None:
   255         if m is None:
   254             raise ParseException("""There are no headword after "__" delimiter...""")
   256             raise ParseException("""There are no headword after "__" delimiter...""")
   255         word = m.group(1)
   257         word = m.group(1)
   256         pron = None
   258         pron = None
   257         attrs = set()
   259         attrs = set()
       
   260         homo = None
   258         while True:
   261         while True:
   259             self.readline()
   262             self.readline()
   260             if self.eof or len(self.line) == 0:
   263             if self.eof or len(self.line) == 0:
   261                 break
   264                 break
   262             m = self.HEADWORD_RE.match(self.line)
   265             m = self.HEADWORD_RE.match(self.line)
   263             if m is not None:
   266             if m is not None:
   264                 if word is None:
   267                 if word is None:
   265                     raise ParseException("""Didn't match previous headword...""")
   268                     raise ParseException("""Didn't match previous headword...""")
   266                 self.words.append(Headword(word, pron, attrs))
   269                 self.words.append(Headword(word, pron, attrs, homo = homo))
   267                 word = m.group(1)
   270                 word = m.group(1)
   268                 pron = None
   271                 pron = None
   269                 attrs = set()
   272                 attrs = set()
       
   273                 homo = None
   270                 continue
   274                 continue
   271             m = self.HEADWORD_PRON_RE.match(self.line)
   275             m = self.HEADWORD_PRON_RE.match(self.line)
   272             if m is not None:
   276             if m is not None:
   273                 if pron is not None:
   277                 if pron is not None:
   274                     raise ParseException("""Pronunciation is redefined...""")
   278                     raise ParseException("""Pronunciation is redefined...""")
   276                 continue
   280                 continue
   277             m = self.HEADWORD_VAR_RE.match(self.line)
   281             m = self.HEADWORD_VAR_RE.match(self.line)
   278             if m is not None:
   282             if m is not None:
   279                 attrs.add(m.group(1))
   283                 attrs.add(m.group(1))
   280                 continue
   284                 continue
       
   285             m = self.HEADWORD_HOMO_RE.match(self.line)
       
   286             if m is not None:
       
   287                 if homo is not None:
       
   288                     raise ParseException("""Homophones are redefined...""")
       
   289                 homo = [s.strip() for s in m.group(1).split(";")]
       
   290                 continue
   281             raise ParseException("""Line is not a headword or translation or headword attribute...""")
   291             raise ParseException("""Line is not a headword or translation or headword attribute...""")
   282         self.words.append(Headword(word, pron, attrs))
   292         self.words.append(Headword(word, pron, attrs, homo))
   283 
   293 
   284     def parse_translation_continuation(self):
   294     def parse_translation_continuation(self):
   285         string = ""
   295         string = ""
   286         while True:
   296         while True:
   287             self.readline()
   297             self.readline()