py/gadict.py
changeset 412 ece60575a96a
parent 406 f0ac87e10d9a
child 422 c97e9c1febe8
equal deleted inserted replaced
411:2fac252890a5 412:ece60575a96a
    36 
    36 
    37     SEPARATOR_RE = regex.compile(r"^__$")
    37     SEPARATOR_RE = regex.compile(r"^__$")
    38     HEADWORD_RE = regex.compile(r"^(\p{L}.*)$")
    38     HEADWORD_RE = regex.compile(r"^(\p{L}.*)$")
    39     HEADWORD_VAR_RE = regex.compile(r"^ +(s|pl|v[123]|male|female|comp|super)$")
    39     HEADWORD_VAR_RE = regex.compile(r"^ +(s|pl|v[123]|male|female|comp|super)$")
    40     HEADWORD_PRON_RE = regex.compile(r"^ +\[([\p{L}' ]+)\]$")
    40     HEADWORD_PRON_RE = regex.compile(r"^ +\[([\p{L}' ]+)\]$")
    41     TRANSL_POS_RE = regex.compile(r"^n|pron|adj|v|adv|prep|conj|num|int|phr\.v|abbr$")
    41     TRANSL_POS_RE = regex.compile(r"^n|pron|adj|v|adv|prep|conj|num|int|phr\.v|abbr|prefix$")
    42     TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(][\p{L}\p{P}~ ]*)$")
    42     TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(].*)$")
    43     TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> (\p{L}.*)$")
    43     TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> (\p{L}.*)$")
       
    44     TOPIC_RE = regex.compile(r"^topic: (\p{L}.*)$")
    44 
    45 
    45     CONT_RE = regex.compile(r"^ +(.*)")
    46     CONT_RE = regex.compile(r"^ +(.*)")
    46 
    47 
    47     TRAILING_SPACES_RE = regex.compile(r"\p{Z}+$")
    48     TRAILING_SPACES_RE = regex.compile(r"\p{Z}+$")
    48 
    49 
    73                 self.parse_article()
    74                 self.parse_article()
    74         except ParseException as ex:
    75         except ParseException as ex:
    75             raise ParseException(ex.msg, self.lineno, self.line)
    76             raise ParseException(ex.msg, self.lineno, self.line)
    76         return self.dom
    77         return self.dom
    77 
    78 
    78     def parse_continuation(self):
    79     def parse_prelude_continuation(self):
    79         string = ""
    80         string = ""
    80         while True:
    81         while True:
    81             self.readline()
    82             self.readline()
    82             if self.eof:
    83             if self.eof:
    83                 return string
    84                 return string
    96             self.readline()
    97             self.readline()
    97             if self.eof:
    98             if self.eof:
    98                 raise ParseException("There are no articles...")
    99                 raise ParseException("There are no articles...")
    99             m = self.PRELUDE_ABOUT_RE.match(self.line)
   100             m = self.PRELUDE_ABOUT_RE.match(self.line)
   100             if m:
   101             if m:
   101                 pre.about += m.group(1) + self.parse_continuation()
   102                 pre.about += m.group(1) + self.parse_prelude_continuation()
   102                 if self.eof:
   103                 if self.eof:
   103                     raise ParseException("There are no articles...")
   104                     raise ParseException("There are no articles...")
   104             if self.SEPARATOR_RE.match(self.line):
   105             if self.SEPARATOR_RE.match(self.line):
   105                 break
   106                 break
   106             m = self.PRELUDE_NAME_RE.match(self.line)
   107             m = self.PRELUDE_NAME_RE.match(self.line)
   171                 attrs.add(m.group(1))
   172                 attrs.add(m.group(1))
   172                 continue
   173                 continue
   173             raise ParseException("""Line is not a headword or translation or headword attribute...""")
   174             raise ParseException("""Line is not a headword or translation or headword attribute...""")
   174         self.words[word] = (pron, attrs)
   175         self.words[word] = (pron, attrs)
   175 
   176 
       
   177     def parse_translation_continuation(self):
       
   178         string = ""
       
   179         while True:
       
   180             self.readline()
       
   181             if self.eof:
       
   182                 return string
       
   183             m = self.CONT_RE.match(self.line)
       
   184             if m is not None:
       
   185                 string += "\n" + m.group(1)
       
   186             else:
       
   187                 return string
       
   188 
   176     def parse_translation(self):
   189     def parse_translation(self):
   177         senses = []
   190         senses = []
   178         pos = None
   191         pos = None
   179         tr = []
   192         tr = []
   180         ex = []
   193         ex = []
   181         while True:
   194         read = True
   182             self.readline()
   195         while True:
       
   196             if read:
       
   197                 self.readline()
       
   198             read = True
   183             if self.eof:
   199             if self.eof:
   184                 break
   200                 break
   185             m = self.SEPARATOR_RE.match(self.line)
   201             m = self.SEPARATOR_RE.match(self.line)
   186             if m is not None:
   202             if m is not None:
   187                 break
   203                 break
   195             if m is not None:
   211             if m is not None:
   196                 if pos is not None:
   212                 if pos is not None:
   197                     raise ParseException("""Each translation should have only one part of speech marker...""")
   213                     raise ParseException("""Each translation should have only one part of speech marker...""")
   198                 pos = m.group(0)
   214                 pos = m.group(0)
   199                 continue
   215                 continue
       
   216             m = self.TOPIC_RE.match(self.line)
       
   217             if m is not None:
       
   218                 # TODO
       
   219                 continue
   200             m = self.TRANSL_RE.match(self.line)
   220             m = self.TRANSL_RE.match(self.line)
   201             if m is not None:
   221             if m is not None:
   202                 tr.append((m.group(1), m.group(2)))
   222                 tr.append((m.group(1), m.group(2) + self.parse_translation_continuation()))
       
   223                 read = False
   203                 continue
   224                 continue
   204             m = self.TRANSL_EX_RE.match(self.line)
   225             m = self.TRANSL_EX_RE.match(self.line)
   205             if m is not None:
   226             if m is not None:
   206                 ex.append((m.group(1), m.group(2)))
   227                 ex.append((m.group(1), m.group(2) + self.parse_translation_continuation()))
       
   228                 read = False
   207                 continue
   229                 continue
   208             raise ParseException("""Uknown syntax...""")
   230             raise ParseException("""Uknown syntax...""")
   209         if len(tr) > 0:
   231         if len(tr) > 0:
   210             senses.append((pos, tr, ex))
   232             senses.append((pos, tr, ex))
   211         self.tran = senses
   233         self.tran = senses