py/gadict.py
changeset 757 5417f2102dc5
parent 740 77668cb05069
child 871 1dfca1e1f42a
equal deleted inserted replaced
756:c1d3555458ad 757:5417f2102dc5
     2 """
     2 """
     3 gadict dictionary format parser.
     3 gadict dictionary format parser.
     4 """
     4 """
     5 
     5 
     6 import sys
     6 import sys
     7 import regex
     7 import re
     8 
     8 
     9 
     9 
    10 class Prelude:
    10 class Prelude:
    11     """Dictionary metainfo structure."""
    11     """Dictionary metainfo structure."""
    12     name = None
    12     name = None
   123         return "<Sense {}>".format(str(self))
   123         return "<Sense {}>".format(str(self))
   124 
   124 
   125 class Parser:
   125 class Parser:
   126     """gadict dictionary format parser."""
   126     """gadict dictionary format parser."""
   127 
   127 
   128     COMMENT_RE = regex.compile(r"^# ")
   128     COMMENT_RE = re.compile("^# ")
   129 
   129 
   130     SEPARATOR_RE = regex.compile(u"^__$")
   130     SEPARATOR_RE = re.compile(u"^__$", re.UNICODE)
   131     HEADWORD_RE = regex.compile( u"^(\\p{L}.*)$" )
   131     HEADWORD_RE = re.compile( u"^(\\w.*)$" )
   132     HEADWORD_VAR_RE = regex.compile(u"^ +(rare|s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$")
   132     HEADWORD_VAR_RE = re.compile(u"^ +(rare|s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$", re.UNICODE)
   133     HEADWORD_PRON_RE = regex.compile(u"^ +\\[([a-zˌˈːəæɛɒʊɪɔɜɑʌɚɹʃʧθðɡʒŋɾ ]+)\\]$")
   133     HEADWORD_PRON_RE = re.compile(u"^ +\\[([a-zˌˈːəæɛɒʊɪɔɜɑʌɚɹʃʧθðɡʒŋɾ ]+)\\]$", re.UNICODE)
   134     TRANSL_POS_RE = regex.compile(u"^(?:n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\\.v|contr|abbr|prefix)$")
   134     TRANSL_POS_RE = re.compile(u"^(?:n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\\.v|contr|abbr|prefix)$", re.UNICODE)
   135     TRANSL_RE = regex.compile(u"^(ru|uk|la|en): ([\\p{L}(].*)$")
   135     TRANSL_RE = re.compile(u"^(ru|uk|la|en): ([\\w(].*)$", re.UNICODE)
   136     TRANSL_EX_RE = regex.compile(u"""^(ru|uk|la|en)> ([-'"\\p{L}].*)$""")
   136     TRANSL_EX_RE = re.compile(u"""^(ru|uk|la|en)> ([-'"\\w].*)$""", re.UNICODE)
   137     TRANSL_GLOS_RE = regex.compile(u"^(ru|uk|la|en)= ([-\\p{L}\\p{N}].*)$")
   137     TRANSL_GLOS_RE = re.compile(u"^(ru|uk|la|en)= ([-\\w\\d].*)$", re.UNICODE)
   138     TOPIC_RE = regex.compile(u"^topic: (\\p{L}.*)$")
   138     TOPIC_RE = re.compile(u"^topic: (\\w.*)$", re.UNICODE)
   139     SYN_RE = regex.compile(u"^syn: (\\p{L}.*)$")
   139     SYN_RE = re.compile(u"^syn: (\\w.*)$", re.UNICODE)
   140     ANT_RE = regex.compile(u"^ant: (\\p{L}.*)$")
   140     ANT_RE = re.compile(u"^ant: (\\w.*)$", re.UNICODE)
   141     REL_RE = regex.compile(u"^rel: (\\p{L}.*)$")
   141     REL_RE = re.compile(u"^rel: (\\w.*)$", re.UNICODE)
   142     HYPER_RE = regex.compile(u"^hyper: (\\p{L}.*)$")
   142     HYPER_RE = re.compile(u"^hyper: (\\w.*)$", re.UNICODE)
   143     HYPO_RE = regex.compile(u"^hypo: (\\p{L}.*)$")
   143     HYPO_RE = re.compile(u"^hypo: (\\w.*)$", re.UNICODE)
   144 
   144 
   145     CONT_RE = regex.compile(u"^ +(.*)")
   145     CONT_RE = re.compile(u"^ +(.*)", re.UNICODE)
   146 
   146 
   147     TRAILING_SPACES_RE = regex.compile(u"\\p{Z}+$")
   147     TRAILING_SPACES_RE = re.compile(u"\\s+$", re.UNICODE)
   148 
   148 
   149     PRELUDE_NAME_RE = regex.compile(u"^name: (.*)")
   149     PRELUDE_NAME_RE = re.compile(u"^name: (.*)", re.UNICODE)
   150     PRELUDE_URL_RE = regex.compile(u"^url: (.*)")
   150     PRELUDE_URL_RE = re.compile(u"^url: (.*)", re.UNICODE)
   151     PRELUDE_AUTHOR_RE = regex.compile(u"^by: (.*)")
   151     PRELUDE_AUTHOR_RE = re.compile(u"^by: (.*)", re.UNICODE)
   152     PRELUDE_LICENSE_RE = regex.compile(u"^term: (.*)")
   152     PRELUDE_LICENSE_RE = re.compile(u"^term: (.*)", re.UNICODE)
   153     PRELUDE_ABOUT_RE = regex.compile(u"^about: ?(.*)")
   153     PRELUDE_ABOUT_RE = re.compile(u"^about: ?(.*)", re.UNICODE)
   154 
   154 
   155     def __init__(self):
   155     def __init__(self):
   156         pass
   156         pass
   157 
   157 
   158     def readline(self):
   158     def readline(self):
   159         while True:
   159         while True:
   160             self.line = self.stream.readline()
   160             self.line = self.stream.readline()
   161             self.eof = len(self.line) == 0
   161             self.eof = len(self.line) == 0
   162             if not self.eof:
   162             if not self.eof:
   163                 self.lineno += 1
   163                 self.lineno += 1
       
   164             self.line = self.line.rstrip('\n')
   164             if self.TRAILING_SPACES_RE.search(self.line):
   165             if self.TRAILING_SPACES_RE.search(self.line):
   165                 raise ParseException("Traling spaces detected...\n")
   166                 raise ParseException("Traling spaces detected...\n")
   166             if self.COMMENT_RE.search(self.line):
   167             if self.COMMENT_RE.search(self.line):
   167                 continue
   168                 continue
   168             break
   169             break
   189             if self.eof:
   190             if self.eof:
   190                 return string
   191                 return string
   191             m = self.CONT_RE.match(self.line)
   192             m = self.CONT_RE.match(self.line)
   192             if m is not None:
   193             if m is not None:
   193                 string += "\n" + m.group(1)
   194                 string += "\n" + m.group(1)
   194             elif len(self.line) == 1:
   195             elif len(self.line) == 0:
   195                 string += "\n"
   196                 string += "\n"
   196             else:
   197             else:
   197                 return string
   198                 return string
   198 
   199 
   199     def parse_prelude(self):
   200     def parse_prelude(self):
   237         self.parse_translation()
   238         self.parse_translation()
   238         self.dom.append((self.words, self.tran))
   239         self.dom.append((self.words, self.tran))
   239 
   240 
   240     def parse_empty_line(self):
   241     def parse_empty_line(self):
   241         self.readline()
   242         self.readline()
   242         if self.eof or len(self.line) != 1:
   243         if self.eof or len(self.line) != 0:
   243             raise ParseException(""""__" delimiter should followed by empty line...""")
   244             raise ParseException(""""__" delimiter should followed by empty line...""")
   244 
   245 
   245     def parse_headlines(self):
   246     def parse_headlines(self):
   246         """Try to match word variations with attributed. Assume that `self.line` on preceding empty line."""
   247         """Try to match word variations with attributed. Assume that `self.line` on preceding empty line."""
   247         self.words = []
   248         self.words = []
   254         word = m.group(1)
   255         word = m.group(1)
   255         pron = None
   256         pron = None
   256         attrs = set()
   257         attrs = set()
   257         while True:
   258         while True:
   258             self.readline()
   259             self.readline()
   259             if self.eof or len(self.line) == 1:
   260             if self.eof or len(self.line) == 0:
   260                 break
   261                 break
   261             m = self.HEADWORD_RE.match(self.line)
   262             m = self.HEADWORD_RE.match(self.line)
   262             if m is not None:
   263             if m is not None:
   263                 if word is None:
   264                 if word is None:
   264                     raise ParseException("""Didn't match previous headword...""")
   265                     raise ParseException("""Didn't match previous headword...""")
   307             m = self.SEPARATOR_RE.match(self.line)
   308             m = self.SEPARATOR_RE.match(self.line)
   308             if m is not None:
   309             if m is not None:
   309                 if sense:
   310                 if sense:
   310                     senses.append(sense)
   311                     senses.append(sense)
   311                 break
   312                 break
   312             if len(self.line) == 1:
   313             if len(self.line) == 0:
   313                 if sense:
   314                 if sense:
   314                     senses.append(sense)
   315                     senses.append(sense)
   315                 sense = None
   316                 sense = None
   316                 continue
   317                 continue
   317             m = self.TRANSL_POS_RE.match(self.line)
   318             m = self.TRANSL_POS_RE.match(self.line)