py/gadict.py
changeset 552 7398bc1829d6
parent 542 b5197c70972c
child 553 45a3138c9b4d
equal deleted inserted replaced
551:8ba39e31d608 552:7398bc1829d6
     1 """
     1 """
     2 gadict dictionary format parser.
     2 gadict dictionary format parser.
     3 """
     3 """
     4 
     4 
       
     5 import sys
     5 import regex
     6 import regex
     6 
     7 
     7 
     8 
     8 class Prelude:
     9 class Prelude:
     9     """Dictionary metainfo structure."""
    10     """Dictionary metainfo structure."""
    15 
    16 
    16 
    17 
    17 class ParseException(BaseException):
    18 class ParseException(BaseException):
    18 
    19 
    19     def __init__(self, msg, lineno=None, line=None):
    20     def __init__(self, msg, lineno=None, line=None):
    20         super().__init__()
    21         BaseException.__init__(self)
    21         self.msg = msg
    22         self.msg = msg
    22         self.lineno = lineno
    23         self.lineno = lineno
    23         self.line = line
    24         self.line = line
    24 
    25 
    25     def __repr__(self):
    26     def __repr__(self):
    26         if self.lineno is None:
    27         if self.lineno is None:
    27             return self.msg
    28             return self.msg
    28         elif self.line is None:
    29         elif self.line is None:
    29             return ":{:d}:{:s}".format(self.lineno, self.msg)
    30             return ":{:d}:{:s}".format(self.lineno, self.msg.encode('utf-8'))
    30         else:
    31         else:
    31             return ":{:d}: {:s}\nLINE: {:s}".format(self.lineno, self.msg, self.line)
    32             return ":{:d}: {:s}\nLINE: {:s}".format(self.lineno, self.msg.encode('utf-8'), self.line.encode('utf-8'))
    32 
    33 
    33 class Sense:
    34 class Sense:
    34 
    35 
    35     def __init__(self, pos, tr_list = None, ex_list = None, syn_list = None, ant_list = None, topic_list = None):
    36     def __init__(self, pos, tr_list = None, ex_list = None, syn_list = None, ant_list = None, topic_list = None):
    36         if not pos:
    37         if not pos:
    75 class Parser:
    76 class Parser:
    76     """gadict dictionary format parser."""
    77     """gadict dictionary format parser."""
    77 
    78 
    # Line-classification patterns.  Non-raw ``u""`` literals with doubled
    # backslashes are used so the same source yields unicode patterns on
    # both Python 2 and Python 3.

    # Line starting with "# ".
    COMMENT_RE = regex.compile(r"^# ")

    # Line consisting of exactly two underscores.
    SEPARATOR_RE = regex.compile(u"^__$")
    # Line starting (at column 0) with a letter; the whole line is captured.
    HEADWORD_RE = regex.compile(u"^(\\p{L}.*)$")
    # Indented line holding exactly one of the listed variant tags.
    HEADWORD_VAR_RE = regex.compile(u"^ +(s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$")
    # Indented "[...]" line; letters, apostrophes and spaces inside are captured.
    # (``\\p`` is escaped explicitly: a bare ``\p`` in a non-raw literal is an
    # invalid escape sequence.)
    HEADWORD_PRON_RE = regex.compile(u"^ +\\[([\\p{L}' ]+)\\]$")
    # Line that is exactly one of the listed part-of-speech tags.  The
    # alternation is grouped so that ``^`` and ``$`` apply to every
    # alternative, not only to the first and the last one.
    TRANSL_POS_RE = regex.compile(
        u"^(?:n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\\.v|contr|abbr|prefix)$")
    # "<lang>: <text>" where text starts with a letter or "(".
    TRANSL_RE = regex.compile(u"^(ru|uk|la|en): ([\\p{L}(].*)$")
    # "<lang>> <text>" where text starts with a letter or "-".
    TRANSL_EX_RE = regex.compile(u"^(ru|uk|la|en)> ([-\\p{L}].*)$")
    # "topic: ", "syn: " and "ant: " lines whose value starts with a letter.
    TOPIC_RE = regex.compile(u"^topic: (\\p{L}.*)$")
    SYN_RE = regex.compile(u"^syn: (\\p{L}.*)$")
    ANT_RE = regex.compile(u"^ant: (\\p{L}.*)$")

    # Line starting with one or more spaces; the remainder is captured.
    CONT_RE = regex.compile(u"^ +(.*)")

    # Run of Unicode separator characters at the end of a line.
    TRAILING_SPACES_RE = regex.compile(u"\\p{Z}+$")

    # "<key>: <value>" lines matched by their fixed key prefix.
    PRELUDE_NAME_RE = regex.compile(u"^name: (.*)")
    PRELUDE_URL_RE = regex.compile(u"^url: (.*)")
    PRELUDE_AUTHOR_RE = regex.compile(u"^by: (.*)")
    PRELUDE_LICENSE_RE = regex.compile(u"^term: (.*)")
    # The space after "about:" is optional.
    PRELUDE_ABOUT_RE = regex.compile(u"^about: ?(.*)")
   100 
   101 
   101     def __init__(self):
   102     def __init__(self):
   102         pass
   103         pass
   103 
   104 
   104     def readline(self):
   105     def readline(self):
   120         try:
   121         try:
   121             self.parse_prelude()
   122             self.parse_prelude()
   122             while not self.eof:
   123             while not self.eof:
   123                 self.parse_article()
   124                 self.parse_article()
   124         except ParseException as ex:
   125         except ParseException as ex:
       
   126             if sys.version_info.major == 2:
       
   127                 import traceback
       
   128                 traceback.print_exc()
   125             raise ParseException(ex.msg, self.lineno, self.line)
   129             raise ParseException(ex.msg, self.lineno, self.line)
   126         return self.dom
   130         return self.dom
   127 
   131 
   128     def parse_prelude_continuation(self):
   132     def parse_prelude_continuation(self):
   129         string = ""
   133         string = ""