# HG changeset patch
# User Oleksandr Gavenko
# Date 1459112263 -10800
# Node ID b47698d5ccabe4c4fa922ef7f8ace8e45efdc234
# Parent 791994f955614ade2c830a1b523f602fcbdff878
Parse dictionary metainfo.

diff -r 791994f95561 -r b47698d5ccab py/gadict.py
--- a/py/gadict.py	Sun Mar 27 23:32:42 2016 +0300
+++ b/py/gadict.py	Sun Mar 27 23:57:43 2016 +0300
@@ -2,6 +2,19 @@
 import regex
 
 
+class Prelude:
+    """Dictionary metainfo collected from the prelude section."""
+
+    def __init__(self):
+        # Lists are created per instance: class-level lists would be shared
+        # by every Prelude and keep accumulating entries across parses.
+        self.name = None
+        self.about = ""
+        self.urls = []
+        self.authors = []
+        self.licences = []
+
+
 class ParseException(BaseException):
 
     def __init__(self, msg, lineno = None, line = None):
@@ -28,8 +41,16 @@
     TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(][\p{L}\p{P}~ ]*)$")
     TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> (\p{L}.*)$")
 
+    CONT_RE = regex.compile(r"^ +(.*)")
+
     TRAILING_SPACES_RE = regex.compile(r"\p{Z}+$")
 
+    PRELUDE_NAME_RE = regex.compile(r"^name: (.*)")
+    PRELUDE_URL_RE = regex.compile(r"^url: (.*)")
+    PRELUDE_AUTHOR_RE = regex.compile(r"^by: (.*)")
+    PRELUDE_LICENSE_RE = regex.compile(r"^term: (.*)")
+    PRELUDE_ABOUT_RE = regex.compile(r"^about: ?(.*)")
+
     def __init__(self):
         pass
 
@@ -53,14 +74,62 @@
             raise ParseException(ex.msg, self.lineno, self.line) from ex
         return self.dom
 
+    def parse_continuation(self):
+        """Collect the continuation lines that follow the current line.
+
+        Reads lines until EOF or until a line that is neither indented
+        nor one character long (presumably a bare newline); that line is
+        left in `self.line` for the caller.  Returns the collected text,
+        with a newline in front of every continuation line.
+        """
+        string = ""
+        while True:
+            self.readline()
+            if self.eof:
+                return string
+            # CONT_RE is a class attribute: it is not visible as a bare
+            # name inside a method, so it must be reached through self.
+            m = self.CONT_RE.match(self.line)
+            if m is not None:
+                string += "\n" + m.group(1)
+            elif len(self.line) == 1:
+                string += "\n"
+            else:
+                return string
+
     def parse_prelude(self):
         """Read dictionary prelude until first "__" delimiter."""
+        pre = Prelude()
         while True:
             self.readline()
             if self.eof:
                 raise ParseException("There are no articles...")
+            m = self.PRELUDE_ABOUT_RE.match(self.line)
+            if m:
+                pre.about += m.group(1) + self.parse_continuation()
+                if self.eof:
+                    raise ParseException("There are no articles...")
+                # No `continue`: parse_continuation() has already read the
+                # next line, which is classified by the checks below.
             if self.SEPARATOR_RE.match(self.line):
                 break
+            m = self.PRELUDE_NAME_RE.match(self.line)
+            if m:
+                pre.name = m.group(1)
+                continue
+            m = self.PRELUDE_URL_RE.match(self.line)
+            if m:
+                pre.urls.append(m.group(1))
+                continue
+            m = self.PRELUDE_AUTHOR_RE.match(self.line)
+            if m:
+                pre.authors.append(m.group(1))
+                continue
+            m = self.PRELUDE_LICENSE_RE.match(self.line)
+            if m:
+                pre.licences.append(m.group(1))
+                continue
+        self.dom.append(pre)
 
     def parse_article(self):
         """Try to match article until next "__" delimiter. Assume that `self.line` point to "__" delimiter."""
diff -r 791994f95561 -r b47698d5ccab py/gadict_c5.py
--- a/py/gadict_c5.py	Sun Mar 27 23:32:42 2016 +0300
+++ b/py/gadict_c5.py	Sun Mar 27 23:57:43 2016 +0300
@@ -30,6 +30,28 @@
 finally:
     fin.close()
 
+prelude = dom[0]
+if prelude.name is not None:
+    fout.write("_____\n\n00-database-short\n")
+    fout.write(prelude.name)
+    fout.write("\n")
+if len(prelude.urls) > 0:
+    fout.write("_____\n\n00-database-url\n")
+    for url in prelude.urls:
+        fout.write(url)
+        fout.write("\n")
+fout.write("_____\n\n00-database-info\n")
+if prelude.name is not None:
+    fout.write("Dictionary name: ")
+    fout.write(prelude.name)
+    fout.write("\n\n")
+fout.write("Project URLs: ")
+fout.write(" , ".join(prelude.urls))
+fout.write("\n\n")
+fout.write("Project licenses: ")
+fout.write(", ".join(prelude.licences))
+fout.write("\n")
+
 for idx in range(1, len(dom)):
     article = dom[idx]
     fout.write("_____\n\n")