Parse dictionary metainfo.
author Oleksandr Gavenko <gavenkoa@gmail.com>
Sun, 27 Mar 2016 23:57:43 +0300
changeset 402 b47698d5ccab
parent 401 791994f95561
child 403 241ecfe2ca3c
Parse dictionary metainfo.
py/gadict.py
py/gadict_c5.py
--- a/py/gadict.py	Sun Mar 27 23:32:42 2016 +0300
+++ b/py/gadict.py	Sun Mar 27 23:57:43 2016 +0300
@@ -2,6 +2,14 @@
 import regex
 
 
+class Prelude:
+    """Dictionary metainfo: name, about text, URLs, authors and licences."""
+
+    def __init__(self):
+        self.name, self.about = None, ""
+        self.urls, self.authors, self.licences = [], [], []  # fresh lists per instance, not shared class attributes
+
+
 class ParseException(BaseException):
 
     def __init__(self, msg, lineno = None, line = None):
@@ -28,8 +36,16 @@
     TRANSL_RE = regex.compile(r"^(ru|uk|la|en): ([\p{L}(][\p{L}\p{P}~ ]*)$")
     TRANSL_EX_RE = regex.compile(r"^(ru|uk|la|en)> (\p{L}.*)$")
 
+    CONT_RE = regex.compile(r"^ +(.*)")
+
     TRAILING_SPACES_RE = regex.compile(r"\p{Z}+$")
 
+    PRELUDE_NAME_RE = regex.compile(r"^name: (.*)")
+    PRELUDE_URL_RE = regex.compile(r"^url: (.*)")
+    PRELUDE_AUTHOR_RE = regex.compile(r"^by: (.*)")
+    PRELUDE_LICENSE_RE = regex.compile(r"^term: (.*)")
+    PRELUDE_ABOUT_RE = regex.compile(r"^about: ?(.*)")
+
     def __init__(self):
         pass
 
@@ -53,14 +69,51 @@
             raise ParseException(ex.msg, self.lineno, self.line) from ex
         return self.dom
 
+    def parse_continuation(self):
+        """Read indented or blank follow-up lines; return them, each prefixed by a newline."""
+        parts = []
+        while True:
+            self.readline()
+            if self.eof:
+                break
+            # CONT_RE is a class attribute: the bare name raised NameError here.
+            m = self.CONT_RE.match(self.line)
+            if m is None and len(self.line) != 1:
+                break  # neither indented nor a one-character (presumably empty) line
+            parts.append("\n" + (m.group(1) if m is not None else ""))
+        return "".join(parts)
+
     def parse_prelude(self):
         """Read dictionary prelude until first "__" delimiter."""
+        pre = Prelude()  # collects the metainfo found before the first separator
         while True:
             self.readline()
             if self.eof:
                 raise ParseException("There are no articles...")
+            m = self.PRELUDE_ABOUT_RE.match(self.line)
+            if m:
+                pre.about += m.group(1) + self.parse_continuation()  # first line plus its continuation lines
+                if self.eof:  # the continuation ran to the end of input
+                    raise ParseException("There are no articles...")  # no "continue": the line that ended the continuation is checked below
             if self.SEPARATOR_RE.match(self.line):
                 break
+            m = self.PRELUDE_NAME_RE.match(self.line)
+            if m:
+                pre.name = m.group(1)  # a later "name:" line overwrites an earlier one
+                continue
+            m = self.PRELUDE_URL_RE.match(self.line)
+            if m:
+                pre.urls.append(m.group(1))  # "url:" lines may repeat; every value is kept
+                continue
+            m = self.PRELUDE_AUTHOR_RE.match(self.line)
+            if m:
+                pre.authors.append(m.group(1))  # "by:" lines may repeat; every value is kept
+                continue
+            m = self.PRELUDE_LICENSE_RE.match(self.line)
+            if m:
+                pre.licences.append(m.group(1))  # "term:" lines may repeat; every value is kept
+                continue  # lines matching none of the patterns are skipped without error
+        self.dom.append(pre)  # gadict_c5.py reads the prelude back as dom[0]
 
     def parse_article(self):
         """Try to match article until next "__" delimiter. Assume that `self.line` point to "__" delimiter."""
--- a/py/gadict_c5.py	Sun Mar 27 23:32:42 2016 +0300
+++ b/py/gadict_c5.py	Sun Mar 27 23:57:43 2016 +0300
@@ -30,6 +30,28 @@
 finally:
     fin.close()
 
+# Emit the 00-database-* header entries from the prelude (first DOM item).
+prelude = dom[0]
+if prelude.name is not None:
+    fout.write("_____\n\n00-database-short\n")
+    fout.write(prelude.name + "\n")
+if prelude.urls:
+    # One URL per line.
+    fout.write("_____\n\n00-database-url\n")
+    for url in prelude.urls:
+        fout.write(url + "\n")
+# The info entry is written unconditionally; only its name line is optional.
+fout.write("_____\n\n00-database-info\n")
+if prelude.name is not None:
+    fout.write("Dictionary name: " + prelude.name + "\n\n")
+# Note the separators differ: " , " for URLs, ", " for licences.
+fout.write("Project URLs: ")
+fout.write(" , ".join(prelude.urls))
+fout.write("\n\n")
+fout.write("Project licenses: ")
+fout.write(", ".join(prelude.licences))
+fout.write("\n")
+
 for idx in range(1, len(dom)):
     article = dom[idx]
     fout.write("_____\n\n")