Add frequency markers to dictd dictionary and Anki cards.
author Oleksandr Gavenko <gavenkoa@gmail.com>
Tue, 08 Nov 2016 19:01:27 +0200
changeset 646 2d488cfc4c0c
parent 645 6d4a074cea27
child 647 6ae5399c8087
Add frequency markers to dictd dictionary and Anki cards.
Makefile
py/gadict_c5.py
py/gadict_freq.py
py/gadict_srs_anki.py
--- a/Makefile	Tue Nov 08 18:12:50 2016 +0200
+++ b/Makefile	Tue Nov 08 19:01:27 2016 +0200
@@ -395,6 +395,9 @@
 ################################################################
 # Word frequency statistic.
 
+# For dictd and anki.
+FREQLIST_OPT := -freq:freq:GSL=wordlist/gsl.freq -freq:freq:AWL=wordlist/awl.freq -freq:freq:NGSL=wordlist/ngsl.freq -freq:freq:NAWL=wordlist/nawl.freq
+
 FREQ_DEP :=
 FREQ_FILTER :=
 
@@ -482,18 +485,22 @@
 
 # -B  suppress __pycache__ dir
 
-dist/dictd/gadict_en-ru.c5: gadict_en-ru+uk.gadict py/gadict.py py/gadict_c5.py | dist/dictd/
+dist/dictd/gadict_en-ru+uk.c5: gadict_en-ru+uk.gadict py/gadict.py py/gadict_c5.py $(MAKEFILE_LIST) | dist/dictd/
+	python3 -B py/gadict_c5.py $(FREQLIST_OPT)  $< $@
+	echo "gadict En-Ru+Uk"> dist/dictd/gadict_en-ru+uk.c5.name
+
+dist/dictd/gadict_en-ru.c5: gadict_en-ru+uk.gadict py/gadict.py py/gadict_c5.py $(MAKEFILE_LIST) | dist/dictd/
 	python3 -B py/gadict_c5.py  $< $@ -lang:ru
 	echo "gadict En-Ru"> dist/dictd/gadict_en-ru.c5.name
 
-dist/dictd/gadict_en-uk.c5: gadict_en-ru+uk.gadict py/gadict.py py/gadict_c5.py | dist/dictd/
+dist/dictd/gadict_en-uk.c5: gadict_en-ru+uk.gadict py/gadict.py py/gadict_c5.py $(MAKEFILE_LIST) | dist/dictd/
 	python3 -B py/gadict_c5.py  $< $@ -lang:uk
 	echo "gadict En-Uk"> dist/dictd/gadict_en-uk.c5.name
 
-dist/dictd/gadict_voa.c5: gadict_voa.gadict py/gadict.py py/gadict_c5.py | dist/dictd/
+dist/dictd/gadict_voa.c5: gadict_voa.gadict py/gadict.py py/gadict_c5.py $(MAKEFILE_LIST) | dist/dictd/
 	python3 -B py/gadict_c5.py  $< $@ -lang:en
 
-dist/dictd/%.c5: %.gadict py/gadict.py py/gadict_c5.py | dist/dictd/
+dist/dictd/%.c5: %.gadict py/gadict.py py/gadict_c5.py $(MAKEFILE_LIST) | dist/dictd/
 	python3 -B py/gadict_c5.py  $< $@
 
 dist/dictd/:
@@ -506,10 +513,10 @@
 anki: $(SRS_ANKI_FILES)
 
 dist/srs/%.apkg: %.gadict %.del py/gadict.py py/gadict_srs_anki.py $(MAKEFILE_LIST) | dist/srs/
-	PYTHONPATH=/usr/share/anki: python -B py/gadict_srs_anki.py $< $@
+	PYTHONPATH=/usr/share/anki: python -B py/gadict_srs_anki.py $(FREQLIST_OPT) $< $@
 
 dist/srs/%.apkg: %.gadict py/gadict.py py/gadict_srs_anki.py $(MAKEFILE_LIST) | dist/srs/
-	PYTHONPATH=/usr/share/anki: python -B py/gadict_srs_anki.py $< $@
+	PYTHONPATH=/usr/share/anki: python -B py/gadict_srs_anki.py $(FREQLIST_OPT) $< $@
 
 dist/srs/gadict_en-ru+uk.tab.txt: gadict_en-ru+uk.gadict py/gadict.py py/gadict_srs_tab.py $(MAKEFILE_LIST) | dist/srs/
 	python3 -B py/gadict_srs_tab.py  $< $@ -lang:ru,uk
--- a/py/gadict_c5.py	Tue Nov 08 18:12:50 2016 +0200
+++ b/py/gadict_c5.py	Tue Nov 08 19:01:27 2016 +0200
@@ -7,11 +7,13 @@
 import regex
 
 import gadict
+import gadict_freq
 
 
 FINAME = None
 FONAME = None
 LANGS = None
+FREQ_SOURCES = []
 
 # -lang:ru,uk
 ARG_LANG_RE = regex.compile("-lang:(.+)")
@@ -34,10 +36,18 @@
             continue
         m = ARG_FREQ_RE.match(arg)
         if m:
-            LANGS = set(arg.split(","))
-            for lang in LANGS:
-                if len(lang) != 2:
-                    raise Exception("Incorrect language specification: '{:s}'".format(arg))
+            mode = m.group(1)
+            tag = m.group(2)
+            fname = m.group(3)
+            with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream:
+                if mode == "var":
+                    parser = gadict_freq.HeadVarParser(stream)
+                elif mode == "freq":
+                    parser = gadict_freq.FreqlistParser(stream)
+                else:
+                    raise Exception("Unsupported mode: '{:s}'".format(mode))
+                wlist = parser.parse()
+            FREQ_SOURCES.append((tag, set(wlist)))
             continue
         if arg.startswith("-"):
             raise Exception("Unsupported option format: '{:s}'".format(arg))
@@ -98,6 +108,7 @@
 
 
 for (headwords, translations) in DOM[1:]:
+    identity = headwords[0].headword
     FOUT.write("_____\n\n")
     title = "; ".join([h.headword for h in headwords])
     FOUT.write(title)
@@ -180,3 +191,10 @@
             FOUT.write("⇒ ")
             FOUT.write(tr)
             FOUT.write("\n")
+    freqtags = []
+    for (freqtag, freqset) in FREQ_SOURCES:
+        if identity in freqset:
+            freqtags.append(freqtag)
+    if len(freqtags) > 0:
+        FOUT.write(",".join(["{{{:s}}}".format(tag) for tag in freqtags]))
+        FOUT.write("\n")
--- a/py/gadict_freq.py	Tue Nov 08 18:12:50 2016 +0200
+++ b/py/gadict_freq.py	Tue Nov 08 19:01:27 2016 +0200
@@ -68,7 +68,7 @@
             self.lineno += 1
             m = self.FREQ_RE.match(line)
             if not m:
-                raise Exception("Line {:d} is not in NUM WORD format\n".format(self.lineno, line))
+                raise Exception("Line '{:s}' #{:d} is not in NUM WORD format\n".format(line, self.lineno))
             headword = m.group(2).strip().lower()
             wlist.append(headword)
         return wlist
--- a/py/gadict_srs_anki.py	Tue Nov 08 18:12:50 2016 +0200
+++ b/py/gadict_srs_anki.py	Tue Nov 08 19:01:27 2016 +0200
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+"""Anki card writer"""
 
 import os
 import io
@@ -10,11 +11,13 @@
 import regex
 
 import gadict
+import gadict_freq
 
 
 FINAME = None
 FONAME = None
 LANGS = None
+FREQ_SOURCES = []
 
 # -lang:ru,uk
 ARG_LANG_RE = regex.compile("-lang:(.+)")
@@ -37,10 +40,18 @@
             continue
         m = ARG_FREQ_RE.match(arg)
         if m:
-            LANGS = set(arg.split(","))
-            for lang in LANGS:
-                if len(lang) != 2:
-                    raise Exception("Incorrect language specification: '{:s}'".format(arg))
+            mode = m.group(1)
+            tag = m.group(2)
+            fname = m.group(3)
+            with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream:
+                if mode == "var":
+                    parser = gadict_freq.HeadVarParser(stream)
+                elif mode == "freq":
+                    parser = gadict_freq.FreqlistParser(stream)
+                else:
+                    raise Exception("Unsupported mode: '{:s}'".format(mode))
+                wlist = parser.parse()
+            FREQ_SOURCES.append((tag, set(wlist)))
             continue
         if arg.startswith("-"):
             raise Exception("Unsupported option format: '{:s}'".format(arg))
@@ -166,6 +177,10 @@
 span.glos {
   font-size: .95em;
 }
+.freq {
+  color: red;
+  font-weight: bold;
+}
 .del {
   color: red;
   font-weight: bold;
@@ -294,6 +309,14 @@
 
     for (headwords, translations) in DOM[1:]:
         identity = headwords[0].headword
+        freqtags = []
+        for (freqtag, freqset) in FREQ_SOURCES:
+            if identity in freqset:
+                freqtags.append(freqtag)
+        freqmsg = None
+        if len(freqtags) > 0:
+            freqmsg = ",".join(freqtags)
+            freqmsg = "<div class='freq'>{:s}</div>".format(freqmsg)
         buf = []
         v1, v2, v3 = (None, None, None)
         singular, plural = (None, None)
@@ -323,6 +346,8 @@
             if 'pl' in hw.attrs:
                 plural = (hw.headword, hw.pron)
             buf.append("</div>")
+        if freqmsg:
+            buf.append(freqmsg)
         direct_from = "".join(buf)
         buf = []
         for sense in translations: