# HG changeset patch # User Oleksandr Gavenko # Date 1478624487 -7200 # Node ID 2d488cfc4c0c2e11a2bce6493519fea3e5590928 # Parent 6d4a074cea27b3e9f91ee1a081b5c5d1e0edc344 Add frequency markers to dictd dictionary and Anki cards. diff -r 6d4a074cea27 -r 2d488cfc4c0c Makefile --- a/Makefile Tue Nov 08 18:12:50 2016 +0200 +++ b/Makefile Tue Nov 08 19:01:27 2016 +0200 @@ -395,6 +395,9 @@ ################################################################ # Word frequency statistic. +# For dictd and anki. +FREQLIST_OPT := -freq:freq:GSL=wordlist/gsl.freq -freq:freq:AWL=wordlist/awl.freq -freq:freq:NGSL=wordlist/ngsl.freq -freq:freq:NAWL=wordlist/nawl.freq + FREQ_DEP := FREQ_FILTER := @@ -482,18 +485,22 @@ # -B suppress __pycache__ dir -dist/dictd/gadict_en-ru.c5: gadict_en-ru+uk.gadict py/gadict.py py/gadict_c5.py | dist/dictd/ +dist/dictd/gadict_en-ru+uk.c5: gadict_en-ru+uk.gadict py/gadict.py py/gadict_c5.py $(MAKEFILE_LIST) | dist/dictd/ + python3 -B py/gadict_c5.py $(FREQLIST_OPT) $< $@ + echo "gadict En-Ru+Uk"> dist/dictd/gadict_en-ru+uk.c5.name + +dist/dictd/gadict_en-ru.c5: gadict_en-ru+uk.gadict py/gadict.py py/gadict_c5.py $(MAKEFILE_LIST) | dist/dictd/ python3 -B py/gadict_c5.py $< $@ -lang:ru echo "gadict En-Ru"> dist/dictd/gadict_en-ru.c5.name -dist/dictd/gadict_en-uk.c5: gadict_en-ru+uk.gadict py/gadict.py py/gadict_c5.py | dist/dictd/ +dist/dictd/gadict_en-uk.c5: gadict_en-ru+uk.gadict py/gadict.py py/gadict_c5.py $(MAKEFILE_LIST) | dist/dictd/ python3 -B py/gadict_c5.py $< $@ -lang:uk echo "gadict En-Uk"> dist/dictd/gadict_en-uk.c5.name -dist/dictd/gadict_voa.c5: gadict_voa.gadict py/gadict.py py/gadict_c5.py | dist/dictd/ +dist/dictd/gadict_voa.c5: gadict_voa.gadict py/gadict.py py/gadict_c5.py $(MAKEFILE_LIST) | dist/dictd/ python3 -B py/gadict_c5.py $< $@ -lang:en -dist/dictd/%.c5: %.gadict py/gadict.py py/gadict_c5.py | dist/dictd/ +dist/dictd/%.c5: %.gadict py/gadict.py py/gadict_c5.py $(MAKEFILE_LIST) | dist/dictd/ python3 -B py/gadict_c5.py $< $@ dist/dictd/: @@ -506,10 +513,10 @@ anki: $(SRS_ANKI_FILES) dist/srs/%.apkg: %.gadict %.del py/gadict.py py/gadict_srs_anki.py $(MAKEFILE_LIST) | dist/srs/ - PYTHONPATH=/usr/share/anki: python -B py/gadict_srs_anki.py $< $@ + PYTHONPATH=/usr/share/anki: python -B py/gadict_srs_anki.py $(FREQLIST_OPT) $< $@ dist/srs/%.apkg: %.gadict py/gadict.py py/gadict_srs_anki.py $(MAKEFILE_LIST) | dist/srs/ - PYTHONPATH=/usr/share/anki: python -B py/gadict_srs_anki.py $< $@ + PYTHONPATH=/usr/share/anki: python -B py/gadict_srs_anki.py $(FREQLIST_OPT) $< $@ dist/srs/gadict_en-ru+uk.tab.txt: gadict_en-ru+uk.gadict py/gadict.py py/gadict_srs_tab.py $(MAKEFILE_LIST) | dist/srs/ python3 -B py/gadict_srs_tab.py $< $@ -lang:ru,uk diff -r 6d4a074cea27 -r 2d488cfc4c0c py/gadict_c5.py --- a/py/gadict_c5.py Tue Nov 08 18:12:50 2016 +0200 +++ b/py/gadict_c5.py Tue Nov 08 19:01:27 2016 +0200 @@ -7,11 +7,13 @@ import regex import gadict +import gadict_freq FINAME = None FONAME = None LANGS = None +FREQ_SOURCES = [] # -lang:ru,uk ARG_LANG_RE = regex.compile("-lang:(.+)") @@ -34,10 +36,18 @@ continue m = ARG_FREQ_RE.match(arg) if m: - LANGS = set(arg.split(",")) - for lang in LANGS: - if len(lang) != 2: - raise Exception("Incorrect language specification: '{:s}'".format(arg)) + mode = m.group(1) + tag = m.group(2) + fname = m.group(3) + with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream: + if mode == "var": + parser = gadict_freq.HeadVarParser(stream) + elif mode == "freq": + parser = gadict_freq.FreqlistParser(stream) + else: + raise Exception("Unsupported mode: '{:s}'".format(mode)) + wlist = parser.parse() + FREQ_SOURCES.append((tag, set(wlist))) continue if arg.startswith("-"): raise Exception("Unsupported option format: '{:s}'".format(arg)) @@ -98,6 +108,7 @@ for (headwords, translations) in DOM[1:]: + identity = headwords[0].headword FOUT.write("_____\n\n") title = "; ".join([h.headword for h in headwords]) FOUT.write(title) @@ -180,3 +191,10 @@ FOUT.write("⇒ ") FOUT.write(tr) FOUT.write("\n") + freqtags = [] + for (freqtag, freqset) in FREQ_SOURCES: + if identity in freqset: + freqtags.append(freqtag) + if len(freqtags) > 0: + FOUT.write(",".join(["{{{:s}}}".format(tag) for tag in freqtags])) + FOUT.write("\n") diff -r 6d4a074cea27 -r 2d488cfc4c0c py/gadict_freq.py --- a/py/gadict_freq.py Tue Nov 08 18:12:50 2016 +0200 +++ b/py/gadict_freq.py Tue Nov 08 19:01:27 2016 +0200 @@ -68,7 +68,7 @@ self.lineno += 1 m = self.FREQ_RE.match(line) if not m: - raise Exception("Line {:d} is not in NUM WORD format\n".format(self.lineno, line)) + raise Exception("Line '{:s}' #{:d} is not in NUM WORD format\n".format(line, self.lineno)) headword = m.group(2).strip().lower() wlist.append(headword) return wlist diff -r 6d4a074cea27 -r 2d488cfc4c0c py/gadict_srs_anki.py --- a/py/gadict_srs_anki.py Tue Nov 08 18:12:50 2016 +0200 +++ b/py/gadict_srs_anki.py Tue Nov 08 19:01:27 2016 +0200 @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +"""Anki card writer""" import os import io @@ -10,11 +11,13 @@ import regex import gadict +import gadict_freq FINAME = None FONAME = None LANGS = None +FREQ_SOURCES = [] # -lang:ru,uk ARG_LANG_RE = regex.compile("-lang:(.+)") @@ -37,10 +40,18 @@ continue m = ARG_FREQ_RE.match(arg) if m: - LANGS = set(arg.split(",")) - for lang in LANGS: - if len(lang) != 2: - raise Exception("Incorrect language specification: '{:s}'".format(arg)) + mode = m.group(1) + tag = m.group(2) + fname = m.group(3) + with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream: + if mode == "var": + parser = gadict_freq.HeadVarParser(stream) + elif mode == "freq": + parser = gadict_freq.FreqlistParser(stream) + else: + raise Exception("Unsupported mode: '{:s}'".format(mode)) + wlist = parser.parse() + FREQ_SOURCES.append((tag, set(wlist))) continue if arg.startswith("-"): raise Exception("Unsupported option format: '{:s}'".format(arg)) @@ -166,6 +177,10 @@ span.glos { font-size: .95em; } +.freq { + color: red; + font-weight: bold; +} .del { color: red; font-weight: bold; @@ -294,6 +309,14 @@ for (headwords, translations) in DOM[1:]: identity = headwords[0].headword + freqtags = [] + for (freqtag, freqset) in FREQ_SOURCES: + if identity in freqset: + freqtags.append(freqtag) + freqmsg = None + if len(freqtags) > 0: + freqmsg = ",".join(freqtags) + freqmsg = "
{:s}
".format(freqmsg) buf = [] v1, v2, v3 = (None, None, None) singular, plural = (None, None) @@ -323,6 +346,8 @@ if 'pl' in hw.attrs: plural = (hw.headword, hw.pron) buf.append("") + if freqmsg: + buf.append(freqmsg) direct_from = "".join(buf) buf = [] for sense in translations: