Add ``rare`` attribute to headwords to filter low-frequency headwords out of
the Anki deck.
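
For illustration, an entry marked this way might look as follows in the
dictionary source. This is only a sketch inferred from the parser regexes
touched below (HEADWORD_RE, HEADWORD_VAR_RE, HEADWORD_PRON_RE, TRANSL_RE);
the exact spacing and field order of real entries may differ:

  beseech
      rare
      [bɪˈsiːʧ]
  v
  ru: умолять
  en: to ask for something urgently

Per the HACKING.rst note below, the SRS (Anki) writer skips such entries.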
--- a/contrib/gadict.el Tue Nov 08 19:01:27 2016 +0200
+++ b/contrib/gadict.el Wed Nov 09 03:16:32 2016 +0200
@@ -24,7 +24,7 @@
(defconst gadict--art-lang-regex (regexp-opt '("en" "ru" "uk" "la")))
(defconst gadict--art-rel-regex (regexp-opt '("ant" "syn" "rel" "topic" "hyper" "hypo")))
-(defconst gadict--art-var-regex (regexp-opt '("v1" "v2" "v3" "s" "pl" "male" "female" "abbr" "comp" "super" "Am" "Br" "Au")))
+(defconst gadict--art-var-regex (regexp-opt '("rare" "v1" "v2" "v3" "s" "pl" "male" "female" "abbr" "comp" "super" "Am" "Br" "Au")))
(defconst gadict--art-pos-regex (regexp-opt '("n" "v" "adj" "adv" "pron" "prep" "num" "conj" "int" "phr" "phr.v" "contr" "abbr" "prefix")))
(defgroup gadict nil
--- a/py/gadict.py Tue Nov 08 19:01:27 2016 +0200
+++ b/py/gadict.py Wed Nov 09 03:16:32 2016 +0200
@@ -129,7 +129,7 @@
SEPARATOR_RE = regex.compile(u"^__$")
HEADWORD_RE = regex.compile( u"^(\\p{L}.*)$" )
- HEADWORD_VAR_RE = regex.compile(u"^ +(s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$")
+ HEADWORD_VAR_RE = regex.compile(u"^ +(rare|s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$")
HEADWORD_PRON_RE = regex.compile(u"^ +\\[([a-zˌˈːəæɒʊɪɔɜɑʌʃʧθðɡʒŋ ]+)\\]$")
TRANSL_POS_RE = regex.compile(u"^(?:n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\\.v|contr|abbr|prefix)$")
TRANSL_RE = regex.compile(u"^(ru|uk|la|en): ([\\p{L}(].*)$")
--- a/py/gadict_srs_anki.py Tue Nov 08 19:01:27 2016 +0200
+++ b/py/gadict_srs_anki.py Wed Nov 09 03:16:32 2016 +0200
@@ -309,6 +309,8 @@
for (headwords, translations) in DOM[1:]:
identity = headwords[0].headword
+ if 'rare' in identity.attrs:
+ continue
freqtags = []
for (freqtag, freqset) in FREQ_SOURCES:
if identity in freqset:
--- a/www/HACKING.rst Tue Nov 08 19:01:27 2016 +0200
+++ b/www/HACKING.rst Wed Nov 09 03:16:32 2016 +0200
@@ -150,6 +150,16 @@
* ``Br`` - Great Britain
* ``Au`` - Australian
+The ``rare`` attribute on the first headword marks a word as low frequency.
+SRS file writers skip entries marked as ``rare``. I found it convenient to
+check word frequency with:
+
+https://books.google.com/ngrams/
+ Google N-grams from books 1800-2010.
+
+As the cut-off point I chose the word ``beseech``. All less frequent words
+receive the ``rare`` marker.
+
C5 dictionary source file format
================================