Add ``rare`` attribute to headword to filter low frequency headwords out from Anki deck.
author Oleksandr Gavenko <gavenkoa@gmail.com>
Wed, 09 Nov 2016 03:16:32 +0200
changeset 647 6ae5399c8087
parent 646 2d488cfc4c0c
child 648 09d3bfcedd0f
Add ``rare`` attribute to headword to filter low frequency headwords out from Anki deck.
contrib/gadict.el
py/gadict.py
py/gadict_srs_anki.py
www/HACKING.rst
--- a/contrib/gadict.el	Tue Nov 08 19:01:27 2016 +0200
+++ b/contrib/gadict.el	Wed Nov 09 03:16:32 2016 +0200
@@ -24,7 +24,7 @@
 
 (defconst gadict--art-lang-regex (regexp-opt '("en" "ru" "uk" "la")))
 (defconst gadict--art-rel-regex (regexp-opt '("ant" "syn" "rel" "topic" "hyper" "hypo")))
-(defconst gadict--art-var-regex (regexp-opt '("v1" "v2" "v3" "s" "pl" "male" "female" "abbr" "comp" "super" "Am" "Br" "Au")))
+(defconst gadict--art-var-regex (regexp-opt '("rare" "v1" "v2" "v3" "s" "pl" "male" "female" "abbr" "comp" "super" "Am" "Br" "Au")))
 (defconst gadict--art-pos-regex (regexp-opt '("n" "v" "adj" "adv" "pron" "prep" "num" "conj" "int" "phr" "phr.v" "contr" "abbr" "prefix")))
 
 (defgroup gadict nil
--- a/py/gadict.py	Tue Nov 08 19:01:27 2016 +0200
+++ b/py/gadict.py	Wed Nov 09 03:16:32 2016 +0200
@@ -129,7 +129,7 @@
 
     SEPARATOR_RE = regex.compile(u"^__$")
     HEADWORD_RE = regex.compile( u"^(\\p{L}.*)$" )
-    HEADWORD_VAR_RE = regex.compile(u"^ +(s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$")
+    HEADWORD_VAR_RE = regex.compile(u"^ +(rare|s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$")
     HEADWORD_PRON_RE = regex.compile(u"^ +\\[([a-zˌˈːəæɒʊɪɔɜɑʌʃʧθðɡʒŋ ]+)\\]$")
     TRANSL_POS_RE = regex.compile(u"^(?:n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\\.v|contr|abbr|prefix)$")
     TRANSL_RE = regex.compile(u"^(ru|uk|la|en): ([\\p{L}(].*)$")
--- a/py/gadict_srs_anki.py	Tue Nov 08 19:01:27 2016 +0200
+++ b/py/gadict_srs_anki.py	Wed Nov 09 03:16:32 2016 +0200
@@ -309,6 +309,8 @@
 
     for (headwords, translations) in DOM[1:]:
         identity = headwords[0].headword
+        if 'rare' in identity.attrs:
+            continue
         freqtags = []
         for (freqtag, freqset) in FREQ_SOURCES:
             if identity in freqset:
--- a/www/HACKING.rst	Tue Nov 08 19:01:27 2016 +0200
+++ b/www/HACKING.rst	Wed Nov 09 03:16:32 2016 +0200
@@ -150,6 +150,16 @@
 * ``Br`` - Great Britain
 * ``Au`` - Australian
 
+The ``rare`` attribute on the first headword marks the word as low-frequency.
+SRS file writers skip entries marked as ``rare``. I found it convenient to check
+word frequency with:
+
+https://books.google.com/ngrams/
+  Google N-grams from books 1800-2010.
+
+As the cut-off point I chose the word ``beseech``. All less frequent words
+receive the ``rare`` marker.
+
 C5 dictionary source file format
 ================================