# HG changeset patch # User Oleksandr Gavenko # Date 1478654192 -7200 # Node ID 6ae5399c808730c59408fb3d8ba23379a35abb5b # Parent 2d488cfc4c0c2e11a2bce6493519fea3e5590928 Add ``rare`` attribute to headword to filter low frequency headwords out from Anki deck. diff -r 2d488cfc4c0c -r 6ae5399c8087 contrib/gadict.el --- a/contrib/gadict.el Tue Nov 08 19:01:27 2016 +0200 +++ b/contrib/gadict.el Wed Nov 09 03:16:32 2016 +0200 @@ -24,7 +24,7 @@ (defconst gadict--art-lang-regex (regexp-opt '("en" "ru" "uk" "la"))) (defconst gadict--art-rel-regex (regexp-opt '("ant" "syn" "rel" "topic" "hyper" "hypo"))) -(defconst gadict--art-var-regex (regexp-opt '("v1" "v2" "v3" "s" "pl" "male" "female" "abbr" "comp" "super" "Am" "Br" "Au"))) +(defconst gadict--art-var-regex (regexp-opt '("rare" "v1" "v2" "v3" "s" "pl" "male" "female" "abbr" "comp" "super" "Am" "Br" "Au"))) (defconst gadict--art-pos-regex (regexp-opt '("n" "v" "adj" "adv" "pron" "prep" "num" "conj" "int" "phr" "phr.v" "contr" "abbr" "prefix"))) (defgroup gadict nil diff -r 2d488cfc4c0c -r 6ae5399c8087 py/gadict.py --- a/py/gadict.py Tue Nov 08 19:01:27 2016 +0200 +++ b/py/gadict.py Wed Nov 09 03:16:32 2016 +0200 @@ -129,7 +129,7 @@ SEPARATOR_RE = regex.compile(u"^__$") HEADWORD_RE = regex.compile( u"^(\\p{L}.*)$" ) - HEADWORD_VAR_RE = regex.compile(u"^ +(s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$") + HEADWORD_VAR_RE = regex.compile(u"^ +(rare|s|pl|v[123]|male|female|comp|super|abbr|Am|Br|Au)$") HEADWORD_PRON_RE = regex.compile(u"^ +\\[([a-zˌˈːəæɒʊɪɔɜɑʌʃʧθðɡʒŋ ]+)\\]$") TRANSL_POS_RE = regex.compile(u"^(?:n|det|pron|adj|v|adv|prep|conj|num|int|phr|phr\\.v|contr|abbr|prefix)$") TRANSL_RE = regex.compile(u"^(ru|uk|la|en): ([\\p{L}(].*)$") diff -r 2d488cfc4c0c -r 6ae5399c8087 py/gadict_srs_anki.py --- a/py/gadict_srs_anki.py Tue Nov 08 19:01:27 2016 +0200 +++ b/py/gadict_srs_anki.py Wed Nov 09 03:16:32 2016 +0200 @@ -309,6 +309,8 @@ for (headwords, translations) in DOM[1:]: identity = headwords[0].headword + if 
'rare' in identity.attrs: + continue freqtags = [] for (freqtag, freqset) in FREQ_SOURCES: if identity in freqset: diff -r 2d488cfc4c0c -r 6ae5399c8087 www/HACKING.rst --- a/www/HACKING.rst Tue Nov 08 19:01:27 2016 +0200 +++ b/www/HACKING.rst Wed Nov 09 03:16:32 2016 +0200 @@ -150,6 +150,16 @@ * ``Br`` - Great Britain * ``Au`` - Australian +The ``rare`` attribute on the first headword marks the word as low frequency. +SRS file writers skip entries marked as ``rare``. I found it convenient to check +frequency with: + +https://books.google.com/ngrams/ + Google N-grams from books 1800-2010. + +As the cut-off point I chose the word ``beseech``. All less frequent words receive +the ``rare`` marker. + C5 dictionary source file format ================================