Treat the first argument using the same spec format as the remaining arguments.
Add targets that print statistics about words missing from the wordlist.
--- a/Makefile Thu Dec 29 00:44:28 2016 +0200
+++ b/Makefile Thu Dec 29 01:17:44 2016 +0200
@@ -452,7 +452,34 @@
.PHONY: freq
freq: dist/wordlist/gadict_en-ru+uk.list py/gadict_freq.py $(FREQ_DEP) $(BUILD_SCRIPTS)
- python3 -B py/gadict_freq.py dist/wordlist/gadict_en-ru+uk.list $(FREQ_FILTER)
+ python3 -B py/gadict_freq.py +b:dist/wordlist/gadict_en-ru+uk.list $(FREQ_FILTER)
+
+.PHONY: missing
+missing: dist/wordlist/gadict_en-ru+uk.list $(FREQ_DEP) $(BUILD_SCRIPTS)
+ @echo ===== VOA =====
+ python3 -B py/gadict_freq.py -b:dist/wordlist/gadict_en-ru+uk.list +b:dist/wordlist/voa.list
+ @echo ===== GSL =====
+ python3 -B py/gadict_freq.py -b:dist/wordlist/gadict_en-ru+uk.list +f:wordlist/gsl.freq
+ @echo ===== AWL =====
+ python3 -B py/gadict_freq.py -b:dist/wordlist/gadict_en-ru+uk.list +f:wordlist/awl.freq
+ @echo ===== NGSL =====
+ python3 -B py/gadict_freq.py -b:dist/wordlist/gadict_en-ru+uk.list +f:wordlist/ngsl.freq
+ @echo ===== NAWL =====
+ python3 -B py/gadict_freq.py -b:dist/wordlist/gadict_en-ru+uk.list +f:wordlist/nawl.freq
+ @echo ===== BSL =====
+ python3 -B py/gadict_freq.py -b:dist/wordlist/gadict_en-ru+uk.list +b:wordlist/bsl.var
+ @echo ===== TSL =====
+ python3 -B py/gadict_freq.py -b:dist/wordlist/gadict_en-ru+uk.list +b:wordlist/tsl.var
+
+.PHONY: missing-stat
+missing-stat: dist/wordlist/gadict_en-ru+uk.list $(FREQ_DEP) $(BUILD_SCRIPTS)
+ @printf "%4s: %4s / %4s\n" VOA `python3 -B py/gadict_freq.py -b:dist/wordlist/gadict_en-ru+uk.list +b:dist/wordlist/voa.list | wc -l` `wc -l <dist/wordlist/voa.list`
+ @printf "%4s: %4s / %4s\n" GSL `python3 -B py/gadict_freq.py -b:dist/wordlist/gadict_en-ru+uk.list +f:wordlist/gsl.freq | wc -l` `wc -l <wordlist/gsl.freq`
+ @printf "%4s: %4s / %4s\n" AWL `python3 -B py/gadict_freq.py -b:dist/wordlist/gadict_en-ru+uk.list +f:wordlist/awl.freq | wc -l` `wc -l <wordlist/awl.freq`
+ @printf "%4s: %4s / %4s\n" NGSL `python3 -B py/gadict_freq.py -b:dist/wordlist/gadict_en-ru+uk.list +f:wordlist/ngsl.freq | wc -l` `wc -l <wordlist/ngsl.freq`
+ @printf "%4s: %4s / %4s\n" NAWL `python3 -B py/gadict_freq.py -b:dist/wordlist/gadict_en-ru+uk.list +f:wordlist/nawl.freq | wc -l` `wc -l <wordlist/nawl.freq`
+ @printf "%4s: %4s / %4s\n" BSL `python3 -B py/gadict_freq.py -b:dist/wordlist/gadict_en-ru+uk.list +b:wordlist/bsl.var | wc -l` `wc -l <wordlist/bsl.var`
+ @printf "%4s: %4s / %4s\n" TSL `python3 -B py/gadict_freq.py -b:dist/wordlist/gadict_en-ru+uk.list +b:wordlist/tsl.var | wc -l` `wc -l <wordlist/tsl.var`
dist/wordlist/%.list: %.gadict py/gadict_headwords.py $(BUILD_SCRIPTS) | dist/wordlist/
python3 -B py/gadict_headwords.py $< $@
--- a/py/gadict_freq.py Thu Dec 29 00:44:28 2016 +0200
+++ b/py/gadict_freq.py Thu Dec 29 01:17:44 2016 +0200
@@ -19,7 +19,7 @@
wlist.append(line)
return wlist
-class HeadVarParser:
+class WordformParser:
BASEVAR_RE = regex.compile(u"^(\t)?(.*)$")
@@ -80,17 +80,12 @@
raise Exception(USAGE)
FINAME = sys.argv[1]
- with io.open(FINAME, mode='r', buffering=1, encoding="utf-8") as stream:
- parser = WordlistParser(stream)
- HEADWORDS = parser.parse()
-
COMMAND_RE = regex.compile("([-+])([0-9]+)?([bf]):([^:]+)")
-
IN_SET = set()
EX_SET = set()
- for idx in range(2, len(sys.argv)):
+ for idx in range(1, len(sys.argv)):
spec = sys.argv[idx]
m = COMMAND_RE.match(spec)
if not m:
@@ -102,7 +97,7 @@
limit = int(limit)
with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream:
if mode == "b":
- parser = HeadVarParser(stream, limit)
+ parser = WordformParser(stream, limit)
elif mode == "f":
parser = FreqlistParser(stream, limit)
else:
@@ -112,16 +107,14 @@
except:
print("Error during processing: {:s}".format(fname))
raise
- wlist = set(wlist)
+ wlist = set([w.lower() for w in wlist])
if m.group(1) == "+":
IN_SET |= wlist
else:
EX_SET |= wlist
- for headword in HEADWORDS:
- if any(c in headword for c in " '."):
- continue
- normilized= headword.lower()
- if (len(IN_SET) == 0 or normilized in IN_SET) and not normilized in EX_SET:
- print(headword)
+ for headword in IN_SET - EX_SET:
+ # if any(c in headword for c in " '."):
+ # continue
+ print(headword)