Search rare words in gadict.
author Oleksandr Gavenko <gavenkoa@gmail.com>
Tue, 08 Nov 2016 17:44:04 +0200
changeset 643 c2c32f45dde6
parent 642 c1032aea6265
child 644 e38cd6112193
Search rare words in gadict.
Makefile
py/gadict_freq.py
py/gadict_headwords.py
--- a/Makefile	Tue Nov 08 17:39:22 2016 +0200
+++ b/Makefile	Tue Nov 08 17:44:04 2016 +0200
@@ -393,6 +393,64 @@
 	mkdir -p $@
 
 ################################################################
+# Word frequency statistics.
+# FREQ_DEP: prerequisites of "freq"; FREQ_FILTER: filter specs for gadict_freq.py.
+FREQ_DEP :=
+FREQ_FILTER :=
+
+# BNC_COCA_BASEWORD := $(wildcard wordlist/bnc+coca/basewrd[0-1]?.txt)
+# FREQ_DEP += $(BNC_COCA_BASEWORD)
+# FREQ_FILTER += $(patsubst %,-b:%,$(BNC_COCA_BASEWORD))
+
+GSL_BASEWORD := $(wildcard wordlist/gsl_1000_*.var)
+# FREQ_DEP += $(GSL_BASEWORD)
+# FREQ_FILTER += $(patsubst %,-b:%,$(GSL_BASEWORD))
+FREQ_DEP += dist/wordlist/gsl.var
+FREQ_FILTER += -b:dist/wordlist/gsl.var
+
+NGSL_BASEWORD := $(wildcard wordlist/ngsl_1000_[1-3].var wordlist/ngsl_supplemental.var)
+# FREQ_DEP += $(NGSL_BASEWORD)
+# FREQ_FILTER += $(patsubst %,-b:%,$(NGSL_BASEWORD))
+FREQ_DEP += dist/wordlist/ngsl.var
+FREQ_FILTER += -b:dist/wordlist/ngsl.var
+
+# FAMOUS_FREQLIST := wordlist/awl.freq wordlist/gsl.freq wordlist/nawl.freq wordlist/ngsl.freq
+# FREQ_FILTER += $(patsubst %,-f:%,$(FAMOUS_FREQLIST))
+
+AWL_BASEWORD := wordlist/awl.var
+FREQ_DEP += $(AWL_BASEWORD)
+FREQ_FILTER += $(patsubst %,-b:%,$(AWL_BASEWORD))
+
+NAWL_BASEWORD := wordlist/nawl.var
+FREQ_DEP += $(NAWL_BASEWORD)
+FREQ_FILTER += $(patsubst %,-b:%,$(NAWL_BASEWORD))
+
+VOA_BASEWORD := dist/wordlist/voa.list
+FREQ_DEP += $(VOA_BASEWORD)
+FREQ_FILTER += $(patsubst %,-b:%,$(VOA_BASEWORD))
+# Also exclude the words found in the first 12000 lines of wordlist/oanc.freq.
+FREQ_FILTER += -12000f:wordlist/oanc.freq
+# Print the gadict headwords that are absent from every list in FREQ_FILTER.
+.PHONY: freq
+freq: dist/wordlist/gadict_en-ru+uk.list py/gadict_freq.py $(FREQ_DEP) $(BUILD_SCRIPTS)
+	python3 -B py/gadict_freq.py dist/wordlist/gadict_en-ru+uk.list $(FREQ_FILTER)
+# Extract the headwords of a *.gadict file into dist/wordlist/*.list.
+dist/wordlist/%.list: %.gadict py/gadict_headwords.py $(BUILD_SCRIPTS) | dist/wordlist/
+	python3 -B py/gadict_headwords.py  $< $@
+# Concatenate the wordlist/gsl_1000_*.var files into a single list.
+dist/wordlist/gsl.var: $(GSL_BASEWORD) | dist/wordlist/
+	cat $(GSL_BASEWORD) >$@
+# Concatenate the NGSL files matched by NGSL_BASEWORD into a single list.
+dist/wordlist/ngsl.var: $(NGSL_BASEWORD) | dist/wordlist/
+	cat $(NGSL_BASEWORD) >$@
+# Headwords of gadict_voa.gadict; the script writes to stdout here.
+dist/wordlist/voa.list: gadict_voa.gadict py/gadict_headwords.py | dist/wordlist/
+	python3 -B py/gadict_headwords.py gadict_voa.gadict >$@
+# Output directory, used as an order-only prerequisite by the rules above it.
+dist/wordlist/:
+	mkdir -p $@
+
+################################################################
 # Build targets.
 
 .PHONY: all
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/py/gadict_freq.py	Tue Nov 08 17:44:04 2016 +0200
@@ -0,0 +1,127 @@
+
+import sys
+import codecs
+import io
+import regex
+
+class WordlistParser:
+    """Read a plain word list: one entry per line of a text stream."""
+
+    def __init__(self, stream):
+        self.stream = stream  # must provide readline()
+
+    def parse(self):
+        """Return all lines of the stream, stripped, in their original order."""
+        entries = []
+        while True:
+            raw = self.stream.readline()
+            if not raw:  # readline() returns an empty value only at end of input
+                return entries
+            entries.append(raw.strip())
+
+class BasewordParser:
+    """Read a base-word list; lines starting with a TAB do not count to the limit."""
+
+    BASEWORD_RE = regex.compile(u"^(\t)?(.*)$")
+
+    def __init__(self, stream, limit):
+        self.stream = stream
+        self.limit = limit  # max number of non-TAB lines; a falsy value means no limit
+        self.lineno = 0     # lines read so far, used in error messages
+        self.cnt = 0        # lines seen so far that do not start with a TAB
+
+    def parse(self):
+        """Return the stripped, lower-cased words read before the limit is exceeded."""
+        words = []
+        while True:
+            text = self.stream.readline()
+            if not text:
+                break
+            self.lineno += 1
+            found = self.BASEWORD_RE.match(text)
+            if found is None:
+                raise Exception("Line {:d}: '{:s}' wrong format\n".format(self.lineno, text))
+            indent, word = found.groups()
+            self.cnt += 0 if indent else 1
+            if self.limit and self.cnt > self.limit:
+                break
+            words.append(word.strip().lower())
+        return words
+
+class FreqlistParser:
+    """Read a frequency list whose lines have the form ``NUM WORD``."""
+
+    FREQ_RE = regex.compile(u"^([0-9]+) (.*)$")
+
+    def __init__(self, stream, limit):
+        self.stream = stream
+        self.limit = limit  # max number of lines to read; a falsy value means no limit
+        self.lineno = 0     # lines read so far, used in error messages
+
+    def parse(self):
+        """Return the stripped, lower-cased words of at most ``limit`` lines."""
+        words = []
+        # The limit is tested before each read, so no extra line is consumed.
+        while not (self.limit and self.lineno >= self.limit):
+            text = self.stream.readline()
+            if not text:
+                break
+            self.lineno += 1
+            found = self.FREQ_RE.match(text)
+            if found is None:
+                raise Exception("Line {:d} is not in NUM WORD format\n".format(self.lineno, text))
+            words.append(found.group(2).strip().lower())
+        return words
+
+if __name__ == '__main__':
+    # Print the headwords of $WORDLIST that pass the +/- FREQLIST filters.
+    USAGE = "Usage: PROG  $WORDLIST  [+/-][NUM][b/f]:FREQLIST..."
+
+    if len(sys.argv) < 3:
+        raise Exception(USAGE)
+    FINAME = sys.argv[1]
+
+    with io.open(FINAME, mode='r', buffering=1, encoding="utf-8") as stream:
+        parser = WordlistParser(stream)
+        HEADWORDS = parser.parse()
+
+    # Spec syntax: SIGN, optional NUM limit, MODE (b or f), ":" and the file name.
+    COMMAND_RE = regex.compile("([-+])([0-9]+)?([bf]):([^:]+)")
+
+    IN_SET = set()  # words from "+" lists; when non-empty only these are printed
+    EX_SET = set()  # words from "-" lists; never printed
+
+    for spec in sys.argv[2:]:
+        m = COMMAND_RE.match(spec)
+        if not m:
+            raise Exception("Wrong FREQLIST spec: '{:s}'\n".format(spec) + USAGE)
+        sign, limit, mode, fname = m.groups()
+        if limit:
+            limit = int(limit)
+        with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream:
+            if mode == "b":
+                parser = BasewordParser(stream, limit)
+            elif mode == "f":
+                parser = FreqlistParser(stream, limit)
+            else:
+                # Unreachable while COMMAND_RE only admits b/f; kept as a safeguard.
+                raise Exception("Unknown mode in specification...")
+            try:
+                wlist = set(parser.parse())
+            except Exception:
+                # Name the offending file, then let the original error propagate.
+                print("Error during processing: {:s}".format(fname))
+                raise
+        if sign == "+":
+            IN_SET |= wlist
+        else:
+            EX_SET |= wlist
+
+    for headword in HEADWORDS:
+        # Skip headwords containing a space, an apostrophe or a period.
+        if any(c in headword for c in " '."):
+            continue
+        normalized = headword.lower()
+        if (not IN_SET or normalized in IN_SET) and normalized not in EX_SET:
+            print(headword)
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/py/gadict_headwords.py	Tue Nov 08 17:44:04 2016 +0200
@@ -0,0 +1,70 @@
+
+import sys
+import codecs
+import io
+import regex
+
+# Usage: PROG INPUT [OUTPUT]; without OUTPUT the headwords go to stdout.
+if len(sys.argv) < 2:
+    # Fail with a usage message instead of a TypeError from io.open(None).
+    raise Exception("Usage: PROG  $GADICT  [$OUTPUT]")
+FINAME = sys.argv[1]
+FONAME = sys.argv[2] if len(sys.argv) >= 3 else None
+
+FIN = io.open(FINAME, mode='r', buffering=1, encoding="utf-8")
+if FONAME is None:
+    FOUT = sys.stdout
+else:
+    FOUT = codecs.open(FONAME, "w", "utf-8")
+
+
+class GadictParser:
+    """Collect the headword line that follows each ``__`` separator in a stream."""
+
+    SEPARATOR_RE = regex.compile(u"^__$")
+    EMPTY_RE = regex.compile(u"^$")
+    HEADWORD_RE = regex.compile(u"^(\\p{L}.*)$")
+
+    def __init__(self, stream):
+        self.stream = stream
+        self.lineno = 0  # lines read so far, used in error messages
+
+    def _readline(self):
+        """Read one line and count it; an empty result means end of input."""
+        text = self.stream.readline()
+        if text:
+            self.lineno += 1
+        return text
+
+    def parse(self):
+        """Return the stripped line found after every ``__`` + empty-line pair."""
+        headwords = []
+        while True:
+            text = self._readline()
+            if not text:
+                break
+            if self.SEPARATOR_RE.match(text) is None:
+                continue  # not a separator line; keep scanning
+            text = self._readline()
+            if not text:
+                break
+            if self.EMPTY_RE.match(text) is None:
+                raise Exception("Line {:d}: '{:s}' is not empty line\n".format(self.lineno, text))
+            text = self._readline()
+            if not text:
+                break
+            text = text.strip()
+            if self.HEADWORD_RE.match(text) is None:
+                raise Exception("Line {:d}: '{:s}' is not a headword\n".format(self.lineno, text))
+            headwords.append(text)
+        return headwords
+
+# Write one headword per line; the streams are closed even if parsing fails.
+try:
+    headwords = GadictParser(FIN).parse()
+    for entry in headwords:
+        FOUT.write(entry + "\n")
+finally:
+    FIN.close()
+    FOUT.close()
+