# HG changeset patch
# User Oleksandr Gavenko
# Date 1478619844 -7200
# Node ID c2c32f45dde6da9ea70dea442aadf1d2c72af2d6
# Parent  c1032aea6265c44463b8fe1ef842d8cd15e0b17f
Search rare words in gadict.

diff -r c1032aea6265 -r c2c32f45dde6 Makefile
--- a/Makefile	Tue Nov 08 17:39:22 2016 +0200
+++ b/Makefile	Tue Nov 08 17:44:04 2016 +0200
@@ -393,6 +393,64 @@
 	mkdir -p $@
 
 ################################################################
+# Word frequency statistic.
+
+FREQ_DEP :=
+FREQ_FILTER :=
+
+# BNC_COCA_BASEWORD := $(wildcard wordlist/bnc+coca/basewrd[0-1]?.txt)
+# FREQ_DEP += $(BNC_COCA_BASEWORD)
+# FREQ_FILTER += $(patsubst %,-b:%,$(BNC_COCA_BASEWORD))
+
+GSL_BASEWORD := $(wildcard wordlist/gsl_1000_*.var)
+# FREQ_DEP += $(GSL_BASEWORD)
+# FREQ_FILTER += $(patsubst %,-b:%,$(GSL_BASEWORD))
+FREQ_DEP += dist/wordlist/gsl.var
+FREQ_FILTER += -b:dist/wordlist/gsl.var
+
+NGSL_BASEWORD := $(wildcard wordlist/ngsl_1000_[1-3].var wordlist/ngsl_supplemental.var)
+# FREQ_DEP += $(NGSL_BASEWORD)
+# FREQ_FILTER += $(patsubst %,-b:%,$(NGSL_BASEWORD))
+FREQ_DEP += dist/wordlist/ngsl.var
+FREQ_FILTER += -b:dist/wordlist/ngsl.var
+
+# FAMOUS_FREQLIST := wordlist/awl.freq wordlist/gsl.freq wordlist/nawl.freq wordlist/ngsl.freq
+# FREQ_FILTER += $(patsubst %,-f:%,$(FAMOUS_FREQLIST))
+
+AWL_BASEWORD := wordlist/awl.var
+FREQ_DEP += $(AWL_BASEWORD)
+FREQ_FILTER += $(patsubst %,-b:%,$(AWL_BASEWORD))
+
+NAWL_BASEWORD := wordlist/nawl.var
+FREQ_DEP += $(NAWL_BASEWORD)
+FREQ_FILTER += $(patsubst %,-b:%,$(NAWL_BASEWORD))
+
+VOA_BASEWORD := dist/wordlist/voa.list
+FREQ_DEP += $(VOA_BASEWORD)
+FREQ_FILTER += $(patsubst %,-b:%,$(VOA_BASEWORD))
+
+FREQ_FILTER += -12000f:wordlist/oanc.freq
+
+.PHONY: freq
+freq: dist/wordlist/gadict_en-ru+uk.list py/gadict_freq.py $(FREQ_DEP) $(BUILD_SCRIPTS)
+	python3 -B py/gadict_freq.py dist/wordlist/gadict_en-ru+uk.list $(FREQ_FILTER)
+
+dist/wordlist/%.list: %.gadict py/gadict_headwords.py $(BUILD_SCRIPTS) | dist/wordlist/
+	python3 -B py/gadict_headwords.py $< $@
+
+dist/wordlist/gsl.var: $(GSL_BASEWORD) | dist/wordlist/
+	cat $(GSL_BASEWORD) >$@
+
+dist/wordlist/ngsl.var: $(NGSL_BASEWORD) | dist/wordlist/
+	cat $(NGSL_BASEWORD) >$@
+
+dist/wordlist/voa.list: gadict_voa.gadict py/gadict_headwords.py | dist/wordlist/
+	python3 -B py/gadict_headwords.py gadict_voa.gadict >$@
+
+dist/wordlist/:
+	mkdir -p $@
+
+################################################################
 # Build targets.
 
 .PHONY: all
diff -r c1032aea6265 -r c2c32f45dde6 py/gadict_freq.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/py/gadict_freq.py	Tue Nov 08 17:44:04 2016 +0200
@@ -0,0 +1,136 @@
+
+"""Print headwords of a word list that pass the given FREQLIST filters."""
+
+import sys
+import codecs
+import io
+import regex
+
+class WordlistParser:
+    """Reads a plain word list, one entry per line."""
+
+    def __init__(self, stream):
+        self.stream = stream
+
+    def parse(self):
+        """Return every line of the stream with surrounding whitespace stripped."""
+        wlist = []
+        while True:
+            line = self.stream.readline()
+            if len(line) == 0:
+                break
+            line = line.strip()
+            wlist.append(line)
+        return wlist
+
+class BasewordParser:
+    """Reads a base-word list; lines starting with a TAB do not count towards the limit."""
+
+    BASEWORD_RE = regex.compile(u"^(\t)?(.*)$")
+
+    def __init__(self, stream, limit):
+        self.stream = stream
+        self.limit = limit
+        self.lineno = 0
+        self.cnt = 0
+
+    def parse(self):
+        """Return lowercased entries, stopping at the first non-TAB line past `limit`."""
+        wlist = []
+        while True:
+            line = self.stream.readline()
+            if len(line) == 0:
+                break
+            self.lineno += 1
+            m = self.BASEWORD_RE.match(line)
+            if not m:
+                raise Exception("Line {:d}: '{:s}' wrong format\n".format(self.lineno, line))
+            tab = m.group(1)
+            if not tab:
+                self.cnt += 1
+            if self.limit and self.cnt > self.limit:
+                break
+            headword = m.group(2).strip().lower()
+            wlist.append(headword)
+        return wlist
+
+class FreqlistParser:
+    """Reads a frequency list whose lines have the form 'NUM WORD'."""
+
+    FREQ_RE = regex.compile(u"^([0-9]+) (.*)$")
+
+    def __init__(self, stream, limit):
+        self.stream = stream
+        self.limit = limit
+        self.lineno = 0
+
+    def parse(self):
+        """Return lowercased words from at most `limit` lines (all lines if limit is falsy)."""
+        wlist = []
+        while True:
+            if self.limit and self.lineno >= self.limit:
+                break
+            line = self.stream.readline()
+            if len(line) == 0:
+                break
+            self.lineno += 1
+            m = self.FREQ_RE.match(line)
+            if not m:
+                raise Exception("Line {:d}: '{:s}' is not in NUM WORD format\n".format(self.lineno, line))
+            headword = m.group(2).strip().lower()
+            wlist.append(headword)
+        return wlist
+
+if __name__ == '__main__':
+    USAGE = "Usage: PROG $WORDLIST [+/-][NUM][b/f]:FREQLIST..."
+
+    if len(sys.argv) < 3:
+        raise Exception(USAGE)
+    FINAME = sys.argv[1]
+
+    with io.open(FINAME, mode='r', buffering=1, encoding="utf-8") as stream:
+        parser = WordlistParser(stream)
+        HEADWORDS = parser.parse()
+
+    # Groups: sign, optional NUM limit, mode letter (b or f), file name.
+    COMMAND_RE = regex.compile("([-+])([0-9]+)?([bf]):([^:]+)")
+
+    # "+" specs are merged into IN_SET, "-" specs into EX_SET.
+    IN_SET = set()
+    EX_SET = set()
+
+    for spec in sys.argv[2:]:
+        m = COMMAND_RE.match(spec)
+        if not m:
+            raise Exception("Wrong FREQLIST spec: '{:s}'\n".format(spec) + USAGE)
+        fname = m.group(4)
+        limit = m.group(2)
+        mode = m.group(3)
+        if limit:
+            limit = int(limit)
+        with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream:
+            if mode == "b":
+                parser = BasewordParser(stream, limit)
+            elif mode == "f":
+                parser = FreqlistParser(stream, limit)
+            else:
+                raise Exception("Unknown mode in specification...")
+            try:
+                wlist = parser.parse()
+            except Exception:
+                print("Error during processing: {:s}".format(fname))
+                raise
+        wlist = set(wlist)
+        if m.group(1) == "+":
+            IN_SET |= wlist
+        else:
+            EX_SET |= wlist
+
+    for headword in HEADWORDS:
+        # Skip headwords containing a space, an apostrophe or a dot.
+        if any(c in headword for c in " '."):
+            continue
+        normalized = headword.lower()
+        if (not IN_SET or normalized in IN_SET) and normalized not in EX_SET:
+            print(headword)
+
diff -r c1032aea6265 -r c2c32f45dde6 py/gadict_headwords.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/py/gadict_headwords.py	Tue Nov 08 17:44:04 2016 +0200
@@ -0,0 +1,81 @@
+
+"""Write the headwords of a gadict file, one per line, to a file or to stdout."""
+
+import sys
+import codecs
+import io
+import regex
+
+USAGE = "Usage: PROG GADICT [OUTPUT]"
+
+FINAME = None
+FONAME = None
+if len(sys.argv) >= 2:
+    FINAME = sys.argv[1]
+if len(sys.argv) >= 3:
+    FONAME = sys.argv[2]
+# Without an input name io.open(None) would fail with an obscure TypeError.
+if FINAME is None:
+    raise Exception(USAGE)
+
+FIN = io.open(FINAME, mode='r', buffering=1, encoding="utf-8")
+if FONAME is None:
+    FOUT = sys.stdout
+else:
+    FOUT = codecs.open(FONAME, "w", "utf-8")
+
+
+class GadictParser:
+    """Collects the line that follows each '__' separator line and an empty line."""
+
+    SEPARATOR_RE = regex.compile(u"^__$")
+    EMPTY_RE = regex.compile( u"^$" )
+    HEADWORD_RE = regex.compile( u"^(\\p{L}.*)$" )
+
+    def __init__(self, stream):
+        self.stream = stream
+        self.lineno = 0
+
+    def parse(self):
+        """Return stripped headword lines; raise if a separator is not followed by an empty line and a headword."""
+        wlist = []
+        while True:
+            line = self.stream.readline()
+            if len(line) == 0:
+                break
+            self.lineno += 1
+            m = self.SEPARATOR_RE.match(line)
+            if not m:
+                continue
+
+            line = self.stream.readline()
+            if len(line) == 0:
+                break
+            self.lineno += 1
+            m = self.EMPTY_RE.match(line)
+            if not m:
+                raise Exception("Line {:d}: '{:s}' is not empty line\n".format(self.lineno, line))
+
+            line = self.stream.readline()
+            if len(line) == 0:
+                break
+            line = line.strip()
+            self.lineno += 1
+            m = self.HEADWORD_RE.match(line)
+            if not m:
+                raise Exception("Line {:d}: '{:s}' is not a headword\n".format(self.lineno, line))
+
+            wlist.append(line)
+        return wlist
+
+try:
+    parser = GadictParser(FIN)
+    for headword in parser.parse():
+        FOUT.write(headword)
+        FOUT.write("\n")
+finally:
+    FIN.close()
+    # sys.stdout was not opened by this script, so it is left open.
+    if FOUT is not sys.stdout:
+        FOUT.close()
+