# HG changeset patch
# User Oleksandr Gavenko
# Date 1689971011 -10800
# Node ID 272ec25b6f1231954888fe7a77ca6d3cca5bed58
# Parent e315df384eef47bab6b9bf96bdbd6c0be57cebf7
Mark word frequency based on Paul Nation's BNC+COCA 25k word-family list.

diff -r e315df384eef -r 272ec25b6f12 Makefile
--- a/Makefile	Fri Jul 21 23:19:15 2023 +0300
+++ b/Makefile	Fri Jul 21 23:23:31 2023 +0300
@@ -508,6 +508,8 @@
 $(1)_DEP += $(4)
 endef
 
+FREQLIST_OPT += -grp:'wordlist/pn25k/basewrd*.txt'
+
 # For gadict_voa.
 $(eval $(call FREQ_MACRO,VOA_FREQLIST,freq,GSL,wordlist/gsl.freq))
 $(eval $(call FREQ_MACRO,VOA_FREQLIST,freq,AWL,wordlist/awl.freq))
diff -r e315df384eef -r 272ec25b6f12 py/gadict_c5.py
--- a/py/gadict_c5.py	Fri Jul 21 23:19:15 2023 +0300
+++ b/py/gadict_c5.py	Fri Jul 21 23:23:31 2023 +0300
@@ -19,6 +19,8 @@
 ARG_LANG_RE = re.compile("-lang:(.+)")
 # -freq:var:TAG=FILE or -freq:freq:TAG=FILE
 ARG_FREQ_RE = re.compile("-freq:(freq|var):([^=]+)=(.+)")
+# -grp:GLOB
+ARG_GRP_RE = re.compile("-grp:(.+)")
 
 look_for_files = False
 for idx in range(1, len(sys.argv)):
@@ -49,6 +51,13 @@
         wlist = parser.parse()
         FREQ_SOURCES.append((tag, set(wlist)))
         continue
+    m = ARG_GRP_RE.match(arg)
+    if m:
+        patt = m.group(1)
+        parser = gadict_freq.WordformGroupParser(patt)
+        for (tag, wset) in parser.parse().items():
+            FREQ_SOURCES.append((tag, wset))
+        continue
     if arg.startswith("-"):
         raise Exception("Unsupported option format: '{:s}'".format(arg))
     if not FINAME:
diff -r e315df384eef -r 272ec25b6f12 py/gadict_freq.py
--- a/py/gadict_freq.py	Fri Jul 21 23:19:15 2023 +0300
+++ b/py/gadict_freq.py	Fri Jul 21 23:23:31 2023 +0300
@@ -1,5 +1,6 @@
 import sys
+import glob
 import codecs
 import io
 import re
 
@@ -23,13 +24,14 @@
 
     BASEVAR_RE = re.compile(u"^(\t)?(.*)$", re.UNICODE)
 
-    def __init__(self, stream, limit = None):
+    def __init__(self, stream, limit = None, ignore_tab = True):
         self.stream = stream
         self.limit = limit
+        self.ignore_tab = ignore_tab
         self.lineno = 0
         self.cnt = 0
 
-    def parse(self):
+    def parse(self) -> list[str]:
         wlist = []
         while True:
             line = self.stream.readline()
@@ -40,7 +42,7 @@
             if not m:
                 raise Exception("Line {:d}: '{:s}' wrong format\n".format(self.lineno, line))
             tab = m.group(1)
-            if tab:
+            if tab and self.ignore_tab:
                 continue
             self.cnt += 1
             if self.limit and self.cnt > self.limit:
@@ -49,6 +51,25 @@
             wlist.append(headword)
         return wlist
 
+class WordformGroupParser:
+
+    def __init__(self, globpatt):
+        if globpatt.count('*') != 1:
+            raise Exception("Glob pattern should have exactly one asterisk: {:s}".format(globpatt))
+        self.globpatt = globpatt
+        self.astOff = globpatt.index('*')
+        self.cnt = 0
+
+    def parse(self) -> dict[str, set[str]]:
+        wmap = dict()
+        for fname in glob.glob(self.globpatt):
+            beg, end = self.astOff, self.astOff + 1 + len(fname) - len(self.globpatt)
+            tag = fname[beg:end]
+            with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream:
+                parser = WordformParser(stream, ignore_tab=False)
+                wmap[tag] = set(parser.parse())
+        return wmap
+
 class FreqlistParser:
 
     FREQ_RE = re.compile(u"^([0-9]+) (.*)$", re.UNICODE)
diff -r e315df384eef -r 272ec25b6f12 py/gadict_srs_anki.py
--- a/py/gadict_srs_anki.py	Fri Jul 21 23:19:15 2023 +0300
+++ b/py/gadict_srs_anki.py	Fri Jul 21 23:23:31 2023 +0300
@@ -32,6 +32,8 @@
 # -freq:var:TAG=FILE or -freq:freq:TAG=FILE
 ARG_FREQ_RE = re.compile("-freq:(freq|var):([^=]+)=(.+)")
 ARG_DELFILE_RE = re.compile("-delfile=(.+)")
+# -grp:GLOB
+ARG_GRP_RE = re.compile("-grp:(.+)")
 
 look_for_files = False
 for idx in range(1, len(sys.argv)):
@@ -66,6 +68,13 @@
         wlist = parser.parse()
         FREQ_SOURCES.append((tag, set(wlist)))
         continue
+    m = ARG_GRP_RE.match(arg)
+    if m:
+        patt = m.group(1)
+        parser = gadict_freq.WordformGroupParser(patt)
+        for (tag, wset) in parser.parse().items():
+            FREQ_SOURCES.append((tag, wset))
+        continue
     m = ARG_DELFILE_RE.match(arg)
     if m:
         FDELNAME = m.group(1)
diff -r e315df384eef -r 272ec25b6f12 www/HACKING.rst
--- a/www/HACKING.rst	Fri Jul 21 23:19:15 2023 +0300
+++ b/www/HACKING.rst	Fri Jul 21 23:23:31 2023 +0300
@@ -325,7 +325,7 @@
 
 * deviation of word frequency across documents in corpus, usually marked by ``D``
 
-Sorting numerically on first= column::
+Sorting numerically on first column::
 
   $ sort -k 1nr,2 <$IN >$OUT
 
@@ -360,9 +360,10 @@
 
 http://www.anc.org/data/oanc/download/
   OANC download page.
-
 http://www.anc.org/data/oanc/
   OANC home page.
+https://anc.org/data/anc-second-release/frequency-data/
+  Frequency data from the 2nd release of the ANC.
 
 https://en.wikipedia.org/wiki/Word_lists_by_frequency
 
@@ -396,6 +397,8 @@
   General Service List at Wikipedia.
 http://jbauman.com/aboutgsl.html
   About the General Service List by John Bauman.
+https://www.eapfoundation.com/vocab/general/gsl/
+  Sheldon Smith's page about the GSL.
 
 New General Service List
 ------------------------
@@ -510,7 +513,7 @@
 BNC+COCA wordlist
 -----------------
 
-Paul Nation prepare frequency wordlist from combined BNC and COCA corpus:
+Paul Nation prepared a frequency wordlist from the combined BNC and COCA corpora:
 
 http://www.victoria.ac.nz/lals/about/staff/paul-nation
   Paul Nation's home page and list download page.
@@ -524,6 +527,10 @@
 
 http://www.laurenceanthony.net/software/antwordprofiler/
   Laurence Anthony's AntWordProfiler home page.
+https://www.laurenceanthony.net/resources/wordlists/bnc_coca_cleaned_ver_002_20141015.zip
+  Direct download link with the 25k wordlist + extras (dated 2014).
+https://www.wgtn.ac.nz/lals/resources/paul-nations-resources/vocabulary-lists
+  Paul Nation's page at Victoria University with a download of the wordlist (first 10k).
 
 Oxford 3000/5000
 ----------------
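
The sketch below (not part of the patch) illustrates the tag-derivation logic behind the
new ``-grp:GLOB`` option: ``WordformGroupParser`` maps every file matched by a
one-asterisk glob to a tag taken from the characters the asterisk covered, and each tag
then becomes a ``FREQ_SOURCES`` entry. The ``wordlist/pn25k/basewrd*.txt`` pattern is
taken from the Makefile hunk above; concrete file names such as ``basewrd1.txt`` are
assumed only for illustration::

    import glob

    def tags_for(globpatt):
        """Map each matched file name to the substring covered by the single '*'.

        Mirrors the slice arithmetic in WordformGroupParser.parse().
        """
        if globpatt.count('*') != 1:
            raise ValueError("Glob pattern should have exactly one asterisk: %s" % globpatt)
        ast_off = globpatt.index('*')
        tags = {}
        for fname in glob.glob(globpatt):
            # The '*' matched len(fname) - len(globpatt) + 1 characters starting at ast_off.
            end = ast_off + 1 + len(fname) - len(globpatt)
            tags[fname] = fname[ast_off:end]
        return tags

    # Hypothetical layout: wordlist/pn25k/basewrd1.txt .. basewrd25.txt would yield
    # tags '1' .. '25', the same strings WordformGroupParser uses as frequency tags.
    print(tags_for("wordlist/pn25k/basewrd*.txt"))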