Mark word frequency based on Paul Nation's BNC+COCA 25k word-family list.
--- a/Makefile Fri Jul 21 23:19:15 2023 +0300
+++ b/Makefile Fri Jul 21 23:23:31 2023 +0300
@@ -508,6 +508,8 @@
$(1)_DEP += $(4)
endef
+FREQLIST_OPT += -grp:'wordlist/pn25k/basewrd*.txt'
+
# For gadict_voa.
$(eval $(call FREQ_MACRO,VOA_FREQLIST,freq,GSL,wordlist/gsl.freq))
$(eval $(call FREQ_MACRO,VOA_FREQLIST,freq,AWL,wordlist/awl.freq))
--- a/py/gadict_c5.py Fri Jul 21 23:19:15 2023 +0300
+++ b/py/gadict_c5.py Fri Jul 21 23:23:31 2023 +0300
@@ -19,6 +19,8 @@
ARG_LANG_RE = re.compile("-lang:(.+)")
# -freq:var:TAG=FILE or -freq:freq:TAG=FILE
ARG_FREQ_RE = re.compile("-freq:(freq|var):([^=]+)=(.+)")
+# -grp:GLOB
+ARG_GRP_RE = re.compile("-grp:(.+)")
look_for_files = False
for idx in range(1, len(sys.argv)):
@@ -49,6 +51,13 @@
wlist = parser.parse()
FREQ_SOURCES.append((tag, set(wlist)))
continue
+ m = ARG_GRP_RE.match(arg)
+ if m:
+ patt = m.group(1)
+ parser = gadict_freq.WordformGroupParser(patt)
+ for (tag, wset) in parser.parse().items():
+ FREQ_SOURCES.append((tag, wset))
+ continue
if arg.startswith("-"):
raise Exception("Unsupported option format: '{:s}'".format(arg))
if not FINAME:
--- a/py/gadict_freq.py Fri Jul 21 23:19:15 2023 +0300
+++ b/py/gadict_freq.py Fri Jul 21 23:23:31 2023 +0300
@@ -1,5 +1,6 @@
import sys
+import glob
import codecs
import io
import re
@@ -23,13 +24,14 @@
BASEVAR_RE = re.compile(u"^(\t)?(.*)$", re.UNICODE)
- def __init__(self, stream, limit = None):
+ def __init__(self, stream, limit = None, ignore_tab = True):
self.stream = stream
self.limit = limit
+ self.ignore_tab = ignore_tab
self.lineno = 0
self.cnt = 0
- def parse(self):
+ def parse(self) -> list[str]:
wlist = []
while True:
line = self.stream.readline()
@@ -40,7 +42,7 @@
if not m:
raise Exception("Line {:d}: '{:s}' wrong format\n".format(self.lineno, line))
tab = m.group(1)
- if tab:
+ if tab and self.ignore_tab:
continue
self.cnt += 1
if self.limit and self.cnt > self.limit:
@@ -49,6 +51,25 @@
wlist.append(headword)
return wlist
+class WordformGroupParser:
+    """Parse every file matching a one-asterisk glob into {tag: word set}; the tag is the text matched by '*'."""
+    def __init__(self, globpatt):
+        if globpatt.count('*') != 1:  # the tag slice in parse() relies on a single '*'
+            raise Exception("Glob pattern should have exactly one asterisk: {:s}".format(globpatt))
+        self.globpatt = globpatt
+        self.astOff = globpatt.index('*')  # offset at which the tag starts in every matching name
+        self.cnt = 0  # not updated by parse()
+
+    def parse(self) -> dict[str, set[str]]:
+        """Return a dict mapping each file's tag to the set of words read from it (tab-prefixed lines included)."""
+        wmap = dict()
+        for fname in sorted(glob.glob(self.globpatt)):  # glob order is arbitrary; sort for a stable tag order
+            tag = fname[self.astOff:self.astOff + 1 + len(fname) - len(self.globpatt)]  # text that '*' matched
+            with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream:
+                parser = WordformParser(stream, ignore_tab=False)
+                wmap[tag] = set(parser.parse())
+        return wmap
+
class FreqlistParser:
FREQ_RE = re.compile(u"^([0-9]+) (.*)$", re.UNICODE)
--- a/py/gadict_srs_anki.py Fri Jul 21 23:19:15 2023 +0300
+++ b/py/gadict_srs_anki.py Fri Jul 21 23:23:31 2023 +0300
@@ -32,6 +32,8 @@
# -freq:var:TAG=FILE or -freq:freq:TAG=FILE
ARG_FREQ_RE = re.compile("-freq:(freq|var):([^=]+)=(.+)")
ARG_DELFILE_RE = re.compile("-delfile=(.+)")
+# -grp:GLOB
+ARG_GRP_RE = re.compile("-grp:(.+)")
look_for_files = False
for idx in range(1, len(sys.argv)):
@@ -66,6 +68,13 @@
wlist = parser.parse()
FREQ_SOURCES.append((tag, set(wlist)))
continue
+ m = ARG_GRP_RE.match(arg)
+ if m:
+ patt = m.group(1)
+ parser = gadict_freq.WordformGroupParser(patt)
+ for (tag, wset) in parser.parse().items():
+ FREQ_SOURCES.append((tag, wset))
+ continue
m = ARG_DELFILE_RE.match(arg)
if m:
FDELNAME = m.group(1)
--- a/www/HACKING.rst Fri Jul 21 23:19:15 2023 +0300
+++ b/www/HACKING.rst Fri Jul 21 23:23:31 2023 +0300
@@ -325,7 +325,7 @@
* deviation of word frequency across documents in corpus, usually marked by
``D``
-Sorting numerically on first= column::
+Sorting numerically on first column::
$ sort -k 1nr,2 <$IN >$OUT
@@ -360,9 +360,10 @@
http://www.anc.org/data/oanc/download/
OANC download page.
-
http://www.anc.org/data/oanc/
OANC home page.
+https://anc.org/data/anc-second-release/frequency-data/
+ 2nd release of ANC.
https://en.wikipedia.org/wiki/Word_lists_by_frequency
@@ -396,6 +397,8 @@
General Service List at Wikipedia.
http://jbauman.com/aboutgsl.html
About the General Service List by John Bauman.
+https://www.eapfoundation.com/vocab/general/gsl/
+  Sheldon Smith on the GSL.
New General Service List
------------------------
@@ -510,7 +513,7 @@
BNC+COCA wordlist
-----------------
-Paul Nation prepare frequency wordlist from combined BNC and COCA corpus:
+Paul Nation prepared a frequency wordlist from the combined BNC and COCA corpora:
http://www.victoria.ac.nz/lals/about/staff/paul-nation
Paul Nation's home page and list download page.
@@ -524,6 +527,10 @@
http://www.laurenceanthony.net/software/antwordprofiler/
Laurence Anthony's AntWordProfiler home page.
+https://www.laurenceanthony.net/resources/wordlists/bnc_coca_cleaned_ver_002_20141015.zip
+  Direct download link for the 25k wordlist plus extras (dated 2014).
+https://www.wgtn.ac.nz/lals/resources/paul-nations-resources/vocabulary-lists
+  Paul Nation's page at Victoria University with a download of the wordlist (first 10k).
Oxford 3000/5000
----------------