Mark word frequency based on Paul Nation BNC+COCA 25k wordfamily list.
author Oleksandr Gavenko <gavenkoa@gmail.com>
Fri, 21 Jul 2023 23:23:31 +0300
changeset 1347 272ec25b6f12
parent 1346 e315df384eef
child 1348 8f1c99471195
Mark word frequency based on Paul Nation BNC+COCA 25k wordfamily list.
Makefile
py/gadict_c5.py
py/gadict_freq.py
py/gadict_srs_anki.py
www/HACKING.rst
--- a/Makefile	Fri Jul 21 23:19:15 2023 +0300
+++ b/Makefile	Fri Jul 21 23:23:31 2023 +0300
@@ -508,6 +508,8 @@
 $(1)_DEP += $(4)
 endef
 
+FREQLIST_OPT += -grp:'wordlist/pn25k/basewrd*.txt'
+
 # For gadict_voa.
 $(eval $(call FREQ_MACRO,VOA_FREQLIST,freq,GSL,wordlist/gsl.freq))
 $(eval $(call FREQ_MACRO,VOA_FREQLIST,freq,AWL,wordlist/awl.freq))
--- a/py/gadict_c5.py	Fri Jul 21 23:19:15 2023 +0300
+++ b/py/gadict_c5.py	Fri Jul 21 23:23:31 2023 +0300
@@ -19,6 +19,8 @@
 ARG_LANG_RE = re.compile("-lang:(.+)")
 # -freq:var:TAG=FILE or -freq:freq:TAG=FILE
 ARG_FREQ_RE = re.compile("-freq:(freq|var):([^=]+)=(.+)")
+# -grp:GLOB
+ARG_GRP_RE = re.compile("-grp:(.+)")
 
 look_for_files = False
 for idx in range(1, len(sys.argv)):
@@ -49,6 +51,13 @@
                 wlist = parser.parse()
             FREQ_SOURCES.append((tag, set(wlist)))
             continue
+        m = ARG_GRP_RE.match(arg)
+        if m:
+            patt = m.group(1)
+            parser = gadict_freq.WordformGroupParser(patt)
+            for (tag, wset) in parser.parse().items():
+                FREQ_SOURCES.append((tag, wset))
+            continue
         if arg.startswith("-"):
             raise Exception("Unsupported option format: '{:s}'".format(arg))
     if not FINAME:
--- a/py/gadict_freq.py	Fri Jul 21 23:19:15 2023 +0300
+++ b/py/gadict_freq.py	Fri Jul 21 23:23:31 2023 +0300
@@ -1,5 +1,6 @@
 
 import sys
+import glob
 import codecs
 import io
 import re
@@ -23,13 +24,14 @@
 
     BASEVAR_RE = re.compile(u"^(\t)?(.*)$", re.UNICODE)
 
-    def __init__(self, stream, limit = None):
+    def __init__(self, stream, limit = None, ignore_tab = True):
         self.stream = stream
         self.limit = limit
+        self.ignore_tab = ignore_tab
         self.lineno = 0
         self.cnt = 0
 
-    def parse(self):
+    def parse(self) -> list[str]:
         wlist = []
         while True:
             line = self.stream.readline()
@@ -40,7 +42,7 @@
             if not m:
                 raise Exception("Line {:d}: '{:s}' wrong format\n".format(self.lineno, line))
             tab = m.group(1)
-            if tab:
+            if tab and self.ignore_tab:
                 continue
             self.cnt += 1
             if self.limit and self.cnt > self.limit:
@@ -49,6 +51,25 @@
             wlist.append(headword)
         return wlist
 
+class WordformGroupParser:
+
+    def __init__(self, globpatt):
+        if globpatt.count('*') != 1:
+            raise Exception("Glob pattern should have exactly one asterisk: {:s}".format(globpatt))
+        self.globpatt = globpatt
+        self.astOff = globpatt.index('*')
+        self.cnt = 0
+
+    def parse(self) -> dict[str, set[str]]:
+        wmap = dict()
+        for fname in glob.glob(self.globpatt):
+            beg, end = self.astOff, self.astOff + 1 + len(fname) - len(self.globpatt)
+            tag = fname[beg:end]
+            with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream:
+                parser = WordformParser(stream, ignore_tab=False)
+                wmap[tag] = set(parser.parse())
+        return wmap
+
 class FreqlistParser:
 
     FREQ_RE = re.compile(u"^([0-9]+) (.*)$", re.UNICODE)
--- a/py/gadict_srs_anki.py	Fri Jul 21 23:19:15 2023 +0300
+++ b/py/gadict_srs_anki.py	Fri Jul 21 23:23:31 2023 +0300
@@ -32,6 +32,8 @@
 # -freq:var:TAG=FILE or -freq:freq:TAG=FILE
 ARG_FREQ_RE = re.compile("-freq:(freq|var):([^=]+)=(.+)")
 ARG_DELFILE_RE = re.compile("-delfile=(.+)")
+# -grp:GLOB
+ARG_GRP_RE = re.compile("-grp:(.+)")
 
 look_for_files = False
 for idx in range(1, len(sys.argv)):
@@ -66,6 +68,13 @@
                 wlist = parser.parse()
             FREQ_SOURCES.append((tag, set(wlist)))
             continue
+        m = ARG_GRP_RE.match(arg)
+        if m:
+            patt = m.group(1)
+            parser = gadict_freq.WordformGroupParser(patt)
+            for (tag, wset) in parser.parse().items():
+                FREQ_SOURCES.append((tag, wset))
+            continue
         m = ARG_DELFILE_RE.match(arg)
         if m:
             FDELNAME = m.group(1)
--- a/www/HACKING.rst	Fri Jul 21 23:19:15 2023 +0300
+++ b/www/HACKING.rst	Fri Jul 21 23:23:31 2023 +0300
@@ -325,7 +325,7 @@
 * deviation of word frequency across documents in corpus, usually marked by
   ``D``
 
-Sorting numerically on first= column::
+Sorting numerically on first column::
 
   $ sort -k 1nr,2 <$IN >$OUT
 
@@ -360,9 +360,10 @@
 
 http://www.anc.org/data/oanc/download/
   OANC download page.
-
 http://www.anc.org/data/oanc/
   OANC home page.
+https://anc.org/data/anc-second-release/frequency-data/
+  2nd release of ANC.
 
 https://en.wikipedia.org/wiki/Word_lists_by_frequency
 
@@ -396,6 +397,8 @@
   General Service List at Wikipedia.
 http://jbauman.com/aboutgsl.html
   About the General Service List by John Bauman.
+https://www.eapfoundation.com/vocab/general/gsl/
+  Sheldon Smith about GSL.
 
 New General Service List
 ------------------------
@@ -510,7 +513,7 @@
 BNC+COCA wordlist
 -----------------
 
-Paul Nation prepare frequency wordlist from combined BNC and COCA corpus:
+Paul Nation prepared a frequency wordlist from combined BNC and COCA corpus:
 
 http://www.victoria.ac.nz/lals/about/staff/paul-nation
   Paul Nation's home page and list download page.
@@ -524,6 +527,10 @@
 
 http://www.laurenceanthony.net/software/antwordprofiler/
   Laurence Anthony's AntWordProfiler home page.
+https://www.laurenceanthony.net/resources/wordlists/bnc_coca_cleaned_ver_002_20141015.zip
+  Direct download link with 25k words + extra (dated by 2014).
+https://www.wgtn.ac.nz/lals/resources/paul-nations-resources/vocabulary-lists
+  Paul's page at Victoria University with download of wordlist (first 10k).
 
 Oxford 3000/5000
 ----------------