py/gadict_freq.py
author Oleksandr Gavenko <gavenkoa@gmail.com>
Fri, 21 Jul 2023 23:23:31 +0300
changeset 1347 272ec25b6f12
parent 1004 181301cc2c0c
permissions -rw-r--r--
Mark word frequency based on Paul Nation BNC+COCA 25k wordfamily list.


import sys
import glob
import codecs
import io
import re

class WordlistParser:
    """Read a plain word list, one entry per line, from a text stream."""

    def __init__(self, stream):
        # The stream is consumed only through ``readline()``.
        self.stream = stream

    def parse(self):
        """Return the remaining lines of the stream as a list.

        Each line is stripped of surrounding whitespace.  Blank lines are
        not filtered out; they appear in the result as empty strings.
        """
        entries = []
        # ``readline()`` hands back an empty value only at end of input.
        while raw := self.stream.readline():
            entries.append(raw.strip())
        return entries

class WordformParser:
    """Collect lower-cased entries from a line-oriented word-form list.

    Every line is split into an optional leading TAB and the rest of the
    line.  Lines that carry the leading TAB can be skipped (``ignore_tab``)
    and the number of accepted entries can be capped (``limit``).
    """

    BASEVAR_RE = re.compile(u"^(\t)?(.*)$", re.UNICODE)

    def __init__(self, stream, limit = None, ignore_tab = True):
        self.stream = stream
        # Maximum number of accepted entries; a falsy value disables the cap.
        self.limit = limit
        # When true, lines beginning with a TAB are not reported.
        self.ignore_tab = ignore_tab
        # Number of lines read so far.
        self.lineno = 0
        # Number of lines that passed the TAB filter so far.
        self.cnt = 0

    def parse(self) -> list[str]:
        """Read the stream to its end (or to the limit) and return the entries.

        Returns:
            The accepted lines, stripped and lower-cased, in file order.

        Raises:
            Exception: if a line does not match ``BASEVAR_RE``.
        """
        headwords = []
        while raw := self.stream.readline():
            self.lineno += 1
            matched = self.BASEVAR_RE.match(raw)
            if matched is None:
                raise Exception("Line {:d}: '{:s}' wrong format\n".format(self.lineno, raw))
            leading_tab, body = matched.groups()
            if self.ignore_tab and leading_tab:
                continue
            # The counter is bumped before the limit test, so it ends one
            # past the limit when the cap stops the loop.
            self.cnt += 1
            if self.limit and self.cnt > self.limit:
                break
            headwords.append(body.strip().lower())
        return headwords

class WordformGroupParser:
    """Load a family of word-form files selected by a one-asterisk glob.

    The text that the asterisk matched in each file name becomes the key
    under which that file's entries are stored.
    """

    def __init__(self, globpatt):
        """Remember the pattern and where its single ``*`` sits.

        Raises:
            Exception: if ``globpatt`` does not contain exactly one ``*``.
        """
        stars = globpatt.count('*')
        if stars != 1:
            raise Exception("Glob pattern should have exactly one asterisk: {:s}".format(globpatt))
        self.globpatt = globpatt
        # Offset of the asterisk; the tag starts here in every matched name.
        self.astOff = globpatt.index('*')
        self.cnt = 0

    def parse(self) -> dict[str, set[str]]:
        """Parse every file matching the pattern.

        Returns:
            A mapping from the asterisk-matched part of each file name to
            the set of entries read from that file (TAB-prefixed lines
            included).
        """
        # Characters of the pattern other than the asterisk itself; whatever
        # a matched name has beyond these was matched by the asterisk.
        fixed_len = len(self.globpatt) - 1
        groups = {}
        for path in glob.glob(self.globpatt):
            tag_end = self.astOff + len(path) - fixed_len
            tag = path[self.astOff:tag_end]
            with io.open(path, mode='r', buffering=1, encoding="utf-8") as stream:
                groups[tag] = set(WordformParser(stream, ignore_tab=False).parse())
        return groups

class FreqlistParser:
    """Read words from a frequency list whose lines look like ``NUM WORD``."""

    FREQ_RE = re.compile(u"^([0-9]+) (.*)$", re.UNICODE)

    def __init__(self, stream, limit = None):
        self.stream = stream
        # Maximum number of lines to read; a falsy value disables the cap.
        self.limit = limit
        # Number of lines read so far.
        self.lineno = 0

    def parse(self):
        """Return the word part of each line, stripped and lower-cased.

        Reading stops at end of input or once ``limit`` lines were read.
        The numeric column is validated by the pattern but not returned.

        Raises:
            Exception: if a line is not in ``NUM WORD`` format.
        """
        headwords = []
        # The cap is checked before reading, so no extra line is consumed.
        while not (self.limit and self.lineno >= self.limit):
            raw = self.stream.readline()
            if not raw:
                break
            self.lineno += 1
            matched = self.FREQ_RE.match(raw)
            if matched is None:
                raise Exception("Line '{:s}' #{:d} is not in NUM WORD format\n".format(raw, self.lineno))
            headwords.append(matched.group(2).strip().lower())
        return headwords

if __name__ == '__main__':
    # Command-line driver: every argument is a spec of the form
    # [+/-][NUM][b/f]:FILE.  Words read from "+" specs are collected into
    # IN_SET, words from "-" specs into EX_SET, and IN_SET - EX_SET is
    # printed one word per line.
    USAGE = "Usage: PROG  $WORDLIST  [+/-][NUM][b/f]:FREQLIST..."

    # NOTE(review): USAGE advertises a leading $WORDLIST argument, but the
    # loop below parses every argument (including the first) as a FREQLIST
    # spec -- confirm which of the two is intended.
    if len(sys.argv) < 3:
        raise Exception(USAGE)

    # NOTE(review): match() is not anchored at the end, so anything after a
    # second ':' in a spec is silently ignored -- confirm this is acceptable.
    COMMAND_RE = re.compile("([-+])([0-9]+)?([bf]):([^:]+)")

    IN_SET = set()
    EX_SET = set()

    for spec in sys.argv[1:]:
        m = COMMAND_RE.match(spec)
        if not m:
            raise Exception("Wrong FREQLIST spec: '{:s}'\n".format(spec) + USAGE)
        sign, limit, mode, fname = m.groups()
        # An omitted NUM leaves the limit unset (the parsers read everything).
        limit = int(limit) if limit else None
        with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream:
            if mode == "b":
                parser = WordformParser(stream, limit)
            elif mode == "f":
                parser = FreqlistParser(stream, limit)
            else:
                # Guard for modes the regex may accept in the future.
                raise Exception("Unknown mode in specification...")
            try:
                wlist = parser.parse()
            except Exception:
                # Name the offending file on stderr so the diagnostic does
                # not mix with the word list on stdout, then propagate.
                print("Error during processing: {:s}".format(fname), file=sys.stderr)
                raise
        wlist = {w.lower() for w in wlist}
        if sign == "+":
            IN_SET |= wlist
        else:
            EX_SET |= wlist

    # Sorted so that the output is reproducible between runs; plain set
    # iteration order depends on string hash randomization.
    for headword in sorted(IN_SET - EX_SET):
        print(headword)