py/gadict_freq.py
author Oleksandr Gavenko <gavenkoa@gmail.com>
Sat, 20 Apr 2019 17:41:33 +0300
changeset 1145 79b55cca9f44
parent 1004 181301cc2c0c
child 1347 272ec25b6f12
permissions -rw-r--r--
Updated Emacs gaphrase to support the new format with unique ids. An id is necessary to keep progress in Anki on subsequent deck imports.


import sys
import codecs
import io
import re

class WordlistParser:
    """Read a stream line by line into a list of whitespace-stripped strings."""

    def __init__(self, stream):
        """Remember *stream*, an object providing ``readline()``."""
        self.stream = stream

    def parse(self):
        """Return every remaining line of the stream, stripped, in order.

        Reading stops at the first empty ``readline()`` result (end of
        stream).  Lines that are blank after stripping are kept as ``""``.
        """
        entries = []
        raw = self.stream.readline()
        while raw:
            entries.append(raw.strip())
            raw = self.stream.readline()
        return entries

class WordformParser:
    """Collect headwords from a stream where tab-prefixed lines are skipped.

    A line that does not start with a tab is treated as a headword; a line
    that starts with a tab is ignored.
    """

    # Group 1: optional leading tab; group 2: the rest of the line.
    BASEVAR_RE = re.compile(u"^(\t)?(.*)$", re.UNICODE)

    def __init__(self, stream, limit = None):
        """Store the input *stream* and the optional headword *limit*.

        ``lineno`` counts lines read and ``cnt`` counts non-tab lines seen;
        both live on the instance and are not reset by ``parse()``.
        """
        self.stream = stream
        self.limit = limit
        self.lineno = 0
        self.cnt = 0

    def parse(self):
        """Return the stripped, lowercased headwords in stream order.

        Stops at end of stream, or as soon as the count of non-tab lines
        exceeds a truthy ``limit``.

        Raises:
            Exception: if a line does not match ``BASEVAR_RE``.
        """
        headwords = []
        read_line = self.stream.readline
        line = read_line()
        while line:
            self.lineno += 1
            matched = self.BASEVAR_RE.match(line)
            if matched is None:
                raise Exception("Line {:d}: '{:s}' wrong format\n".format(self.lineno, line))
            leading_tab, text = matched.groups()
            if not leading_tab:
                # Only non-tab lines count toward the limit.
                self.cnt += 1
                if self.limit and self.cnt > self.limit:
                    break
                headwords.append(text.strip().lower())
            line = read_line()
        return headwords

class FreqlistParser:
    """Collect words from a stream of ``NUM WORD`` lines."""

    # Group 1: leading digits; group 2: everything after the single space.
    FREQ_RE = re.compile(u"^([0-9]+) (.*)$", re.UNICODE)

    def __init__(self, stream, limit = None):
        """Store the input *stream* and the optional line *limit*.

        ``lineno`` counts lines read; it lives on the instance and is not
        reset by ``parse()``.
        """
        self.stream = stream
        self.limit = limit
        self.lineno = 0

    def parse(self):
        """Return the stripped, lowercased word of each line, in order.

        Stops at end of stream, or once ``lineno`` has reached a truthy
        ``limit``.

        Raises:
            Exception: if a line does not match ``FREQ_RE``.
        """
        words = []
        while not (self.limit and self.lineno >= self.limit):
            line = self.stream.readline()
            if not line:
                break
            self.lineno += 1
            found = self.FREQ_RE.match(line)
            if found is None:
                raise Exception("Line '{:s}' #{:d} is not in NUM WORD format\n".format(line, self.lineno))
            words.append(found.group(2).strip().lower())
        return words

if __name__ == '__main__':
    # Command-line driver: every argument is a spec "[+/-][NUM][b/f]:FILE".
    # Words from "+" specs are collected for inclusion, words from "-" specs
    # for exclusion; the included-but-not-excluded words are printed one per
    # line.
    # NOTE(review): USAGE mentions a leading $WORDLIST argument, but the loop
    # below parses every argument (starting with the first) as a spec --
    # confirm which is intended.
    USAGE = "Usage: PROG  $WORDLIST  [+/-][NUM][b/f]:FREQLIST..."

    if len(sys.argv) < 3:
        raise Exception(USAGE)

    # Groups: sign, optional numeric limit, mode letter, file name (no ':').
    COMMAND_RE = re.compile("([-+])([0-9]+)?([bf]):([^:]+)")

    IN_SET = set()
    EX_SET = set()

    for spec in sys.argv[1:]:
        m = COMMAND_RE.match(spec)
        if not m:
            raise Exception("Wrong FREQLIST spec: '{:s}'\n".format(spec) + USAGE)
        sign, limit, mode, fname = m.groups()
        if limit:
            limit = int(limit)
        with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream:
            if mode == "b":
                parser = WordformParser(stream, limit)
            elif mode == "f":
                parser = FreqlistParser(stream, limit)
            else:
                # Defensive only: COMMAND_RE admits just "b" and "f".
                raise Exception("Unknown mode in specification...")
            try:
                wlist = parser.parse()
            except Exception:
                # Name the offending file before the error propagates.
                print("Error during processing: {:s}".format(fname))
                raise
        words = {w.lower() for w in wlist}
        if sign == "+":
            IN_SET |= words
        else:
            EX_SET |= words

    # Set iteration order is arbitrary, so the output order is unspecified.
    for headword in IN_SET - EX_SET:
        # if any(c in headword for c in " '."):
        #     continue
        print(headword)