py/gadict_freq.py
author Oleksandr Gavenko <gavenkoa@gmail.com>
Sat, 21 Oct 2017 22:14:55 +0300
changeset 956 36454643fec6
parent 804 7aa283584ee6
child 1004 181301cc2c0c
permissions -rw-r--r--
Printed VOA dictionary isn't going to change, so there is no need to require LaTeX in the build environment to make releases.


import sys
import codecs
import io
import re

class WordlistParser:
    """Reader for a plain list with one entry per line."""

    def __init__(self, stream):
        # Any object with a ``readline()`` method that returns an empty
        # value at end of input.
        self.stream = stream

    def parse(self):
        """Consume the stream and return its lines as a list.

        Each line is stripped of surrounding whitespace; blank lines are
        kept as empty strings.
        """
        read_line = self.stream.readline
        entries = []
        raw = read_line()
        while len(raw) != 0:
            entries.append(raw.strip())
            raw = read_line()
        return entries

class WordformParser:
    """Reader for a list where tab-indented lines follow each headword line.

    Lines that start with a tab are skipped; every other line contributes
    one headword.
    """

    # group(1): optional leading tab, group(2): the rest of the line.
    BASEVAR_RE = re.compile(u"^(\t)?(.*)$", re.UNICODE)

    def __init__(self, stream, limit=None):
        self.stream = stream
        # Maximum number of headwords to return; a falsy value means no cap.
        self.limit = limit
        # Number of lines read so far.
        self.lineno = 0
        # Number of non-indented lines seen so far.
        self.cnt = 0

    def parse(self):
        """Return the stripped, lower-cased headwords read from the stream.

        Reading stops at end of input or as soon as a headword beyond
        ``limit`` is encountered.

        Raises:
            Exception: if a line does not match ``BASEVAR_RE``.
        """
        headwords = []
        while True:
            raw = self.stream.readline()
            if len(raw) == 0:
                return headwords
            self.lineno += 1
            found = self.BASEVAR_RE.match(raw)
            if found is None:
                raise Exception("Line {:d}: '{:s}' wrong format\n".format(self.lineno, raw))
            indent, text = found.groups()
            if not indent:
                self.cnt += 1
                if self.limit and self.cnt > self.limit:
                    return headwords
                headwords.append(text.strip().lower())

class FreqlistParser:
    """Reader for a frequency list whose lines look like ``NUM WORD``."""

    # group(1): the number, group(2): everything after the single space.
    FREQ_RE = re.compile(u"^([0-9]+) (.*)$", re.UNICODE)

    def __init__(self, stream, limit=None):
        self.stream = stream
        # Maximum number of lines to read; a falsy value means no cap.
        self.limit = limit
        # Number of lines read so far.
        self.lineno = 0

    def parse(self):
        """Return the stripped, lower-cased words read from the stream.

        Reading stops at end of input or once ``limit`` lines were read.

        Raises:
            Exception: if a line does not match ``FREQ_RE``.
        """
        words = []
        while not (self.limit and self.lineno >= self.limit):
            raw = self.stream.readline()
            if len(raw) == 0:
                break
            self.lineno += 1
            hit = self.FREQ_RE.match(raw)
            if hit is None:
                raise Exception("Line '{:s}' #{:d} is not in NUM WORD format\n".format(raw, self.lineno))
            words.append(hit.group(2).strip().lower())
        return words

if __name__ == '__main__':
    # Command-line entry point.
    #
    # Every argument is a spec of the form [+/-][NUM][b/f]:FILE
    #   +/-  add the words of FILE to the include (+) or exclude (-) set
    #   NUM  optional limit passed to the parser
    #   b    parse FILE with WordformParser (tab-indented lines are skipped)
    #   f    parse FILE with FreqlistParser ("NUM WORD" lines)
    # The words that are included and not excluded are printed, one per line.
    USAGE = "Usage: PROG  $WORDLIST  [+/-][NUM][b/f]:FREQLIST..."

    if len(sys.argv) < 3:
        raise Exception(USAGE)
    # NOTE(review): FINAME is never used, and the loop below starts at
    # argv[1], so the first argument is parsed as a FREQLIST spec as well.
    # This contradicts the $WORDLIST placeholder in USAGE -- confirm which
    # one is stale before changing either.
    FINAME = sys.argv[1]

    # Groups: 1 = sign, 2 = optional limit, 3 = mode, 4 = file name.
    COMMAND_RE = re.compile("([-+])([0-9]+)?([bf]):([^:]+)")

    IN_SET = set()
    EX_SET = set()

    for spec in sys.argv[1:]:
        m = COMMAND_RE.match(spec)
        if not m:
            raise Exception("Wrong FREQLIST spec: '{:s}'\n".format(spec) + USAGE)
        sign, limit, mode, fname = m.groups()
        if limit:
            limit = int(limit)
        with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream:
            if mode == "b":
                parser = WordformParser(stream, limit)
            elif mode == "f":
                parser = FreqlistParser(stream, limit)
            else:
                # Not reachable while COMMAND_RE only admits [bf]; kept as
                # a guard in case the pattern is extended.
                raise Exception("Unknown mode in specification...")
            try:
                wlist = parser.parse()
            except Exception:
                # Name the offending file, then let the error propagate.
                print("Error during processing: {:s}".format(fname))
                raise
        words = {w.lower() for w in wlist}
        if sign == "+":
            IN_SET |= words
        else:
            EX_SET |= words

    # Sorted so that the output is reproducible: plain set iteration order
    # for strings varies between interpreter runs.
    for headword in sorted(IN_SET - EX_SET):
        # if any(c in headword for c in " '."):
        #     continue
        print(headword)