# The printed VOA dictionary isn't going to change, so there is no need to
# require LaTeX in the build environment to make releases.
import sys
import codecs
import io
import re
class WordlistParser:
    """Read a plain word list, one entry per line, from a text stream."""

    def __init__(self, stream):
        # Any object exposing ``readline()``; it is consumed by parse().
        self.stream = stream

    def parse(self):
        """Return every remaining line of the stream, whitespace-stripped.

        Reading stops at the first zero-length result from ``readline()``
        (end of input).  Blank lines are kept and appear as empty strings.
        """
        entries = []
        read_next = self.stream.readline
        raw = read_next()
        while len(raw) != 0:
            entries.append(raw.strip())
            raw = read_next()
        return entries
class WordformParser:
    """Collect headwords from a stream, ignoring tab-indented lines.

    A line that starts with a tab character is skipped (presumably it is
    a variant form of the preceding base word -- confirm against the data
    files).  Every other line is counted and contributes one lower-cased,
    whitespace-stripped headword.
    """

    BASEVAR_RE = re.compile(u"^(\t)?(.*)$", re.UNICODE)

    def __init__(self, stream, limit=None):
        # ``limit`` caps the number of non-tab lines accepted; a falsy
        # value (None or 0) means no cap.
        self.limit = limit
        self.stream = stream
        # Both counters are updated while parse() runs.
        self.cnt = 0
        self.lineno = 0

    def parse(self):
        """Return the list of headwords read from the stream.

        Stops at end of input, or as soon as the count of non-tab lines
        exceeds ``limit`` (the line that crosses the limit is counted in
        ``cnt`` and ``lineno`` but not returned).

        Raises:
            Exception: if a line does not match ``BASEVAR_RE``.
        """
        headwords = []
        next_line = self.stream.readline
        text = next_line()
        while len(text) != 0:
            self.lineno += 1
            found = self.BASEVAR_RE.match(text)
            if found is None:
                raise Exception("Line {:d}: '{:s}' wrong format\n".format(self.lineno, text))
            leading_tab, body = found.group(1), found.group(2)
            if not leading_tab:
                self.cnt += 1
                if self.limit and self.cnt > self.limit:
                    break
                headwords.append(body.strip().lower())
            text = next_line()
        return headwords
class FreqlistParser:
    """Collect headwords from a frequency list of ``NUM WORD`` lines."""

    FREQ_RE = re.compile(u"^([0-9]+) (.*)$", re.UNICODE)

    def __init__(self, stream, limit=None):
        # ``limit`` caps the number of lines read; a falsy value (None
        # or 0) means the whole stream is read.
        self.limit = limit
        self.stream = stream
        # Number of lines consumed so far; updated by parse().
        self.lineno = 0

    def parse(self):
        """Return the lower-cased, stripped WORD part of each line.

        Reading ends at end of input or once ``limit`` lines have been
        consumed, whichever comes first.

        Raises:
            Exception: if a line is not digits, one space, then text.
        """
        headwords = []
        while not (self.limit and self.lineno >= self.limit):
            entry = self.stream.readline()
            if len(entry) == 0:
                break
            self.lineno += 1
            found = self.FREQ_RE.match(entry)
            if found is None:
                raise Exception("Line '{:s}' #{:d} is not in NUM WORD format\n".format(entry, self.lineno))
            word = found.group(2)
            headwords.append(word.strip().lower())
        return headwords
def main(argv):
    """Merge the word lists named by command-line specs and print the result.

    Every argument after the program name is a spec of the form
    ``[+-][NUM][bf]:FILE``:

    * ``+`` adds the file's headwords to the include set, ``-`` adds them
      to the exclude set;
    * ``NUM`` (optional) is passed to the parser as its ``limit``;
    * ``b`` selects ``WordformParser``, ``f`` selects ``FreqlistParser``;
    * ``FILE`` is opened as UTF-8 text.

    The headwords that are included and not excluded are printed one per
    line, in sorted order so that repeated runs give identical output.

    Args:
        argv: full argument vector, program name first (``sys.argv``).

    Raises:
        Exception: if fewer than two arguments follow the program name,
            or if an argument is not a valid spec.  Errors from opening
            or parsing a file propagate; parse errors are preceded by a
            printed message naming the file.
    """
    usage = "Usage: PROG $WORDLIST [+/-][NUM][b/f]:FREQLIST..."
    if len(argv) < 3:
        raise Exception(usage)
    # NOTE(review): the usage text names a leading $WORDLIST argument, but
    # every argument from argv[1] onward is parsed as a spec (the original
    # code stored argv[1] in an unused variable).  Behaviour is preserved
    # here -- confirm which of the two is intended.
    spec_re = re.compile("([-+])([0-9]+)?([bf]):([^:]+)")
    included = set()
    excluded = set()
    for spec in argv[1:]:
        found = spec_re.match(spec)
        if not found:
            raise Exception("Wrong FREQLIST spec: '{:s}'\n".format(spec) + usage)
        sign, limit, mode, fname = found.groups()
        if limit:
            limit = int(limit)
        with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream:
            if mode == "b":
                parser = WordformParser(stream, limit)
            elif mode == "f":
                parser = FreqlistParser(stream, limit)
            else:
                # Unreachable while spec_re only admits [bf]; kept as a
                # guard in case the pattern is extended.
                raise Exception("Unknown mode in specification...")
            try:
                words = parser.parse()
            except Exception:
                print("Error during processing: {:s}".format(fname))
                raise
        # Both parsers already return lower-cased headwords.
        if sign == "+":
            included.update(words)
        else:
            excluded.update(words)
    # NOTE: a filter dropping headwords that contain a space, apostrophe
    # or period was commented out in the original and remains disabled.
    for headword in sorted(included - excluded):
        print(headword)


if __name__ == '__main__':
    main(sys.argv)