# Use Python's built-in `re` module instead of the third-party `regex` package.
import sys
import codecs
import io
import re
class WordlistParser:
    """Read a plain word list, one entry per line, from a stream."""

    def __init__(self, stream):
        # Only ``readline()`` is ever called on the stream.
        self.stream = stream

    def parse(self):
        """Return every line of the stream, stripped of surrounding whitespace.

        Reading stops when ``readline()`` returns an empty value (end of
        stream). Blank lines are not skipped: they end up in the result as
        empty strings.
        """
        entries = []
        raw = self.stream.readline()
        while len(raw) != 0:
            entries.append(raw.strip())
            raw = self.stream.readline()
        return entries
class WordformParser:
    """Parse a list in which some lines carry a single leading tab.

    Lines without a leading tab are counted in ``cnt``; ``limit`` caps how
    many such lines are consumed.
    """

    # Group 1: optional leading tab. Group 2: the remainder of the line.
    BASEVAR_RE = re.compile(u"^(\t)?(.*)$", re.UNICODE)

    def __init__(self, stream, limit=None):
        self.stream = stream
        self.limit = limit  # None or 0 disables the cap
        self.lineno = 0     # number of lines read so far
        self.cnt = 0        # number of lines seen without a leading tab

    def parse(self):
        """Return the stripped, lower-cased text of each line read.

        Reading stops at end of stream, or as soon as the count of lines
        without a leading tab exceeds ``limit``; the line that triggers the
        stop is not included.

        Raises:
            Exception: if a line does not match ``BASEVAR_RE``.
        """
        # NOTE(review): indentation was lost in the reviewed source; this
        # assumes tab-prefixed lines are collected as well as un-tabbed
        # ones -- confirm against the original file.
        words = []
        while True:
            raw = self.stream.readline()
            if not raw:
                return words
            self.lineno += 1
            found = self.BASEVAR_RE.match(raw)
            if found is None:
                raise Exception("Line {:d}: '{:s}' wrong format\n".format(self.lineno, raw))
            leading_tab, text = found.groups()
            if not leading_tab:
                self.cnt += 1
                if self.limit and self.cnt > self.limit:
                    return words
            words.append(text.strip().lower())
class FreqlistParser:
    """Parse a frequency list whose lines have the form ``NUM WORD``."""

    # Group 1: the leading digits. Group 2: everything after the single space.
    FREQ_RE = re.compile(u"^([0-9]+) (.*)$", re.UNICODE)

    def __init__(self, stream, limit=None):
        self.stream = stream
        self.limit = limit  # None or 0 disables the cap
        self.lineno = 0     # number of lines read so far

    def parse(self):
        """Return the stripped, lower-cased WORD part of each line.

        At most ``limit`` lines are read when a limit is set; otherwise
        reading continues until end of stream.

        Raises:
            Exception: if a line does not match ``FREQ_RE``.
        """
        words = []
        while not (self.limit and self.lineno >= self.limit):
            raw = self.stream.readline()
            if not raw:
                break
            self.lineno += 1
            found = self.FREQ_RE.match(raw)
            if found is None:
                raise Exception("Line '{:s}' #{:d} is not in NUM WORD format\n".format(raw, self.lineno))
            words.append(found.group(2).strip().lower())
        return words
if __name__ == '__main__':
    # Command-line driver: every argument is a spec "[+/-][NUM][b/f]:FILE".
    #   +/-  add the file's words to the include set / the exclude set
    #   NUM  optional limit handed to the parser
    #   b/f  parse FILE with WordformParser / FreqlistParser
    # The words that are included and not excluded are printed one per line.
    USAGE = "Usage: PROG $WORDLIST [+/-][NUM][b/f]:FREQLIST..."
    if len(sys.argv) < 3:
        raise Exception(USAGE)
    # NOTE(review): USAGE advertises a leading $WORDLIST argument, but FINAME
    # is never read and the loop below parses every argument (including the
    # first) as a spec. Behaviour is kept as-is; confirm the intended CLI.
    FINAME = sys.argv[1]
    COMMAND_RE = re.compile("([-+])([0-9]+)?([bf]):([^:]+)")
    IN_SET = set()
    EX_SET = set()
    for spec in sys.argv[1:]:
        m = COMMAND_RE.match(spec)
        if not m:
            raise Exception("Wrong FREQLIST spec: '{:s}'\n".format(spec) + USAGE)
        sign, limit, mode, fname = m.groups()
        if limit:
            limit = int(limit)
        with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream:
            if mode == "b":
                parser = WordformParser(stream, limit)
            elif mode == "f":
                parser = FreqlistParser(stream, limit)
            else:
                # Guard only: COMMAND_RE admits just "b" and "f".
                raise Exception("Unknown mode in specification...")
            try:
                wlist = parser.parse()
            except Exception:
                # Name the offending file on stderr so the diagnostic does
                # not end up in the word list written to stdout.
                sys.stderr.write("Error during processing: {:s}\n".format(fname))
                raise
        wlist = {w.lower() for w in wlist}
        if sign == "+":
            IN_SET |= wlist
        else:
            EX_SET |= wlist
    # Sorted so that the output order is reproducible between runs.
    for headword in sorted(IN_SET - EX_SET):
        # if any(c in headword for c in " '."):
        #     continue
        print(headword)