--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/py/gadict_freq.py Tue Nov 08 17:44:04 2016 +0200
@@ -0,0 +1,127 @@
+
+import sys
+import codecs
+import io
+import regex
+
+class WordlistParser:
+    """Read a plain word list: one entry per line of a text stream.
+
+    Every line becomes one entry with surrounding whitespace removed.
+    Blank lines are kept and show up as empty strings.
+    """
+
+    def __init__(self, stream):
+        """Remember ``stream``, a file-like object yielding text lines."""
+        self.stream = stream
+
+    def parse(self):
+        """Consume the rest of the stream and return its stripped lines.
+
+        Returns:
+            A list with one stripped string per line, in stream order.
+        """
+        # For file-like objects, iterating yields exactly the lines that
+        # repeated readline() calls would return before the empty string
+        # that signals end of input.
+        return [line.strip() for line in self.stream]
+
+class BasewordParser:
+    """Parse a base-word list in which some lines start with a tab.
+
+    Lines without a leading tab are counted in ``cnt``; lines that start
+    with a tab are not (presumably they are forms listed under the preceding
+    base word -- verify against the data files).  Every accepted line adds
+    one entry, stripped and lower-cased.
+
+    Reading stops, without adding the current line, as soon as ``cnt``
+    exceeds a truthy ``limit``.  A falsy ``limit`` (``None`` or ``0``) reads
+    the whole stream.
+    """
+
+    # Optional leading tab (group 1) followed by the rest of the line (group 2).
+    BASEWORD_RE = regex.compile(u"^(\t)?(.*)$")
+
+    def __init__(self, stream, limit):
+        self.stream = stream
+        self.limit = limit
+        # Number of lines read so far; reported in error messages.
+        self.lineno = 0
+        # Number of lines seen so far that do not start with a tab.
+        self.cnt = 0
+
+    def parse(self):
+        """Return the entries read from the stream, in stream order.
+
+        Raises:
+            Exception: if a line does not match ``BASEWORD_RE``.
+        """
+        entries = []
+        raw = self.stream.readline()
+        # readline() returns an empty string only at end of input.
+        while len(raw) != 0:
+            self.lineno += 1
+            found = self.BASEWORD_RE.match(raw)
+            if not found:
+                raise Exception(
+                    "Line {:d}: '{:s}' wrong format\n".format(self.lineno, raw))
+            marker, text = found.group(1), found.group(2)
+            if not marker:
+                self.cnt += 1
+            # Deliberately tested on every line, not only on un-tabbed ones.
+            if self.limit and self.cnt > self.limit:
+                break
+            entries.append(text.strip().lower())
+            raw = self.stream.readline()
+        return entries
+
+class FreqlistParser:
+    """Parse a frequency list whose lines have the form ``NUM WORD``.
+
+    ``NUM`` is a run of ASCII digits followed by exactly one space; it is
+    only validated, not returned.  ``WORD`` is the rest of the line and is
+    returned stripped and lower-cased.
+
+    At most ``limit`` lines are read when ``limit`` is truthy; a falsy
+    ``limit`` (``None`` or ``0``) reads the whole stream.
+    """
+
+    # Digits (group 1), a single space, then the word (group 2).
+    FREQ_RE = regex.compile(u"^([0-9]+) (.*)$")
+
+    def __init__(self, stream, limit):
+        self.stream = stream
+        self.limit = limit
+        # Number of lines read so far; doubles as the limit counter.
+        self.lineno = 0
+
+    def parse(self):
+        """Return the words read from the stream, in stream order.
+
+        Raises:
+            Exception: if a line is not in ``NUM WORD`` format.
+        """
+        wlist = []
+        # The limit is tested before reading, so no line beyond it is
+        # consumed from the stream.
+        while not (self.limit and self.lineno >= self.limit):
+            line = self.stream.readline()
+            if not line:
+                break
+            self.lineno += 1
+            m = self.FREQ_RE.match(line)
+            if not m:
+                # Show the offending text: the original passed ``line`` to
+                # format() but the message had no placeholder for it.
+                raise Exception(
+                    "Line {:d}: '{:s}' is not in NUM WORD format\n".format(
+                        self.lineno, line.rstrip("\r\n")))
+            wlist.append(m.group(2).strip().lower())
+        return wlist
+
+if __name__ == '__main__':
+    # Command-line entry point.
+    #
+    # Prints each headword of $WORDLIST that passes the FREQLIST filters:
+    # a headword is kept when it occurs in at least one "+" list (or when no
+    # "+" list contributed any word) and in no "-" list.  Comparison is done
+    # on the lower-cased headword; the headword is printed unchanged.
+    USAGE = "Usage: PROG $WORDLIST [+/-][NUM][b/f]:FREQLIST..."
+
+    if len(sys.argv) < 3:
+        raise Exception(USAGE)
+    FINAME = sys.argv[1]
+
+    with io.open(FINAME, mode='r', buffering=1, encoding="utf-8") as stream:
+        HEADWORDS = WordlistParser(stream).parse()
+
+    # Sign, optional NUM limit, list kind ("b" -> BasewordParser,
+    # "f" -> FreqlistParser), ':' and the file name.  The trailing '$' makes
+    # a spec with a second ':' an error instead of silently truncating the
+    # file name at that colon.
+    COMMAND_RE = regex.compile("([-+])([0-9]+)?([bf]):([^:]+)$")
+
+    IN_SET = set()
+    EX_SET = set()
+
+    for spec in sys.argv[2:]:
+        m = COMMAND_RE.match(spec)
+        if not m:
+            raise Exception("Wrong FREQLIST spec: '{:s}'\n".format(spec) + USAGE)
+        sign, limit, mode, fname = m.groups()
+        # An absent NUM stays None, which the parsers treat as "no limit".
+        if limit:
+            limit = int(limit)
+        with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream:
+            if mode == "b":
+                parser = BasewordParser(stream, limit)
+            elif mode == "f":
+                parser = FreqlistParser(stream, limit)
+            else:
+                raise Exception("Unknown mode in specification...")
+            try:
+                wlist = set(parser.parse())
+            except Exception:
+                # Name the failing file on stderr so the notice cannot end
+                # up in redirected word-list output, then re-raise.
+                sys.stderr.write("Error during processing: {:s}\n".format(fname))
+                raise
+        if sign == "+":
+            IN_SET |= wlist
+        else:
+            EX_SET |= wlist
+
+    for headword in HEADWORDS:
+        # Entries containing a space, an apostrophe or a period are never
+        # printed.
+        if any(c in headword for c in " '."):
+            continue
+        normalized = headword.lower()
+        if (not IN_SET or normalized in IN_SET) and normalized not in EX_SET:
+            print(headword)
+