py/gadict_freq.py
changeset 723 53095b480a73
parent 646 2d488cfc4c0c
child 757 5417f2102dc5
--- a/py/gadict_freq.py	Thu Dec 29 00:44:28 2016 +0200
+++ b/py/gadict_freq.py	Thu Dec 29 01:17:44 2016 +0200
@@ -19,7 +19,7 @@
             wlist.append(line)
         return wlist
 
-class HeadVarParser:
+class WordformParser:
 
     BASEVAR_RE = regex.compile(u"^(\t)?(.*)$")
 
@@ -80,17 +80,12 @@
         raise Exception(USAGE)
     FINAME = sys.argv[1]
 
-    with io.open(FINAME, mode='r', buffering=1, encoding="utf-8") as stream:
-        parser = WordlistParser(stream)
-        HEADWORDS = parser.parse()
-
     COMMAND_RE = regex.compile("([-+])([0-9]+)?([bf]):([^:]+)")
 
-
     IN_SET = set()
     EX_SET = set()
 
-    for idx in range(2, len(sys.argv)):
+    for idx in range(1, len(sys.argv)):
         spec = sys.argv[idx]
         m = COMMAND_RE.match(spec)
         if not m:
@@ -102,7 +97,7 @@
             limit = int(limit)
         with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream:
             if mode == "b":
-                parser = HeadVarParser(stream, limit)
+                parser = WordformParser(stream, limit)
             elif mode == "f":
                 parser = FreqlistParser(stream, limit)
             else:
@@ -112,16 +107,14 @@
             except:
                 print("Error during processing: {:s}".format(fname))
                 raise
-        wlist = set(wlist)
+        wlist = set([w.lower() for w in wlist])
         if m.group(1) == "+":
             IN_SET |= wlist
         else:
             EX_SET |= wlist
 
-    for headword in HEADWORDS:
-        if any(c in headword for c in " '."):
-            continue
-        normilized= headword.lower()
-        if (len(IN_SET) == 0 or normilized in IN_SET) and not normilized in EX_SET:
-            print(headword)
+    # Sort the result: set iteration order is arbitrary, so unsorted output
+    # could differ between runs. Words containing " '." are not filtered out.
+    for headword in sorted(IN_SET - EX_SET):
+        print(headword)