py/gadict_freq.py
changeset 1347 272ec25b6f12
parent 1004 181301cc2c0c
--- a/py/gadict_freq.py	Fri Jul 21 23:19:15 2023 +0300
+++ b/py/gadict_freq.py	Fri Jul 21 23:23:31 2023 +0300
@@ -1,5 +1,6 @@
 
 import sys
+import glob
 import codecs
 import io
 import re
@@ -23,13 +24,14 @@
 
     BASEVAR_RE = re.compile(u"^(\t)?(.*)$", re.UNICODE)
 
-    def __init__(self, stream, limit = None):
+    def __init__(self, stream, limit = None, ignore_tab = True):  # ignore_tab: when true, parse() skips lines matched with a leading tab
         self.stream = stream
         self.limit = limit
+        self.ignore_tab = ignore_tab  # consulted by parse() before a line is counted
         self.lineno = 0
         self.cnt = 0
 
-    def parse(self):
+    def parse(self) -> list[str]:
         wlist = []
         while True:
             line = self.stream.readline()
@@ -40,7 +42,7 @@
             if not m:
                 raise Exception("Line {:d}: '{:s}' wrong format\n".format(self.lineno, line))
             tab = m.group(1)
-            if tab:
+            if tab and self.ignore_tab:
                 continue
             self.cnt += 1
             if self.limit and self.cnt > self.limit:
@@ -49,6 +51,25 @@
             wlist.append(headword)
         return wlist
 
+class WordformGroupParser:
+    """Collect word sets from every file matched by a glob pattern with exactly one asterisk."""
+
+    def __init__(self, globpatt):
+        """Store ``globpatt`` and the offset of its ``*``; raise Exception unless there is exactly one."""
+        if globpatt.count('*') != 1:
+            raise Exception("Glob pattern should have exactly one asterisk: {:s}".format(globpatt))
+        self.globpatt, self.astOff, self.cnt = globpatt, globpatt.index('*'), 0
+
+    def parse(self) -> dict[str, set[str]]:
+        """Map the text the ``*`` matched in each file name to the set of entries parsed from that file."""
+        tail_len = len(self.globpatt) - self.astOff - 1  # characters that follow the asterisk
+        groups = {}
+        for path in glob.glob(self.globpatt):
+            tag = path[self.astOff:len(path) - tail_len]  # strip the literal prefix and suffix
+            with io.open(path, mode='r', buffering=1, encoding="utf-8") as stream:
+                groups[tag] = set(WordformParser(stream, ignore_tab=False).parse())
+        return groups
+
 class FreqlistParser:
 
     FREQ_RE = re.compile(u"^([0-9]+) (.*)$", re.UNICODE)