--- a/py/gadict_freq.py Fri Jul 21 23:19:15 2023 +0300
+++ b/py/gadict_freq.py Fri Jul 21 23:23:31 2023 +0300
@@ -1,5 +1,6 @@
import sys
+import glob
import codecs
import io
import re
@@ -23,13 +24,14 @@
BASEVAR_RE = re.compile(u"^(\t)?(.*)$", re.UNICODE)
- def __init__(self, stream, limit = None):
+ def __init__(self, stream, limit = None, ignore_tab = True):
self.stream = stream
self.limit = limit
+ self.ignore_tab = ignore_tab
self.lineno = 0
self.cnt = 0
- def parse(self):
+ def parse(self) -> list[str]:
wlist = []
while True:
line = self.stream.readline()
@@ -40,7 +42,7 @@
if not m:
raise Exception("Line {:d}: '{:s}' wrong format\n".format(self.lineno, line))
tab = m.group(1)
- if tab:
+ if tab and self.ignore_tab:
continue
self.cnt += 1
if self.limit and self.cnt > self.limit:
@@ -49,6 +51,25 @@
wlist.append(headword)
return wlist
+class WordformGroupParser:
+
+ def __init__(self, globpatt):
+ if globpatt.count('*') != 1:
+ raise Exception("Glob pattern should have exactly one asterisk: {:s}".format(globpatt))
+ self.globpatt = globpatt
+ self.astOff = globpatt.index('*')
+ self.cnt = 0
+
+ def parse(self) -> dict[str, set[str]]:
+ wmap = dict()
+ for fname in glob.glob(self.globpatt):
+ beg, end = self.astOff, self.astOff + 1 + len(fname) - len(self.globpatt)
+ tag = fname[beg:end]
+ with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream:
+ parser = WordformParser(stream, ignore_tab=False)
+ wmap[tag] = set(parser.parse())
+ return wmap
+
class FreqlistParser:
FREQ_RE = re.compile(u"^([0-9]+) (.*)$", re.UNICODE)