class WordformGroupParser:
    """Load several wordform files selected by a glob pattern, keyed by tag.

    The pattern must contain exactly one ``*``.  For every matching file the
    part of the file name that the asterisk matched becomes the tag, and the
    file is read with ``WordformParser`` (tab-prefixed lines included).

    NOTE: the tag is cut out of the file name by offset arithmetic, which
    assumes every pattern character other than ``*`` matches exactly one
    character of the file name.  Patterns using ``[...]`` character classes
    break that assumption and yield wrong tags.
    """

    def __init__(self, globpatt: str) -> None:
        """Remember the pattern and the position of its asterisk.

        Args:
            globpatt: glob pattern with exactly one ``*``.

        Raises:
            Exception: if the pattern does not contain exactly one ``*``.
        """
        if globpatt.count('*') != 1:
            raise Exception("Glob pattern should have exactly one asterisk: {:s}".format(globpatt))
        self.globpatt = globpatt
        # Offset of the asterisk; the tag starts at this offset in a match.
        self.astOff = globpatt.index('*')
        # Initialised for callers; parse() in this class never updates it.
        self.cnt = 0

    def parse(self) -> dict[str, set[str]]:
        """Parse every file matching the pattern.

        Returns:
            Mapping from tag (the text matched by ``*``) to the set of words
            returned by ``WordformParser.parse()`` for that file.  The mapping
            is empty when nothing matches the pattern.
        """
        # Number of pattern characters after the asterisk; the same count of
        # trailing characters in each matched name is not part of the tag.
        suffix_len = len(self.globpatt) - self.astOff - 1
        wmap = {}
        # glob.glob() returns matches in arbitrary order; sorting keeps the
        # insertion order of the result reproducible between runs.
        for fname in sorted(glob.glob(self.globpatt)):
            tag = fname[self.astOff:len(fname) - suffix_len]
            with io.open(fname, mode='r', encoding="utf-8") as stream:
                parser = WordformParser(stream, ignore_tab=False)
                wmap[tag] = set(parser.parse())
        return wmap