|
1 |
|
2 import sys |
|
3 import codecs |
|
4 import io |
|
5 import regex |
|
6 |
|
class WordlistParser:
    """Read a plain word list: one entry per line of a line-oriented stream."""

    def __init__(self, stream):
        """Remember *stream*; it only needs to provide ``readline()``."""
        self.stream = stream

    def parse(self):
        """Return every line of the stream with surrounding whitespace removed.

        Reading stops at the first zero-length result from ``readline()``
        (end of stream).  Lines that are blank after stripping are kept as
        empty strings, and the original letter case is preserved.
        """
        entries = []
        read_next = self.stream.readline
        raw = read_next()
        while len(raw) != 0:
            entries.append(raw.strip())
            raw = read_next()
        return entries
|
21 |
|
class BasewordParser:
    """Read a baseword list in which lines may start with a single tab.

    Lines without a leading tab are counted; once more than ``limit`` such
    lines have been seen, parsing stops.  Presumably tab-prefixed lines are
    derived forms belonging to the preceding unindented word -- confirm
    against the input data.
    """

    # Optional leading tab (group 1) followed by the rest of the line (group 2).
    BASEWORD_RE = regex.compile(u"^(\t)?(.*)$")

    def __init__(self, stream, limit):
        """Store the stream and the maximum number of unindented lines.

        A falsy ``limit`` (``None`` or ``0``) disables the cut-off.
        """
        self.stream = stream
        self.limit = limit
        self.lineno = 0   # number of lines read so far
        self.cnt = 0      # number of lines seen without a leading tab

    def parse(self):
        """Return the lower-cased, stripped text of every accepted line.

        Raises:
            Exception: if a line does not match ``BASEWORD_RE``.
        """
        collected = []
        raw = self.stream.readline()
        while len(raw) != 0:
            self.lineno += 1
            found = self.BASEWORD_RE.match(raw)
            if found is None:
                raise Exception("Line {:d}: '{:s}' wrong format\n".format(self.lineno, raw))
            has_tab = bool(found.group(1))
            if not has_tab:
                self.cnt += 1
                # The line that pushes the count past the limit is itself
                # discarded, together with everything after it.
                if self.limit and self.cnt > self.limit:
                    break
            text = found.group(2)
            collected.append(text.strip().lower())
            raw = self.stream.readline()
        return collected
|
50 |
|
class FreqlistParser:
    """Read a frequency list whose lines look like ``NUM WORD``.

    ``NUM`` is a run of ASCII digits, followed by exactly one space and then
    the entry text.  Only the entry text is kept; the number is discarded.
    """

    # Group 1: the digits; group 2: everything after the single space.
    FREQ_RE = regex.compile(u"^([0-9]+) (.*)$")

    def __init__(self, stream, limit):
        """Store the stream and the maximum number of lines to read.

        A falsy ``limit`` (``None`` or ``0``) means the whole stream is read.
        """
        self.stream = stream
        self.limit = limit
        self.lineno = 0   # number of lines read so far

    def parse(self):
        """Return the lower-cased, stripped entry text of each line read.

        At most ``limit`` lines are consumed when a limit is set.

        Raises:
            ValueError: if a line is not in ``NUM WORD`` format.  The message
                names the line number and shows the offending text.
        """
        wlist = []
        while True:
            # The limit is checked before reading so no extra line is consumed.
            if self.limit and self.lineno >= self.limit:
                break
            line = self.stream.readline()
            if not line:
                break
            self.lineno += 1
            m = self.FREQ_RE.match(line)
            if not m:
                # Show the offending text; the line terminator is dropped so
                # the message stays on one line.
                raise ValueError(
                    "Line {:d}: {!r} is not in NUM WORD format\n".format(
                        self.lineno, line.rstrip("\r\n")))
            wlist.append(m.group(2).strip().lower())
        return wlist
|
75 |
|
if __name__ == '__main__':
    # Command-line entry point.
    #
    # argv[1] is a word list; every further argument is a list specification
    # of the form SIGN[NUM](b|f):FILE, where SIGN is "+" (include) or "-"
    # (exclude), NUM is an optional limit handed to the parser, "b" selects
    # BasewordParser and "f" selects FreqlistParser.  A head word is printed
    # when it is in the include set (or no include set was built) and is not
    # in the exclude set.
    USAGE = "Usage: PROG $WORDLIST [+/-][NUM][b/f]:FREQLIST..."

    if len(sys.argv) < 3:
        raise Exception(USAGE)
    FINAME = sys.argv[1]

    with io.open(FINAME, mode='r', buffering=1, encoding="utf-8") as stream:
        parser = WordlistParser(stream)
        HEADWORDS = parser.parse()

    COMMAND_RE = regex.compile("([-+])([0-9]+)?([bf]):([^:]+)")

    IN_SET = set()
    EX_SET = set()

    for spec in sys.argv[2:]:
        m = COMMAND_RE.match(spec)
        if not m:
            raise Exception("Wrong FREQLIST spec: '{:s}'\n".format(spec) + USAGE)
        sign, limit, mode, fname = m.groups()
        if limit:
            limit = int(limit)
        with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream:
            if mode == "b":
                parser = BasewordParser(stream, limit)
            elif mode == "f":
                parser = FreqlistParser(stream, limit)
            else:
                # Not reachable while COMMAND_RE only admits "b" and "f";
                # kept as a guard in case the pattern is extended.
                raise Exception("Unknown mode in specification...")
            try:
                wlist = parser.parse()
            except Exception:
                # Diagnostics go to stderr so they never mix with the word
                # list written to stdout; the original error still propagates.
                sys.stderr.write("Error during processing: {:s}\n".format(fname))
                raise
        wlist = set(wlist)
        if sign == "+":
            IN_SET |= wlist
        else:
            EX_SET |= wlist

    for headword in HEADWORDS:
        # Entries containing a space, an apostrophe or a full stop are skipped.
        if any(c in headword for c in " '."):
            continue
        normalized = headword.lower()
        if (not IN_SET or normalized in IN_SET) and normalized not in EX_SET:
            print(headword)
|
127 |