equal
deleted
inserted
replaced
|
1 |
|
# Download the WordNet database once before the first run:
# import nltk
# nltk.download()
|
5 |
|
import enchant
from nltk.stem import WordNetLemmatizer

# WordNet lemmatiser used to reduce each accepted word to a base form.
lemmatiser = WordNetLemmatizer()

# US-English spelling dictionary; words it does not recognise are dropped.
# NOTE: this name shadows the builtin ``dict``. It is kept unchanged because
# the rest of the script refers to the dictionary as ``dict``.
dict = enchant.Dict('en_US')
|
12 |
|
# Accumulated frequency table: word (or its lemma) -> summed count.
FREQ = {}


def add(word, cnt):
    """Add ``cnt`` to the running total stored for ``word`` in ``FREQ``.

    A word that has not been seen before starts from zero, so the first
    call simply stores ``cnt``.

    Args:
        word: Key under which the count is accumulated.
        cnt: Amount to add to the word's current total.
    """
    FREQ[word] = FREQ.get(word, 0) + cnt
|
19 |
|
# Read the "count<TAB>word" lines from oanc.txt, keep only the words the
# spelling dictionary accepts, and accumulate each word's count under its
# lemma.  ``total`` counts the accepted input lines.
total = 0

# The ``with`` block guarantees the input file is closed even if a line
# fails to parse part-way through.
with open('oanc.txt', 'r') as in_:
    for line in in_:
        # A blank line (e.g. at the end of the file) has no tab to split on.
        if not line.strip():
            continue

        cnt, word = line.split('\t')
        cnt = int(cnt)

        # Drop everything from the first apostrophe on ("dog's" -> "dog").
        word = word.strip().partition("'")[0]

        # The emptiness test must come first: nothing is left to look up
        # when the entry started with an apostrophe.
        if not word:
            continue
        if not dict.check(word):
            continue

        total += 1

        # Try the WordNet parts of speech in a fixed order (noun, verb,
        # adjective, adverb) and use the first lemma that differs from the
        # word itself; if none differs, the word is counted as-is.
        lemma = word
        for pos in ('n', 'v', 'a', 'r'):
            candidate = lemmatiser.lemmatize(word, pos=pos)
            if candidate != word:
                lemma = candidate
                break

        add(lemma, cnt)

print(total)
|
54 |
|
55 |
|
# Write the accumulated table to oanc.new.txt, one "count word" entry per
# line.  The ``with`` block closes the file even if a write fails.
# NOTE(review): the output separator is a space while the input is split on
# a tab -- confirm this difference is intended.
with open('oanc.new.txt', 'w') as out:
    for word, cnt in FREQ.items():
        out.write("{} {}\n".format(cnt, word))