author | Oleksandr Gavenko <gavenkoa@gmail.com> |
Fri, 11 Nov 2016 00:19:47 +0200 | |
changeset 667 | 5f69f0776c37 |
parent 635 | 445ee650a9ba |
child 1348 | 8f1c99471195 |
permissions | -rw-r--r-- |
635
445ee650a9ba
Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
1 |
|
445ee650a9ba
Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
2 |
# Download WordNet database one time:
# import nltk
# nltk.download()
445ee650a9ba
Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
5 |
|
445ee650a9ba
Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
6 |
from nltk.stem import WordNetLemmatizer |
445ee650a9ba
Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
7 |
|
445ee650a9ba
Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
8 |
# Shared lemmatiser instance; the main loop calls lemmatiser.lemmatize(word, pos=...)
# once per part of speech.
lemmatiser = WordNetLemmatizer()

import enchant

# en_US enchant dictionary; the main loop uses dict.check(word) to drop
# tokens it does not accept.
# NOTE(review): this name shadows the builtin ``dict``. It is kept as-is
# because the processing loop further down refers to it by this name.
dict = enchant.Dict('en_US')

# Accumulated counts keyed by word/lemma; populated through add().
FREQ = {}
445ee650a9ba
Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
14 |
def add(word, cnt):
    """Add ``cnt`` to the running total kept for ``word`` in ``FREQ``.

    A word that is not yet present starts from zero, so its total
    becomes ``cnt``. Nothing is returned; ``FREQ`` is updated in place.
    """
    FREQ[word] = FREQ.get(word, 0) + cnt
445ee650a9ba
Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
19 |
|
445ee650a9ba
Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
20 |
# Read "<count>\t<word>" lines from oanc.txt, keep the words accepted by the
# enchant dictionary, and fold every count into FREQ under the word's lemma.
# ``accepted`` counts the input lines that passed the dictionary check.
accepted = 0
with open('oanc.txt', 'r') as in_:
    for line in in_:
        # Exactly one tab per line is expected; anything else raises
        # ValueError here (or in int() below) rather than being skipped.
        (cnt, word) = line.split('\t')
        cnt = int(cnt)
        word = word.strip()
        # Keep only the part before the first apostrophe ("don't" -> "don").
        quote = word.find("'")
        if quote >= 0:
            word = word[:quote]
        # A word that began with an apostrophe is now empty: nothing to count.
        if not word:
            continue
        if not dict.check(word):
            continue
        accepted += 1
        # Try noun, verb, adjective, adverb in that order and take the first
        # lemma that differs from the surface form; if none differs, the
        # word is counted under itself.
        for pos in ('n', 'v', 'a', 'r'):
            lemma = lemmatiser.lemmatize(word, pos=pos)
            if lemma != word:
                break
        else:
            lemma = word
        add(lemma, cnt)

print(accepted)
445ee650a9ba
Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
54 |
|
445ee650a9ba
Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
55 |
|
445ee650a9ba
Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff
changeset
|
56 |
# Dump the accumulated table to oanc.new.txt, one "<count> <word>" line per
# entry, in FREQ's iteration order. The ``with`` block closes the file even
# if a write fails.
with open('oanc.new.txt', 'w') as out:
    for word, cnt in FREQ.items():
        out.write("{} {}\n".format(cnt, word))