obsolete/oanc.py
author Oleksandr Gavenko <gavenkoa@gmail.com>
Mon, 27 Nov 2017 10:24:20 +0200
changeset 983 a7c7af336365
parent 635 445ee650a9ba
child 1348 8f1c99471195
permissions -rw-r--r--
Added new articles and relations.
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
635
445ee650a9ba Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     1
445ee650a9ba Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     2
# One-time setup: the WordNet database used by the lemmatiser must be
# downloaded before the first run, e.g. from an interactive session:
#
#   import nltk
#   nltk.download()

import enchant
from nltk.stem import WordNetLemmatizer

# Lemmatiser used below to reduce each accepted word to a base form.
lemmatiser = WordNetLemmatizer()

# en_US spell-checking dictionary; words it rejects are dropped from the
# frequency list.
# NOTE: this name shadows the built-in ``dict``.  It is kept unchanged because
# the processing loop further down refers to it by this name.
dict = enchant.Dict('en_US')
445ee650a9ba Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    12
445ee650a9ba Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    13
# Aggregated frequencies: maps a (lemmatised) word to its summed count.
FREQ = {}


def add(word, cnt):
    """Add *cnt* to the running total stored for *word* in ``FREQ``.

    A word that has not been seen yet starts from zero, so the first call
    simply stores *cnt*.

    Args:
        word: Key under which the count is accumulated.
        cnt: Amount to add to the word's current total.
    """
    FREQ[word] = FREQ.get(word, 0) + cnt
445ee650a9ba Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    19
445ee650a9ba Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    20
# Read the raw "<count>\t<word>" list, filter it, and fold every accepted
# word into FREQ under its lemma.
#
# ``accepted`` counts the input entries that survived filtering; it is printed
# afterwards as a quick sanity check.
accepted = 0
with open('oanc.txt', 'r') as in_:
    for line in in_:
        # A blank line (e.g. a trailing newline at the end of the file)
        # carries no entry and would otherwise break the unpacking below.
        if not line.strip():
            continue
        (cnt, word) = line.split('\t')
        cnt = int(cnt)
        # Keep only the part before the first apostrophe ("don't" -> "don").
        word = word.strip().partition("'")[0]
        if not word:
            continue
        # Drop anything the spell-checking dictionary does not recognise.
        if not dict.check(word):
            continue
        accepted += 1
        # Try the parts of speech in priority order (noun, verb, adjective,
        # adverb) and use the first lemma that differs from the word itself;
        # if none differs, the word is counted as-is.
        lemma = word
        for pos in ('n', 'v', 'a', 'r'):
            candidate = lemmatiser.lemmatize(word, pos=pos)
            if candidate != word:
                lemma = candidate
                break
        add(lemma, cnt)

print(accepted)
445ee650a9ba Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    54
445ee650a9ba Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    55
445ee650a9ba Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    56
# Write the aggregated frequencies, one "<count> <word>" entry per line
# (space-separated).  The ``with`` block guarantees the file is flushed and
# closed even if a write fails part-way through.
with open('oanc.new.txt', 'w') as out:
    for word, cnt in FREQ.items():
        out.write("{} {}\n".format(cnt, word))