obsolete/oanc.py
author Oleksandr Gavenko <gavenkoa@gmail.com>
Sat, 25 Nov 2017 20:53:59 +0200
changeset 982 d5ee8644c18e
parent 635 445ee650a9ba
child 1348 8f1c99471195
permissions -rw-r--r--
Added new articles and relations. Fixed pronunciation.


# One-time setup: download the WordNet database before the first run:
# import nltk
# nltk.download()

from nltk.stem import WordNetLemmatizer

# Shared WordNet lemmatizer; the main loop calls lemmatize() on it for each
# accepted word.
lemmatiser = WordNetLemmatizer()

import enchant
# en_US spell-checking dictionary; the main loop uses its check() method to
# decide which words are kept.
# NOTE(review): this name shadows the builtin ``dict`` for the rest of the
# module.
dict = enchant.Dict('en_US')

# Running totals: maps a word to the sum of all counts added for it.
FREQ = {}


def add(word, cnt):
    """Add ``cnt`` to the total stored for ``word`` in ``FREQ``.

    A word that has not been seen before starts from zero.
    """
    FREQ[word] = FREQ.get(word, 0) + cnt

# Read the "count<TAB>word" list from oanc.txt, keep only the words accepted
# by the spell-checking dictionary, and accumulate every count under the
# word's lemma via add().
accepted = 0  # number of input lines that passed the filters below
with open('oanc.txt', 'r') as in_:  # closed even if a line fails to parse
    for line in in_:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        (cnt, word) = line.split('\t')
        cnt = int(cnt)
        # Keep only the part before the first apostrophe, so a form such as
        # "dog's" is counted as "dog".
        word = word.strip().partition("'")[0]
        if not word:
            continue
        if not dict.check(word):
            continue
        accepted += 1
        # Try the WordNet parts of speech in a fixed order (noun, verb,
        # adjective, adverb) and credit the first lemma that differs from
        # the surface form; if none differs, the word is its own lemma.
        # Stopping at the first hit avoids the remaining lookups.
        lemma = word
        for pos in ('n', 'v', 'a', 'r'):
            candidate = lemmatiser.lemmatize(word, pos=pos)
            if candidate != word:
                lemma = candidate
                break
        add(lemma, cnt)

print(accepted)


# Dump the accumulated totals to oanc.new.txt, one "count word" pair per
# line, in the dictionary's iteration order.
with open('oanc.new.txt', 'w') as out:  # closed even if a write fails
    for word, cnt in FREQ.items():
        out.write("{} {}\n".format(cnt, word))