obsolete/oanc.py
changeset 635 445ee650a9ba
child 1348 8f1c99471195
equal deleted inserted replaced
634:4f97d314c5e5 635:445ee650a9ba
       
     1 
       
     2 # Download WordNet database one time:
       
     3 # import nltk
       
     4 # nltk.download()
       
     5 
       
     6 from nltk.stem import WordNetLemmatizer
       
     7 
       
     8 lemmatiser = WordNetLemmatizer()
       
     9 
       
    10 import enchant
       
    11 dict = enchant.Dict('en_US')
       
    12 
       
    13 FREQ = {}
       
    14 def add(word, cnt):
       
    15     if word in FREQ:
       
    16         FREQ[word] += cnt
       
    17     else:
       
    18         FREQ[word] = cnt
       
    19 
       
    20 in_ = open('oanc.txt', 'r')
       
    21 sum = 0
       
    22 for line in in_:
       
    23     (cnt, word) = line.split('\t')
       
    24     cnt = int(cnt)
       
    25     word = word.strip()
       
    26     quote = word.find("'")
       
    27     if quote >= 0:
       
    28         word = word[:quote]
       
    29     if len(word) == 0:
       
    30         continue
       
    31     if not dict.check(word):
       
    32         continue
       
    33     word_n = lemmatiser.lemmatize(word, pos='n')
       
    34     word_v = lemmatiser.lemmatize(word, pos='v')
       
    35     word_a = lemmatiser.lemmatize(word, pos='a')
       
    36     word_r = lemmatiser.lemmatize(word, pos='r')
       
    37     sum += 1
       
    38     if word_n != word:
       
    39         add(word_n, cnt)
       
    40         continue
       
    41     if word_v != word:
       
    42         add(word_v, cnt)
       
    43         continue
       
    44     if word_a != word:
       
    45         add(word_a, cnt)
       
    46         continue
       
    47     if word_r != word:
       
    48         add(word_r, cnt)
       
    49         continue
       
    50     add(word, cnt)
       
    51 in_.close()
       
    52 
       
    53 print(sum)
       
    54 
       
    55 
       
    56 out = open('oanc.new.txt', 'w')
       
    57 for word, cnt in FREQ.items():
       
    58     out.write("{} {}\n".format(cnt, word))
       
    59 out.close()