obsolete/oanc.py
author Oleksandr Gavenko <gavenkoa@gmail.com>
Tue, 05 Dec 2023 13:24:46 +0200
changeset 1353 dcda231188dc
parent 1348 8f1c99471195
permissions -rw-r--r--
New articles.
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
635
445ee650a9ba Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     1
445ee650a9ba Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     2
# Download the WordNet database once, before the first run:
445ee650a9ba Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     3
# import nltk
445ee650a9ba Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     4
# nltk.download()
445ee650a9ba Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     5
445ee650a9ba Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     6
from nltk.stem import WordNetLemmatizer
445ee650a9ba Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     7
445ee650a9ba Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     8
# Single WordNet lemmatizer instance, reused for every lookup in the loop below.
lemmatiser = WordNetLemmatizer()
445ee650a9ba Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     9
1348
8f1c99471195 enchant was removed from Cygwin.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 635
diff changeset
    10
# enchant is no longer packaged for Cygwin; it used to be installed with:
# apt-cyg install python39-enchant
8f1c99471195 enchant was removed from Cygwin.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 635
diff changeset
    11
# import enchant
8f1c99471195 enchant was removed from Cygwin.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 635
diff changeset
    12
# dict = enchant.Dict('en_US')
635
445ee650a9ba Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    13
445ee650a9ba Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    14
# Accumulated frequency table: word (or its lemma) -> total occurrence count.
FREQ = {}


def add(word, cnt):
    """Add ``cnt`` to the running total stored for ``word`` in ``FREQ``.

    A word seen for the first time starts from zero, so its total becomes
    ``cnt``.

    Args:
        word: Key to update in the module-level ``FREQ`` table.
        cnt: Amount to add to the word's total.
    """
    # dict.get with a default replaces the explicit membership test and
    # avoids looking the key up twice.
    FREQ[word] = FREQ.get(word, 0) + cnt
445ee650a9ba Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    20
445ee650a9ba Add OANC frequency wordlist.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    21
# Optional spell checker used to drop tokens that are not English words.
# The module-level enchant setup is commented out (the package was removed
# from Cygwin), which left the bare name ``dict`` bound to the builtin type,
# so ``dict.check(word)`` raised AttributeError on the first non-empty token.
# Use enchant when it is installed and skip the filter otherwise.
try:
    import enchant
except ImportError:
    speller = None
else:
    speller = enchant.Dict('en_US')

# Number of input tokens that passed the filters (printed below).
total = 0

# Read "<count>\t<word>" lines and merge the counts of inflected forms into
# their lemma.
with open('oanc.txt', 'r') as in_:
    for line in in_:
        if not line.strip():
            # Tolerate blank lines instead of failing on the tuple unpack.
            continue
        (cnt, word) = line.split('\t')
        cnt = int(cnt)
        # Keep only the part before the first apostrophe ("dog's" -> "dog").
        word = word.strip().partition("'")[0]
        if not word:
            continue
        if speller is not None and not speller.check(word):
            continue
        total += 1
        # Try noun, verb, adjective and adverb in that order and credit the
        # count to the first lemma that differs from the surface form; when
        # none differs, the word itself is counted.
        for pos in ('n', 'v', 'a', 'r'):
            lemma = lemmatiser.lemmatize(word, pos=pos)
            if lemma != word:
                break
        else:
            lemma = word
        add(lemma, cnt)

print(total)

# Dump the merged table as "<count> <word>" lines.
with open('oanc.new.txt', 'w') as out:
    for word, cnt in FREQ.items():
        out.write("{} {}\n".format(cnt, word))