--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/obsolete/oanc.py Tue Nov 08 16:11:20 2016 +0200
@@ -0,0 +1,62 @@
+
+# One-time setup: the lemmatiser needs the WordNet corpus, fetched with
+#     import nltk
+#     nltk.download()
+
+import enchant
+from nltk.stem import WordNetLemmatizer
+
+lemmatiser = WordNetLemmatizer()
+
+# Named "speller" rather than "dict" so the built-in type is not shadowed.
+speller = enchant.Dict('en_US')
+
+# Maps each lemma to the summed count of every accepted form reduced to it.
+FREQ = {}
+
+# Parts of speech passed to the lemmatiser, in the order they are tried.
+POS_ORDER = ('n', 'v', 'a', 'r')
+
+
+def add(word, cnt):
+    """Add cnt to the total stored for word in FREQ, starting from zero."""
+    FREQ[word] = FREQ.get(word, 0) + cnt
+
+
+def _lemma(word):
+    """Return the first lemma that differs from word, else word itself."""
+    for pos in POS_ORDER:
+        candidate = lemmatiser.lemmatize(word, pos=pos)
+        if candidate != word:
+            return candidate
+    return word
+
+
+# Number of input lines whose word passed the spell check.
+accepted = 0
+# The with-block closes the input even when a malformed line raises.
+with open('oanc.txt', 'r') as in_:
+    for line in in_:
+        if not line.strip():
+            # A blank line carries no count/word pair; skip it.
+            continue
+        (cnt, word) = line.split('\t')
+        cnt = int(cnt)
+        word = word.strip()
+        # Keep only the text before the first apostrophe.
+        quote = word.find("'")
+        if quote >= 0:
+            word = word[:quote]
+        if not word:
+            continue
+        if not speller.check(word):
+            continue
+        accepted += 1
+        add(_lemma(word), cnt)
+
+print(accepted)
+
+
+with open('oanc.new.txt', 'w') as out:
+    for word, cnt in FREQ.items():
+        out.write("{} {}\n".format(cnt, word))