obsolete/oanc.py
changeset 635 445ee650a9ba
child 1348 8f1c99471195
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/obsolete/oanc.py	Tue Nov 08 16:11:20 2016 +0200
@@ -0,0 +1,59 @@
+
+# Download the WordNet database once before the first run:
+# import nltk
+# nltk.download()
+
+from nltk.stem import WordNetLemmatizer
+
+lemmatiser = WordNetLemmatizer()  # used below to reduce words to lemmas
+
+import enchant
+dict = enchant.Dict('en_US')  # en_US spell checker; NOTE: shadows builtin dict
+
+FREQ = {}  # lemma (or unchanged word) -> summed count, filled by add()
+def add(word, cnt):
+    """Accumulate ``cnt`` into the module-level FREQ entry for ``word``.
+
+    A word not yet in FREQ is inserted with ``cnt`` as its value."""
+    FREQ[word] = FREQ[word] + cnt if word in FREQ else cnt
+
+# Read "count<TAB>word" records from oanc.txt. Each word is cut at its
+# first apostrophe, dropped unless the en_US spell checker accepts it,
+# and otherwise has its count added to FREQ under its WordNet lemma.
+accepted = 0  # number of input words that passed the spell check
+# The with-statement closes the file even when a malformed line raises.
+with open('oanc.txt', 'r') as in_:
+    for line in in_:
+        if not line.strip():
+            # A blank line (e.g. at the end of the file) carries no
+            # record; skip it rather than fail on the unpack below.
+            continue
+        # Exactly one tab per record is expected; anything else raises.
+        (cnt, word) = line.split('\t')
+        cnt = int(cnt)
+        # Keep only the text before the first apostrophe ("dog's" ->
+        # "dog"); a word that starts with one becomes empty and is dropped.
+        word = word.strip().partition("'")[0]
+        if not word or not dict.check(word):
+            continue
+        accepted += 1
+        # Try noun, verb, adjective, adverb in that order and stop at the
+        # first part of speech whose lemma differs from the word, so the
+        # remaining lookups are not performed needlessly.
+        for pos in ('n', 'v', 'a', 'r'):
+            lemma = lemmatiser.lemmatize(word, pos=pos)
+            if lemma != word:
+                break
+        else:
+            # No part of speech reduced the word: count it as itself.
+            lemma = word
+        add(lemma, cnt)
+
+# Report how many input words were accepted (before merging by lemma).
+print(accepted)
+
+
+# Dump the merged table as "count word" lines, one entry per line.
+with open('oanc.new.txt', 'w') as out:
+    for word, cnt in FREQ.items():
+        out.write("{} {}\n".format(cnt, word))