gadict: comparison obsolete/oanc.py

equal deleted inserted replaced

-:4f97d314c5e5
+:445ee650a9ba
# One-time setup: the WordNet database must be downloaded before first use:
#     import nltk
#     nltk.download()
import enchant
from nltk.stem import WordNetLemmatizer

# Shared WordNet lemmatiser used to reduce words to their base form.
lemmatiser = WordNetLemmatizer()

# en_US spell-checking dictionary used to filter the input words.
# NOTE: this name shadows the builtin `dict`; it is kept because the
# processing loop further down calls `dict.check(word)`.
dict = enchant.Dict('en_US')
# Accumulated frequency table: word (or its lemma) -> total count.
FREQ = {}


def add(word, cnt):
    """Add *cnt* to the running total stored for *word* in ``FREQ``.

    A word that is not yet present starts from zero, so the first call
    for a word simply stores *cnt*.
    """
    FREQ[word] = FREQ.get(word, 0) + cnt
# Read 'oanc.txt' (one "<count>\t<word>" entry per line), keep the words the
# spell checker accepts, and accumulate their counts under a lemma via add().

# Part-of-speech tags tried in order when lemmatising: noun, verb,
# adjective, adverb. The first lemma that differs from the word wins.
_POS_ORDER = ('n', 'v', 'a', 'r')

# Number of entries that passed the spell-check filter. (Formerly named
# `sum`, which shadowed the builtin of the same name.)
accepted = 0

# The context manager closes the input file even if a line fails to parse.
with open('oanc.txt', 'r') as in_:
    for line in in_:
        if not line.strip():
            # Blank lines (e.g. at the end of the file) carry no entry;
            # unpacking them below would raise ValueError.
            continue

        # Any line that is not exactly two tab-separated fields, or whose
        # first field is not an integer, still raises ValueError.
        cnt, word = line.split('\t')
        cnt = int(cnt)
        word = word.strip()

        # Keep only the part before the first apostrophe.
        quote = word.find("'")
        if quote >= 0:
            word = word[:quote]

        # Nothing left once the apostrophe suffix is removed.
        if not word:
            continue
        # Drop anything the en_US dictionary does not recognise.
        if not dict.check(word):
            continue

        accepted += 1

        # Fall back to the word itself when no part of speech changes it.
        lemma = word
        for pos in _POS_ORDER:
            candidate = lemmatiser.lemmatize(word, pos=pos)
            if candidate != word:
                lemma = candidate
                break
        add(lemma, cnt)

print(accepted)
# Write the accumulated table to 'oanc.new.txt', one "<count> <word>" entry
# per line. The context manager closes the file even if a write fails.
# NOTE(review): the output is space-separated whereas the input file is
# tab-separated -- confirm this difference is intended.
with open('oanc.new.txt', 'w') as out:
    for word, cnt in FREQ.items():
        out.write("{} {}\n".format(cnt, word))

changeset 635	445ee650a9ba
child 1348	8f1c99471195