#!/usr/bin/python2

# Randy Armknecht
# 19 Feb 2014
#
# Playing around with the Natural Language Processing Toolkit (nltk)
# http://www.nltk.org/
#
from __future__ import division
import sys
import nltk
from nltk.corpus import cmudict
from nltk.corpus import stopwords
from pprint import pprint
from hyphen import Hyphenator as hy   # imported but not used anywhere below

DICT = cmudict.dict()
SYLLABLE_AVG = 1.66   # defined but not used anywhere below
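
# Note: the corpora and tokenizer models used below are downloaded separately
# from nltk itself; a one-time setup along these lines is assumed:
#
#   import nltk
#   nltk.download('cmudict')     # for cmudict.dict()
#   nltk.download('stopwords')   # for stopwords.words('english')
#   nltk.download('words')       # for nltk.corpus.words in unusual_words()
#   nltk.download('punkt')       # for sent_tokenize / word_tokenize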
# START - Implemented from http://www.slideshare.net/pbpimpale/natural-language-toolkit-nltk-basics
def unusual_words(text):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())

    unusual = text_vocab.difference(english_vocab)
    return sorted(unusual)

def problem_words(text):
    return sorted(set(w.lower() for w in text if not w.isalpha()))

def content_fraction(text):
    stops = nltk.corpus.stopwords.words('english')
    content = [w for w in text if w.lower() not in stops]
    return len(content) / len(text)

def plot_word_freq(text):
    text_vocab = [w.lower() for w in text if w.isalpha()]
    fdist = nltk.FreqDist(text_vocab)
    fdist.plot()

def long_words(text, length=10):
    text_vocab = [w.lower() for w in text if w.isalpha()]
    return set(w for w in text_vocab if len(w) > length)

def topic_words(text, length=7, freq=7):
    text_vocab = [w.lower() for w in text if w.isalpha()]
    fdist = nltk.FreqDist(text_vocab)
    return sorted([w for w in set(text_vocab) if len(w) > length and fdist[w] > freq])

def vocab_size(text):
    return len(set(text))

def vocab_richness(text):
    return len(text) / vocab_size(text)

def word_context(text, word):
    return text.concordance(word)
# END - Implemented from http://www.slideshare.net/pbpimpale/natural-language-toolkit-nltk-basics
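
# A minimal interactive sketch of the helpers above (assumes the Gutenberg
# corpus is also downloaded; 'austen-emma.txt' is just one of its file ids):
#
#   emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
#   print(content_fraction(emma))   # fraction of tokens that are not stopwords
#   print(topic_words(emma))        # longer words that appear frequently
#   print(vocab_richness(emma))     # average number of uses per distinct token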

def get_raw(fname):
    data = ""
    with open(fname) as f:
        data = f.read()
    return data

def massage_raw(raw):
    # Drop any non-ASCII characters before tokenizing
    modified = ''.join([character for character in raw if ord(character) < 128])
    sentences = nltk.sent_tokenize(modified)
    words = nltk.word_tokenize(modified)

    tokens = []
    stops = [unicode(word) for word in stopwords.words('english')] + [',', '.', '?', '!', ':', ';', '-', ')', '(']
    for w in words:
        if w not in stops:
            tokens.append(w)

    return (nltk.Text(tokens), sentences)
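
# get_raw() and massage_raw() form the input pipeline used in __main__ below:
# massage_raw() returns a 2-tuple of (nltk.Text of non-stopword tokens, list of
# sentence strings). A rough usage sketch, assuming a local file "sample.txt":
#
#   text, sentences = massage_raw(get_raw("sample.txt"))
#   print(len(sentences), len(text))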

def nsyl(word):
    # Count the phones in the first cmudict pronunciation that end in a
    # stress digit (these are exactly the vowel phones)
    return len([i for i in DICT[word.lower()][0] if i[-1].isdigit()])
    # return [len(list(y for y in x if y[-1].isdigit())) for x in DICT[word.lower()]][0]
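
# For reference: each cmudict entry is a list of pronunciations, and each
# pronunciation is a list of ARPAbet phones; vowel phones end in a stress digit
# (0, 1, or 2), which is what nsyl() counts. Roughly:
#
#   DICT['natural'][0]   # something like ['N', 'AE1', 'CH', 'ER0', 'AH0', 'L']
#   nsyl('natural')      # 3, one per digit-terminated (vowel) phone
#
# nsyl() raises KeyError for words missing from cmudict; flesch_kincaid()
# catches that and falls back to count_syllables() below.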

# http://stackoverflow.com/a/5615724 translated to python
def count_syllables(word):
    # Special Cases
    if word in ['ll', 'noye', 'shae']:
        return 1

    # Back to Our Regular Scheduled Programming
    vowels = ['a', 'e', 'i', 'o', 'u', 'y']
    curword = word
    syls = 0
    lastWasVowel = False

    for wc in curword:
        foundVowel = False
        for v in vowels:
            # Don't Count Diphthongs
            if v == wc and lastWasVowel:
                foundVowel = True
                lastWasVowel = True
                break
            elif v == wc and not lastWasVowel:
                syls += 1
                foundVowel = True
                lastWasVowel = True
                break

        # If we cycled through every vowel without a match, reset lastWasVowel
        if not foundVowel:
            lastWasVowel = False

    # Remove a trailing "es" or "e"; it's usually silent
    if len(curword) > 2 and curword[-2:] == "es":
        syls -= 1
    elif len(curword) > 1 and curword[-1] == "e":
        syls -= 1

    return syls
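
# A quick sanity check of the heuristic (values worked out by hand from the
# rules above, not measured):
#
#   count_syllables('banana')    # 3, one per non-adjacent vowel
#   count_syllables('reading')   # 2, 'ea' is treated as a single vowel group
#   count_syllables('table')     # 1, the trailing 'e' is subtracted, so it undercounts
#
# Words for which the heuristic returns 0 (and that are also missing from
# cmudict) end up in the 'misses' list in flesch_kincaid() and are excluded
# from the totals.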

# Modified form of https://gist.github.com/drinks/2483508
def flesch_kincaid(text, sentences):
    syllables = []
    misses = []
    words = [word for word in text if (len(word) > 1) or (word.lower() in ['a', 'i'])]

    for word in words:
        try:
            ns = nsyl(word)
            syllables.append(ns)
        except KeyError:
            # Word isn't in cmudict; fall back to the heuristic counter
            n = count_syllables(word.lower())
            if n == 0:
                misses.append(word.lower())
            else:
                syllables.append(n)

    word_count = len(words) - len(misses)
    sentence_count = len(sentences)
    syllable_count = sum(syllables)

    #m_dist = nltk.FreqDist(misses)
    #for t in m_dist.keys():
    #    print m_dist[t], t, count_syllables(t)
    #for m in set(misses):
    #    print "%s %d" % (m, m_dist[m])

    if word_count > 0 and sentence_count > 0:
        # Compute the ratios only when the counts are non-zero, to avoid
        # dividing by zero on empty input
        words_sents = word_count / sentence_count
        syl_words = syllable_count / word_count

        results = {
            'words': word_count,
            'syllables': syllable_count,
            'missed_count': len(misses),
            'missed_pct': len(misses) / (word_count + len(misses)),
            'sentences': sentence_count,
            'grade_level': (0.39 * words_sents) + (11.8 * syl_words) - 15.59,
            'reading_ease': 206.835 - (1.015 * words_sents) - (84.6 * syl_words),
        }
        return results
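
# For reference, the constants above are the standard Flesch-Kincaid grade level
# and Flesch reading-ease coefficients:
#
#   grade_level  = 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59
#   reading_ease = 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)
#
# Higher reading_ease means easier text (roughly 90-100 is very easy, below 30
# is very difficult); grade_level approximates a U.S. school grade.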

# From: http://engineroom.trackmaven.com/blog/monthly-challenge-natural-language-processing/
def top10_bigrams(words):
    bigram_measure = nltk.collocations.BigramAssocMeasures()
    bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(words)

    # Keep only bigrams that occur at least 20 times; otherwise processing takes too long
    bigram_finder.apply_freq_filter(20)
    for bigram in bigram_finder.score_ngrams(bigram_measure.raw_freq)[:10]:
        print(bigram)

# Modified the above to print trigrams, keeping only trigrams that occur at least 10 times
def top10_trigrams(words):
    trigram_measure = nltk.collocations.TrigramAssocMeasures()
    trigram_finder = nltk.collocations.TrigramCollocationFinder.from_words(words)

    # Keep only trigrams that occur at least 10 times, and score them with the pmi metric
    # http://www.nltk.org/api/nltk.metrics.html#nltk.metrics.association.NgramAssocMeasures.pmi
    trigram_finder.apply_freq_filter(10)
    for trigram in trigram_finder.score_ngrams(trigram_measure.pmi)[:10]:
        print(trigram)
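
# In both functions, score_ngrams() returns (ngram, score) pairs sorted from
# highest to lowest score, so the loops print the ten strongest collocations
# under the chosen measure: raw frequency for the bigrams, PMI for the trigrams.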

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: %s <text_file>" % (sys.argv[0]))
        sys.exit(0)

    (text, sentences) = massage_raw(get_raw(sys.argv[1]))
    pprint(flesch_kincaid(text, sentences))

    print("\nBigrams\n====================")
    top10_bigrams(text)

    print("\nTrigrams\n====================")
    top10_trigrams(text)