#!/usr/bin/python2
# Randy Armknecht
# 19 Feb 2014
#
# Playing around with the Natural Language Processing Toolkit (nltk)
# http://www.nltk.org/
#
from __future__ import division

import sys
import nltk
from nltk.corpus import cmudict
from nltk.corpus import stopwords
from pprint import pprint
from hyphen import Hyphenator as hy  # imported for experimentation; not used below

DICT = cmudict.dict()
SYLLABLE_AVG = 1.66


# START - Implemented from http://www.slideshare.net/pbpimpale/natural-language-toolkit-nltk-basics
def unusual_words(text):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab.difference(english_vocab)
    return sorted(unusual)


def problem_words(text):
    return sorted(set(w.lower() for w in text if not w.isalpha()))


def content_fraction(text):
    stops = nltk.corpus.stopwords.words('english')  # avoid shadowing the imported stopwords module
    content = [w for w in text if w.lower() not in stops]
    return len(content) / len(text)


def plot_word_freq(text):
    text_vocab = [w.lower() for w in text if w.isalpha()]
    fdist = nltk.FreqDist(text_vocab)
    fdist.plot()


def long_words(text, length=10):
    text_vocab = [w.lower() for w in text if w.isalpha()]
    return set([w for w in text_vocab if len(w) > length])


def topic_words(text, length=7, freq=7):
    text_vocab = [w.lower() for w in text if w.isalpha()]
    fdist = nltk.FreqDist(text_vocab)
    return sorted([w for w in set(text_vocab) if len(w) > length and fdist[w] > freq])


def vocab_size(text):
    return len(set(text))


def vocab_richness(text):
    return len(text) / vocab_size(text)


def word_context(text, word):
    return text.concordance(word)
# END - Implemented from http://www.slideshare.net/pbpimpale/natural-language-toolkit-nltk-basics


def get_raw(fname):
    data = ""
    with open(fname) as f:
        data = f.read()
    return data


def massage_raw(raw):
    # Strip non-ASCII characters, then tokenize into sentences and words
    modified = ''.join([character for character in raw if ord(character) < 128])
    sentences = nltk.sent_tokenize(modified)
    words = nltk.word_tokenize(modified)

    # Drop stopwords and common punctuation tokens
    tokens = []
    stops = [unicode(word) for word in stopwords.words('english')] + [',', '.', '?', '!', ':', ';', '-', ')', '(']
    for w in words:
        if w not in stops:
            tokens.append(w)

    return (nltk.Text(tokens), sentences)


def nsyl(word):
    # Count the vowel phones (they end in a stress digit) in the first CMU dict pronunciation
    return len([i for i in DICT[word.lower()][0] if i[-1].isdigit()])
    # return [len(list(y for y in x if y[-1].isdigit())) for x in DICT[word.lower()]][0]


# http://stackoverflow.com/a/5615724 translated to python
def count_syllables(word):
    # Special Cases
    if word in ['ll', 'noye', 'shae']:
        return 1

    # Back to Our Regularly Scheduled Programming
    vowels = ['a', 'e', 'i', 'o', 'u', 'y']
    curword = word
    syls = 0
    lastWasVowel = False
    for wc in curword:
        foundVowel = False
        for v in vowels:
            # Don't Count Diphthongs
            if v == wc and lastWasVowel:
                foundVowel = True
                lastWasVowel = True
                break
            elif v == wc and not lastWasVowel:
                syls += 1
                foundVowel = True
                lastWasVowel = True
                break

        # If we cycled through every vowel without a match, reset lastWasVowel
        if not foundVowel:
            lastWasVowel = False

    # Remove trailing "es"/"e"; it's usually silent
    if len(curword) > 2 and curword[-2:] == "es":
        syls -= 1
    elif len(curword) > 1 and curword[-1] == "e":
        syls -= 1

    return syls


# Modified form of https://gist.github.com/drinks/2483508
def flesch_kincaid(text, sentences):
    syllables = []
    misses = []
    words = [word for word in text if (len(word) > 1) or (word.lower() in ['a', 'i'])]
    for word in words:
        try:
            ns = nsyl(word)
            syllables.append(ns)
        except KeyError:
            # Word not in the CMU dictionary; fall back to the heuristic counter
            n = count_syllables(word.lower())
            if n == 0:
                misses.append(word.lower())
            else:
                syllables.append(n)

    word_count = len(words) - len(misses)
    sentence_count = len(sentences)
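    # The counts gathered above feed the two standard Flesch-Kincaid formulas
    # computed below, both built from words-per-sentence and syllables-per-word:
    #   grade level  = 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59
    #   reading ease = 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)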
    syllable_count = sum(syllables)

    #m_dist = nltk.FreqDist(misses)
    #for t in m_dist.keys():
    #    print m_dist[t], t, count_syllables(t)
    #for m in set(misses):
    #    print "%s %d" % (m, m_dist[m])

    if word_count > 0 and sentence_count > 0:
        words_sents = word_count / sentence_count
        syl_words = syllable_count / word_count
        results = {
            'words': word_count,
            'syllables': syllable_count,
            'missed_count': len(misses),
            'missed_pct': len(misses) / (word_count + len(misses)),
            'sentences': sentence_count,
            'grade_level': (0.39 * words_sents) + (11.8 * syl_words) - 15.59,
            'reading_ease': 206.835 - (1.015 * words_sents) - (84.6 * syl_words),
        }
        return results


# From: http://engineroom.trackmaven.com/blog/monthly-challenge-natural-language-processing/
def top10_bigrams(words):
    bigram_measure = nltk.collocations.BigramAssocMeasures()
    bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(words)

    # Only keep bigrams that occur at least 20 times; otherwise processing takes too long
    bigram_finder.apply_freq_filter(20)

    for bigram in bigram_finder.score_ngrams(bigram_measure.raw_freq)[:10]:
        print(bigram)


# Modified the above to print trigrams, keeping only trigrams that occur at least 10 times
def top10_trigrams(words):
    trigram_measure = nltk.collocations.TrigramAssocMeasures()
    trigram_finder = nltk.collocations.TrigramCollocationFinder.from_words(words)

    # Filter out trigrams seen fewer than 10 times, then score the rest with the pmi metric
    # http://www.nltk.org/api/nltk.metrics.html#nltk.metrics.association.NgramAssocMeasures.pmi
    trigram_finder.apply_freq_filter(10)

    for trigram in trigram_finder.score_ngrams(trigram_measure.pmi)[:10]:
        print(trigram)


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: %s <textfile>" % (sys.argv[0]))
        sys.exit(0)

    (text, sentences) = massage_raw(get_raw(sys.argv[1]))
    pprint(flesch_kincaid(text, sentences))

    print("\nBigrams\n====================")
    top10_bigrams(text)

    print("\nTrigrams\n====================")
    top10_trigrams(text)
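
# Example invocation (file names are hypothetical; point the script at any plain-text file):
#   $ python2 readability.py corpus.txt
# This prints the Flesch-Kincaid stats dict, then the top bigrams and trigrams.
# It assumes the NLTK data used above (cmudict, stopwords, words, punkt) has already
# been fetched via nltk.download().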