#!/usr/bin/python2

# Randy Armknecht
# 19 Feb 2014
#
# Playing around with the Natural Language Processing Toolkit (nltk)
# http://www.nltk.org/
#
from __future__ import division
import sys
import nltk
from nltk.corpus import cmudict
from nltk.corpus import stopwords
from pprint import pprint
from hyphen import Hyphenator as hy   # imported but not used anywhere below

DICT = cmudict.dict()
SYLLABLE_AVG = 1.66   # defined but not used anywhere below
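
# Note: the corpora and tokenizer models used below are downloaded separately
# from nltk itself; a one-time setup along these lines is assumed:
#
#   import nltk
#   nltk.download('cmudict')     # for cmudict.dict()
#   nltk.download('stopwords')   # for stopwords.words('english')
#   nltk.download('words')       # for nltk.corpus.words in unusual_words()
#   nltk.download('punkt')       # for sent_tokenize / word_tokenize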
# START - Implemented from http://www.slideshare.net/pbpimpale/natural-language-toolkit-nltk-basics
def unusual_words(text):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())

    unusual = text_vocab.difference(english_vocab)
    return sorted(unusual)

def problem_words(text):
    return sorted(set(w.lower() for w in text if not w.isalpha()))

def content_fraction(text):
    stops = nltk.corpus.stopwords.words('english')
    content = [w for w in text if w.lower() not in stops]
    return len(content) / len(text)

def plot_word_freq(text):
    text_vocab = [w.lower() for w in text if w.isalpha()]
    fdist = nltk.FreqDist(text_vocab)
    fdist.plot()

def long_words(text, length=10):
    text_vocab = [w.lower() for w in text if w.isalpha()]
    return set(w for w in text_vocab if len(w) > length)

def topic_words(text, length=7, freq=7):
    text_vocab = [w.lower() for w in text if w.isalpha()]
    fdist = nltk.FreqDist(text_vocab)
    return sorted([w for w in set(text_vocab) if len(w) > length and fdist[w] > freq])

def vocab_size(text):
    return len(set(text))

def vocab_richness(text):
    return len(text) / vocab_size(text)

def word_context(text, word):
    return text.concordance(word)
# END - Implemented from http://www.slideshare.net/pbpimpale/natural-language-toolkit-nltk-basics
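
# A minimal interactive sketch of the helpers above (assumes the Gutenberg
# corpus is also downloaded; 'austen-emma.txt' is just one of its file ids):
#
#   emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
#   print(content_fraction(emma))   # fraction of tokens that are not stopwords
#   print(topic_words(emma))        # longer words that appear frequently
#   print(vocab_richness(emma))     # average number of uses per distinct token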

def get_raw(fname):
    data = ""
    with open(fname) as f:
        data = f.read()
    return data

def massage_raw(raw):
    # Drop any non-ASCII characters before tokenizing
    modified = ''.join([character for character in raw if ord(character) < 128])
    sentences = nltk.sent_tokenize(modified)
    words = nltk.word_tokenize(modified)

    tokens = []
    stops = [unicode(word) for word in stopwords.words('english')] + [',', '.', '?', '!', ':', ';', '-', ')', '(']
    for w in words:
        if w not in stops:
            tokens.append(w)

    return (nltk.Text(tokens), sentences)
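
# get_raw() and massage_raw() form the input pipeline used in __main__ below:
# massage_raw() returns a 2-tuple of (nltk.Text of non-stopword tokens, list of
# sentence strings). A rough usage sketch, assuming a local file "sample.txt":
#
#   text, sentences = massage_raw(get_raw("sample.txt"))
#   print(len(sentences), len(text))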

def nsyl(word):
    # Count the phones in the first cmudict pronunciation that end in a
    # stress digit (these are exactly the vowel phones)
    return len([i for i in DICT[word.lower()][0] if i[-1].isdigit()])
    # return [len(list(y for y in x if y[-1].isdigit())) for x in DICT[word.lower()]][0]
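
# For reference: each cmudict entry is a list of pronunciations, and each
# pronunciation is a list of ARPAbet phones; vowel phones end in a stress digit
# (0, 1, or 2), which is what nsyl() counts. Roughly:
#
#   DICT['natural'][0]   # something like ['N', 'AE1', 'CH', 'ER0', 'AH0', 'L']
#   nsyl('natural')      # 3, one per digit-terminated (vowel) phone
#
# nsyl() raises KeyError for words missing from cmudict; flesch_kincaid()
# catches that and falls back to count_syllables() below.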

# http://stackoverflow.com/a/5615724 translated to python
def count_syllables(word):
    # Special Cases
    if word in ['ll', 'noye', 'shae']:
        return 1

    # Back to Our Regular Scheduled Programming
    vowels = ['a', 'e', 'i', 'o', 'u', 'y']
    curword = word
    syls = 0
    lastWasVowel = False

    for wc in curword:
        foundVowel = False
        for v in vowels:
            # Don't Count Diphthongs
            if v == wc and lastWasVowel:
                foundVowel = True
                lastWasVowel = True
                break
            elif v == wc and not lastWasVowel:
                syls += 1
                foundVowel = True
                lastWasVowel = True
                break

        # If we cycled through every vowel without a match, reset lastWasVowel
        if not foundVowel:
            lastWasVowel = False

    # Remove a trailing "es" or "e"; it's usually silent
    if len(curword) > 2 and curword[-2:] == "es":
        syls -= 1
    elif len(curword) > 1 and curword[-1] == "e":
        syls -= 1

    return syls
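
# A quick sanity check of the heuristic (values worked out by hand from the
# rules above, not measured):
#
#   count_syllables('banana')    # 3, one per non-adjacent vowel
#   count_syllables('reading')   # 2, 'ea' is treated as a single vowel group
#   count_syllables('table')     # 1, the trailing 'e' is subtracted, so it undercounts
#
# Words for which the heuristic returns 0 (and that are also missing from
# cmudict) end up in the 'misses' list in flesch_kincaid() and are excluded
# from the totals.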

# Modified form of https://gist.github.com/drinks/2483508
def flesch_kincaid(text, sentences):
    syllables = []
    misses = []
    words = [word for word in text if (len(word) > 1) or (word.lower() in ['a', 'i'])]

    for word in words:
        try:
            ns = nsyl(word)
            syllables.append(ns)
        except KeyError:
            # Word isn't in cmudict; fall back to the heuristic counter
            n = count_syllables(word.lower())
            if n == 0:
                misses.append(word.lower())
            else:
                syllables.append(n)

    word_count = len(words) - len(misses)
    sentence_count = len(sentences)
    syllable_count = sum(syllables)

    #m_dist = nltk.FreqDist(misses)
    #for t in m_dist.keys():
    #    print m_dist[t], t, count_syllables(t)
    #for m in set(misses):
    #    print "%s %d" % (m, m_dist[m])

    if word_count > 0 and sentence_count > 0:
        # Compute the ratios only when the counts are non-zero, to avoid
        # dividing by zero on empty input
        words_sents = word_count / sentence_count
        syl_words = syllable_count / word_count

        results = {
            'words': word_count,
            'syllables': syllable_count,
            'missed_count': len(misses),
            'missed_pct': len(misses) / (word_count + len(misses)),
            'sentences': sentence_count,
            'grade_level': (0.39 * words_sents) + (11.8 * syl_words) - 15.59,
            'reading_ease': 206.835 - (1.015 * words_sents) - (84.6 * syl_words),
        }
        return results
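
# For reference, the constants above are the standard Flesch-Kincaid grade level
# and Flesch reading-ease coefficients:
#
#   grade_level  = 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59
#   reading_ease = 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)
#
# Higher reading_ease means easier text (roughly 90-100 is very easy, below 30
# is very difficult); grade_level approximates a U.S. school grade.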

# From: http://engineroom.trackmaven.com/blog/monthly-challenge-natural-language-processing/
def top10_bigrams(words):
    bigram_measure = nltk.collocations.BigramAssocMeasures()
    bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(words)

    # Keep only bigrams that occur at least 20 times; otherwise processing takes too long
    bigram_finder.apply_freq_filter(20)
    for bigram in bigram_finder.score_ngrams(bigram_measure.raw_freq)[:10]:
        print(bigram)

# Modified the above to print trigrams, keeping only trigrams that occur at least 10 times
def top10_trigrams(words):
    trigram_measure = nltk.collocations.TrigramAssocMeasures()
    trigram_finder = nltk.collocations.TrigramCollocationFinder.from_words(words)

    # Keep only trigrams that occur at least 10 times, and score them with the pmi metric
    # http://www.nltk.org/api/nltk.metrics.html#nltk.metrics.association.NgramAssocMeasures.pmi
    trigram_finder.apply_freq_filter(10)
    for trigram in trigram_finder.score_ngrams(trigram_measure.pmi)[:10]:
        print(trigram)
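
# In both functions, score_ngrams() returns (ngram, score) pairs sorted from
# highest to lowest score, so the loops print the ten strongest collocations
# under the chosen measure: raw frequency for the bigrams, PMI for the trigrams.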

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: %s <text_file>" % (sys.argv[0]))
        sys.exit(0)

    (text, sentences) = massage_raw(get_raw(sys.argv[1]))
    pprint(flesch_kincaid(text, sentences))

    print("\nBigrams\n====================")
    top10_bigrams(text)

    print("\nTrigrams\n====================")
    top10_trigrams(text)