"""This module contains code from Think Python by Allen B. Downey http://thinkpython.com Copyright 2012 Allen B. Downey License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html """ import string import random from bisect import bisect from analyze_book import * def random_word(hist): """Chooses a random word from a histogram. The probability of each word is proportional to its frequency. This could be made faster by computing the cumulative frequencies once and reusing them. """ words = [] freqs = [] total_freq = 0 # make a list of words and a list of cumulative frequencies for word, freq in hist.items(): total_freq += freq words.append(word) freqs.append(total_freq) # choose a random value and find its location in the cumulative list x = random.randint(0, total_freq-1) index = bisect(freqs, x) return words[index] if __name__ == '__main__': hist = process_file('emma.txt', skip_header=True) print 'Total number of words:', total_words(hist) print 'Number of different words:', different_words(hist) t = most_common(hist) print 'The most common words are:' for freq, word in t[0:20]: print word, '\t', freq words = process_file('words.txt', skip_header=False) diff = subtract(hist, words) print "The words in the book that aren't in the word list are:" for word in diff: print word, print "\n\nHere are some random words from the book" for i in range(100): print random_word(hist),