"""This module contains code from Think Python by Allen B. Downey http://thinkpython.com Copyright 2012 Allen B. Downey License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html """ import string import random def process_file(filename, skip_header): """Makes a histogram that contains the words from a file. filename: string skip_header: boolean, whether to skip the Gutenberg header Returns: map from each word to the number of times it appears. """ hist = {} fp = file(filename) if skip_header: skip_gutenberg_header(fp) for line in fp: process_line(line, hist) return hist def skip_gutenberg_header(fp): """Reads from fp until it finds the line that ends the header. fp: open file object """ for line in fp: if line.startswith('*END*THE SMALL PRINT!'): break def process_line(line, hist): """Adds the words in the line to the histogram. Modifies hist. line: string hist: histogram (map from word to frequency) """ # replace hyphens with spaces before splitting line = line.replace('-', ' ') for word in line.split(): # remove punctuation and convert to lowercase word = word.strip(string.punctuation + string.whitespace) word = word.lower() # update the histogram hist[word] = hist.get(word, 0) + 1 def most_common(hist): """Makes a list of the key-value pairs from a histogram and sorts them in descending order by frequency.""" t = [] for key, value in hist.items(): t.append((value, key)) t.sort() t.reverse() return t def print_most_common(hist, num=10): """Prints the most commons words in a histgram and their frequencies. hist: histogram (map from word to frequency num: number of words to print """ t = most_common(hist) print 'The most common words are:' for freq, word in t[:num]: print word, '\t', freq def subtract(d1, d2): """Returns a dictionary with all keys that appear in d1 but not d2. d1, d2: dictionaries """ res = {} for key in d1: if key not in d2: res[key] = None return res def total_words(hist): """Returns the total of the frequencies in a histogram.""" return sum(hist.values()) def different_words(hist): """Returns the number of different words in a histogram.""" return len(hist) def random_word(hist): """Chooses a random word from a histogram. The probability of each word is proportional to its frequency. """ t = [] for word, freq in hist.items(): t.extend([word] * freq) return random.choice(t) if __name__ == '__main__': hist = process_file('emma.txt', skip_header=True) print 'Total number of words:', total_words(hist) print 'Number of different words:', different_words(hist) t = most_common(hist) print 'The most common words are:' for freq, word in t[0:20]: print word, '\t', freq words = process_file('words.txt', skip_header=False) diff = subtract(hist, words) print "The words in the book that aren't in the word list are:" for word in diff.keys(): print word, print "\n\nHere are some random words from the book" for i in range(100): print random_word(hist),