#!/usr/bin/env python
"""Print the most frequent long words found in a debate transcript.

The transcript is read line by line.  A leading token that ends with a
colon is treated as a speaker identifier and is not counted.  Lines that
start with a moderator's identifier are skipped altogether.  Words are
whitespace-separated tokens (punctuation is kept), compared
case-insensitively, and only counted when longer than ``limit`` characters.
"""
from operator import itemgetter

# Limit the number of letters: only words longer than this are counted.
limit = 4

# Alternatively, select words to exclude: pass this list as ``exclude``
# to count_words().
removethese = ["the", "and", "i", "a", "is", "of", "in", "to", "we", "he",
               "with", "--", "at", "are", "you", "it", "but", "for"]

# Speaker identifiers whose lines are ignored (the moderators' questions).
moderators = ("LEHRER:", "IFILL:")


def count_words(lines, limit=limit, exclude=(), skip_speakers=moderators):
    """Return a dict mapping each lower-cased word to its number of occurrences.

    Args:
        lines: iterable of text lines (an open file works).
        limit: a word is counted only if it has more than ``limit``
            characters.
        exclude: lower-case words that must never be counted.
        skip_speakers: leading tokens that cause the whole line to be
            ignored.

    Returns:
        A ``{word: count}`` dictionary.
    """
    excluded = set(exclude)
    counts = {}
    for line in lines:
        tokens = line.split()
        if not tokens:
            continue
        first = tokens[0]
        # The moderator check has to come before the generic speaker check:
        # moderator identifiers end with ":" too, so testing them second
        # would never match and their questions would be counted.
        # Only lines that start with the identifier are skipped.
        if first in skip_speakers:
            continue
        # Ignore the speaker identifier itself.
        if first.endswith(":"):
            tokens = tokens[1:]
        for token in tokens:
            if len(token) <= limit:
                continue
            word = token.lower()
            if word not in excluded:
                counts[word] = counts.get(word, 0) + 1
    return counts


def top_words(counts, n=30):
    """Return the ``n`` most frequent ``(word, count)`` pairs, highest first."""
    ranked = sorted(counts.items(), key=itemgetter(1), reverse=True)
    return ranked[:n]


def main(path='obamamccain.txt'):
    """Count the words in the transcript at ``path`` and print the top 30."""
    # The context manager closes the file even if counting raises.
    with open(path) as transcript:
        counts = count_words(transcript)
    # To remove particular words, use instead:
    #     counts = count_words(transcript, exclude=removethese)
    for key, value in top_words(counts):
        # A single parenthesised argument prints identically on Python 2 and 3.
        print("%s\t\t%s" % (key, value))


if __name__ == "__main__":
    main()