"""
 count_words.py

 Count word frequencies in moby_dick.txt .

 This illustrates
   (a) reading words from a file, including how to ignore punctuation,
   (b) using a python dictionary for counting things, and
   (c) a bit of somewhat tricky sorting.

 The moby_dick.txt file has been extracted from the one at gutenberg.org;
 google things like "moby dick text" to find it.

   # --- tests ---

   # create a test word file

   >>> testfilename = '_test_words_.txt'
   >>> create_test_file(testfilename)

   # analyze the test file

   >>> testcount = get_word_count(filename=testfilename, verbose=False)
   >>> testcount['one'] == 4 and testcount['two'] == 2
   True
   >>> sort_by_count(testcount)
   [(4, 'one'), (2, 'two'), (1, 'three'), (1, 'four')]

   # cleanup test word file

   >>> import os; os.remove(testfilename)

   # --- running it (on an iMac 2014 4GHz i7 with 32GB RAM) ---

   $ python --version
   Python 3.7.3

   $ time python count_words.py
   python count_words.py
   Counting words in 'moby_dick.txt'...
   done. Processed 208433 words in 22446 lines.
   Top 100 words are
       1  14065 : the
       2   6437 : of
       3   6257 : and
       4   4534 : a
       5   4489 : to
       6   4048 : in
       7   2881 : that
       8   2484 : his
       9   2318 : it
      10   1929 : i
      11   1762 : but
      12   1720 : he
      13   1698 : as
      14   1690 : with
      15   1672 : is
      16   1618 : was
      17   1568 : for
      18   1463 : all
      19   1339 : this
      20   1294 : at
      21   1169 : by
      22   1115 : not
      23   1071 : from
      24   1028 : on
      25   1023 : so
      26   1021 : be
      27   1018 : him
      28    874 : one
      29    871 : whale
      30    846 : you
      31    764 : had
      32    751 : have
      33    745 : now
      34    738 : there
      35    687 : or
      36    674 : were
      37    637 : they
      38    617 : which
      39    604 : some
      40    603 : their
      41    600 : then
      42    592 : me
      43    590 : when
      44    584 : my
      45    583 : are
      46    583 : an
      47    566 : like
      48    565 : no
      49    560 : upon
      50    530 : what
      51    516 : into
      52    508 : out
      53    496 : up
      54    495 : more
      55    468 : if
      56    457 : its
      57    447 : them
      58    435 : old
      59    426 : man
      60    425 : we
      61    421 : would
      62    408 : been
      63    406 : ahab
      64    397 : over
      65    394 : ye
      66    392 : other
      67    386 : these
      68    376 : will
      69    371 : ship
      70    369 : only
      71    364 : such
      72    362 : whales
      73    362 : though
      74    359 : sea
      75    357 : down
      76    338 : yet
      77    329 : who
      78    321 : time
      79    319 : her
      80    318 : any
      81    317 : very
      82    313 : long
      83    306 : still
      84    302 : those
      85    300 : than
      86    300 : about
      87    294 : do
      88    292 : captain
      89    289 : before
      90    288 : great
      91    287 : has
      92    286 : said
      93    280 : seemed
      94    279 : must
      95    278 : two
      96    276 : here
      97    273 : most
      98    272 : last
      99    259 : thou
     100    259 : head

   real    0m0.568s
   user    0m0.391s
   sys     0m0.018s

 Some words that look to me like they're indicative of this novel :

   whale      29th
   old        58th
   man        59th
   ahab       63rd
   ye         65th
   ship       69th
   sea        74th
   captain    88th
   thou       99th

 This sort of information might be useful in
 a text genre identification system, for example.

 Jim Mahoney | cs.marlboro.college | Nov 2019 | MIT License
"""

# Punctuation characters that are stripped out of every word before counting.
_IGNORED_CHARS = (',', '.', '"', "'", ';', ':', '!', '(', ')', '*', '$')

# Translation table that deletes all of the ignored characters in one pass;
# built once here rather than for every word.
_STRIP_PUNCTUATION = str.maketrans('', '', ''.join(_IGNORED_CHARS))


def create_test_file(testfilename):
    """ Create a short word test file named testfilename. """
    with open(testfilename, 'w') as testfile:
        testfile.write("one; two, ! one three\n one four one two\n")


def get_word_count(filename, verbose=True):
    """ Return dictionary of {word1:count1, word2:count2, ...}
        for the whitespace-separated words in the given text file.

        Words are lowercased and have the punctuation characters in
        _IGNORED_CHARS removed before being counted; a token that is
        nothing but punctuation therefore doesn't get a dictionary entry.

        If verbose, also print a summary of total words and lines.
        That word total counts every whitespace-separated token,
        including any that turn out to be only punctuation.
    """
    count = {}                            # {word:number} dictionary to create
    if verbose:
        print("Counting words in '{}'...".format(filename))
    (nlines, nwords) = (0, 0)             # number of lines & words
    # The "with" block closes the file even if something goes wrong.
    with open(filename, 'r') as textfile:
        for line in textfile:             # Loop over lines :
            nlines += 1
            tokens = line.split()         # Split the line into words.
            nwords += len(tokens)
            for token in tokens:          # Loop over words :
                # Convert to lowercase and remove punctuation.
                word = token.lower().translate(_STRIP_PUNCTUATION)
                if word:                  # Not empty string?
                    count[word] = count.get(word, 0) + 1
    if verbose:
        print("done. Processed {} words in {} lines.".format(nwords, nlines))
    return count


def sort_by_count(wordcount_dict):
    """ Given a dictionary {word1:count1, word2:count2, ...},
        return a list of (count, word) pairs sorted hi to low by count,
        e.g. [(10000,'the'), (5000,'a'), (200,'him'),...]
        Words with the same count are in reverse alphabetical order.

        >>> sort_by_count({'a':3, 'b':10, 'c':5})
        [(10, 'b'), (5, 'c'), (3, 'a')]
    """
    # The idea here is to
    #  (a) swap the pairs to put the numbers in front, then
    #  (b) sort the pairs, using those numbers, biggest first.
    # Tuples compare by first entry and then by second, which is
    # why ties in the count are broken by the word itself.
    pairs = [(freq, word) for (word, freq) in wordcount_dict.items()]
    return sorted(pairs, reverse=True)    # e.g. [(14065, 'the'), ...]


def print_some_pairs(pairs, n=100):
    """ Print the first n of the (int,string) pairs from [(count, word),...]
        If there are fewer than n pairs, print all of them.
    """
    print("Top {} words are".format(n))
    # Slicing (rather than indexing up to n) avoids an IndexError
    # when the text has fewer than n distinct words.
    for (rank, (freq, word)) in enumerate(pairs[:n], start=1):
        print(" {:4} {:6} : {:<20} ".format(rank, freq, word))


def main():
    """ Count the words in moby_dick.txt and print the most common ones. """
    wordcount = get_word_count('moby_dick.txt')
    pairs_hi_to_low = sort_by_count(wordcount)
    print_some_pairs(pairs_hi_to_low)


if __name__ == '__main__':
    import doctest
    doctest.testmod()
    main()