# count_words.py

"""
 count_words.py 

 Count word frequencies in moby_dick.txt .

 This illustrates
  (a) reading words from a file, including how to ignore punctuation,
  (b) using a python dictionary for counting things, and
  (c) a bit of somewhat tricky sorting.

 The moby_dick.txt file has been extracted from the one at 
 gutenberg.org; google things like "moby dick text" to find it.

 # --- tests ---

    # create a test word file
    >>> testfilename = '_test_words_.txt'
    >>> create_test_file(testfilename)
    
    # analyze the test file
    >>> testcount = get_word_count(filename=testfilename, verbose=False)
    >>> testcount['one'] == 4 and testcount['two'] == 2
    True
    >>> sort_by_count(testcount)
    [(4, 'one'), (2, 'two'), (1, 'three'), (1, 'four')]

    # cleanup test word file
    >>> import os; os.remove(testfilename)
    
 # --- running it (on an iMac 2014 4GHz i7 with 32GB RAM) ---

    $ python --version
    Python 3.7.3

    $ time python count_words.py 
    python count_words.py 
    Counting words in 'moby_dick.txt'...
    done.  Processed 208433 words in 22446 lines.
    Top 100 words are
        1  14065 : the                  
        2   6437 : of                   
        3   6257 : and                  
        4   4534 : a                    
        5   4489 : to                   
        6   4048 : in                   
        7   2881 : that                 
        8   2484 : his                  
        9   2318 : it                   
       10   1929 : i                    
       11   1762 : but                  
       12   1720 : he                   
       13   1698 : as                   
       14   1690 : with                 
       15   1672 : is                   
       16   1618 : was                  
       17   1568 : for                  
       18   1463 : all                  
       19   1339 : this                 
       20   1294 : at                   
       21   1169 : by                   
       22   1115 : not                  
       23   1071 : from                 
       24   1028 : on                   
       25   1023 : so                   
       26   1021 : be                   
       27   1018 : him                  
       28    874 : one                  
       29    871 : whale                
       30    846 : you                  
       31    764 : had                  
       32    751 : have                 
       33    745 : now                  
       34    738 : there                
       35    687 : or                   
       36    674 : were                 
       37    637 : they                 
       38    617 : which                
       39    604 : some                 
       40    603 : their                
       41    600 : then                 
       42    592 : me                   
       43    590 : when                 
       44    584 : my                   
       45    583 : are                  
       46    583 : an                   
       47    566 : like                 
       48    565 : no                   
       49    560 : upon                 
       50    530 : what                 
       51    516 : into                 
       52    508 : out                  
       53    496 : up                   
       54    495 : more                 
       55    468 : if                   
       56    457 : its                  
       57    447 : them                 
       58    435 : old                  
       59    426 : man                  
       60    425 : we                   
       61    421 : would                
       62    408 : been                 
       63    406 : ahab                 
       64    397 : over                 
       65    394 : ye                   
       66    392 : other                
       67    386 : these                
       68    376 : will                 
       69    371 : ship                 
       70    369 : only                 
       71    364 : such                 
       72    362 : whales               
       73    362 : though               
       74    359 : sea                  
       75    357 : down                 
       76    338 : yet                  
       77    329 : who                  
       78    321 : time                 
       79    319 : her                  
       80    318 : any                  
       81    317 : very                 
       82    313 : long                 
       83    306 : still                
       84    302 : those                
       85    300 : than                 
       86    300 : about                
       87    294 : do                   
       88    292 : captain              
       89    289 : before               
       90    288 : great                
       91    287 : has                  
       92    286 : said                 
       93    280 : seemed               
       94    279 : must                 
       95    278 : two                  
       96    276 : here                 
       97    273 : most                 
       98    272 : last                 
       99    259 : thou                 
      100    259 : head                 

    real    0m0.568s
    user    0m0.391s
    sys	    0m0.018s

 Some words that look to me like they're indicative of this novel :

    whale      29th
    old        58th
    man        59th
    ahab       63rd
    ye         65th
    ship       69th
    sea        74th
    captain    88th
    thou       99th

 This sort of information might be useful in a text genre
 identification system, for example.
 
 Jim Mahoney | cs.marlboro.college | Nov 2019 | MIT License
"""

def create_test_file(testfilename):
    """ Write a tiny two-line sample of words (with some punctuation
        mixed in) to the file named testfilename, replacing any
        existing contents.
    """
    sample_text = "one; two, ! one three\n one four one two\n"
    with open(testfilename, 'w') as handle:
        handle.write(sample_text)

def get_word_count(filename, verbose=True):
    """ Return dictionary of {word1:count1, word2:count2, ...}
        for the whitespace-separated words in the text file filename.

        Each word is converted to lowercase and has the punctuation
        characters listed in 'ignore' removed before it is counted.
        A token that is nothing but punctuation is left out of the
        dictionary, though it is still included in the word total
        that is reported when verbose.

        If verbose, also print a summary of total words and lines.
    """
    ignore = ( ',', '.', '"', "'", ';', ':', '!', '(', ')', '*', '$' )
    # One translation table deletes all the ignored characters in a
    # single pass, instead of one .replace() call per character.
    strip_punctuation = str.maketrans('', '', ''.join(ignore))
    count = {}      # {word:number} dictionary to create
    if verbose:
        print("Counting words in '{}'...".format(filename))
    (nlines, nwords) = (0, 0)     # number of lines & words
    # The 'with' block closes the file when done, even if reading fails.
    with open(filename, 'r') as textfile:
        for line in textfile:                   # Loop over lines :
            nlines = nlines + 1
            for rawword in line.split():        #   Loop over words :
                nwords = nwords + 1
                word = rawword.lower().translate(strip_punctuation)
                if word:                        #     Not empty string?
                    count[word] = count.get(word, 0) + 1
    if verbose:
        print("done.  Processed {} words in {} lines.".format(nwords, nlines))
    return count

def sort_by_count(wordcount_dict):
    """ Given a dictionary {word1:count1, word2:count2, ...},
        return a list of (count, word) pairs sorted from high to low count,
        e.g. [(10000,'the'), (5000,'a'), (200,'him'),...]
        Words with the same count come out in reverse string order.
        >>> sort_by_count({'a':3, 'b':10, 'c':5})
        [(10, 'b'), (5, 'c'), (3, 'a')]
    """
    # The idea here is to
    #  (a) swap each (word, count) pair to put the number in front, then
    #  (b) sort the swapped pairs from biggest to smallest; tuples compare
    #      by their first element, so the count decides the order and the
    #      word only breaks ties.
    swapped = ((freq, word) for (word, freq) in wordcount_dict.items())
    return sorted(swapped, reverse=True)

def print_some_pairs(pairs, n=100):
    """ Print the first n of the (int,string) pairs from [(count, word),...],
        one per line and numbered from 1, under a "Top n words" heading.
        If there are fewer than n pairs, print all of them
        rather than failing with an IndexError.
    """
    print("Top {} words are".format(n))
    # A slice stops quietly at the end of a short list; max() keeps a
    # negative n from being read as "all but the last few".
    for (rank, (freq, word)) in enumerate(pairs[:max(n, 0)], start=1):
        print(" {:4} {:6} : {:<20} ".format(rank, freq, word))

def main():
    """ Count the words in moby_dick.txt and print the most frequent ones. """
    # Same three steps, chained: count -> sort hi to low -> print the top.
    print_some_pairs(sort_by_count(get_word_count('moby_dick.txt')))

if __name__ == '__main__':
    # When run as a script (rather than imported), first run the doctests
    # found in this module, then do the actual word count.
    import doctest
    doctest.testmod()
    main()