"""
 analyze.py

 Analyze the counts of words in a file (by default 'hound.txt')
 using a Python dictionary ... and a few other tricks.

        $ python analyze.py 
        Analyzing words in 'hound.txt'.
        Number words is 59926.
        First five are ['the', 'hound', 'of', 'the', 'baskervilles'].
        Number of different ones is 6889.
        Here are the top 100 : 
             i             word     count 
          ----   --------------  -------- 
             1              the      3343 
             2              and      1620 
             3               of      1595 
             4                i      1488 
             5               to      1406 
             6                a      1306 
             7             that      1127 
             8               it       952 
             9               in       906 
            10               he       894 
            11              was       792 
            12              you       779 
            13              his       691 
            14               is       608 
            15             have       538 
            16              had       502 
            17               my       479 
            18               we       466 
            19             with       440 
            20            which       415 
            21              for       414 
            22               as       405 
            23              but       388 
            24              not       368 
            25               at       363 
            26               be       324 
            27             this       323 
            28             upon       315 
            29            there       297 
            30              sir       291 
            31             from       286 
            32              him       276 
            33               me       269 
            34              one       236 
            35               on       233 
            36               so       231 
            37             said       229 
            38                s       229 
            39             been       219 
            40               by       211 
            41              all       210 
            42             what       206 
            43              our       205 
            44               if       205 
            45             were       204 
            46               an       201 
            47               no       200 
            48            could       200 
            49              are       196 
            50                -       192 
            51            would       191 
            52             very       190 
            53              man       183 
            54             your       177 
            55             will       174 
            56               do       168 
            57              has       166 
            58               us       159 
            59              out       157 
            60              her       151 
            61              who       148 
            62             when       143 
            63           holmes       143 
            64             some       140 
            65           should       139 
            66             more       134 
            67              she       133 
            68            henry       128 
            69             then       123 
            70               or       123 
            71             moor       122 
            72               up       116 
            73             over       112 
            74              did       111 
            75             know       110 
            76             into       110 
            77               dr       109 
            78             they       108 
            79              can       108 
            80             down       106 
            81              see       104 
            82            about       103 
            83              may       101 
            84             only        97 
            85              how        96 
            86      baskerville        93 
            87              any        93 
            88             than        88 
            89             must        87 
            90           before        87 
            91              own        85 
            92            might        85 
            93               am        84 
            94          charles        83 
            95            where        81 
            96            think        80 
            97             them        79 
            98              two        76 
            99             come        76 
           100        stapleton        75 
 
 Jim Mahoney | cs.bennington.college | MIT License | March 2021 
"""

# Characters that line_to_words() turns into spaces before splitting a line.
# The name is a misspelling of "punctuation"; it is kept as-is because
# line_to_words() refers to it by this name.
# NOTE(review): other marks (e.g. ',' '!' ':' '-') are not listed, so they
# are not replaced -- confirm that is intended.
puncuation = list("\";.?'")


def line_to_words(line):
    """ Split one line of text into a list of lowercase words.

    Each character listed in the module-level `puncuation` is replaced
    by a space, then the line is split on whitespace.
    """
    cleaned = line
    for mark in puncuation:
        cleaned = cleaned.replace(mark, ' ')
    # split() with no arguments ignores leading/trailing whitespace and
    # never produces empty strings, so no extra stripping is required.
    return [token.lower() for token in cleaned.split()]

def all_words(filename):
    """ Read the named file and return every word in it, in order,
        as a single list (one line_to_words() call per line). """
    with open(filename) as handle:
        return [token
                for text_line in handle
                for token in line_to_words(text_line)]

def count_words(words):
    """ Tally occurrences: [word0, word1, ...] -> {word: count}.

    Keys appear in the order each word is first seen.
    """
    tally = {}
    for token in words:
        if token in tally:
            tally[token] += 1
        else:
            tally[token] = 1
    return tally

def flip(pairs):
    """ Swap the two members of each pair:
        [(a0,b0), (a1,b1), ...] becomes [(b0,a0), (b1,a1), ...] """
    swapped = []
    for item in pairs:
        first, second = item[0], item[1]
        swapped.append((second, first))
    return swapped

def sort_counts(counts):
    """ Given {word:count}, return [(word0,count0), ...] ordered from the
        highest count to the lowest; equal counts are ordered by word,
        also descending. """
    # Sorting on (count, word) gives the same ordering as sorting the
    # flipped (count, word) tuples and flipping them back afterwards.
    return sorted(counts.items(),
                  key=lambda entry: (entry[1], entry[0]),
                  reverse=True)

def print_words(word_count, limit=100):
    """ Print a table of words and their counts.

    word_count : list of (word, count) pairs, already in display order.
    limit      : maximum number of rows to print (default 100);
                 expected to be non-negative.

    If word_count has fewer than `limit` entries they are all printed,
    rather than raising IndexError.
    """
    print(f"  {'i':>4}   {'word':>14}  {'count':>8} ")
    print(f"  {'-'*4:>4}   {'-'*14:>14}  {'-'*8:>8} ")
    # A slice never runs past the end of the list, so short inputs are safe.
    for rank, (word, count) in enumerate(word_count[:limit], start=1):
        print(f"  {rank:>4}   {word:>14}  {count:>8} ")

def main(filename='hound.txt'):
    """ Print a word-frequency report for a text file.

    filename : path of the file to analyze (default 'hound.txt').

    Prints the total number of words, the first five words, the number
    of distinct words, and then the table from print_words().
    """
    print(f"Analyzing words in '{filename}'.")
    words = all_words(filename)
    counts = count_words(words)
    print(f"Number words is {len(words)}.")
    print(f"First five are {words[:5]}.")
    print(f"Number of different ones is {len(counts)}.")
    print("Here are the top 100 : ")
    print_words(sort_counts(counts))

# Run the report only when this file is executed as a script, so that
# importing the module (e.g. to reuse its functions) has no side effects.
if __name__ == "__main__":
    main()