"""
analyze.py

Analyze the counts of words in a file (by default 'hound.txt')
using a python dictionary ... and a few other tricks.

A different file may be named as the first command-line argument.

    $ python analyze.py
    Analyzing words in 'hound.txt'.
    Number words is 59926.
    First five are ['the', 'hound', 'of', 'the', 'baskervilles'].
    Number of different ones is 6889.
    Here are the top 100 :
        i           word    count
     ---- -------------- --------
        1            the     3343
        2            and     1620
        3             of     1595
        4              i     1488
        5             to     1406
        6              a     1306
        7           that     1127
        8             it      952
        9             in      906
       10             he      894
       11            was      792
       12            you      779
       13            his      691
       14             is      608
       15           have      538
       16            had      502
       17             my      479
       18             we      466
       19           with      440
       20          which      415
       21            for      414
       22             as      405
       23            but      388
       24            not      368
       25             at      363
       26             be      324
       27           this      323
       28           upon      315
       29          there      297
       30            sir      291
       31           from      286
       32            him      276
       33             me      269
       34            one      236
       35             on      233
       36             so      231
       37           said      229
       38              s      229
       39           been      219
       40             by      211
       41            all      210
       42           what      206
       43            our      205
       44             if      205
       45           were      204
       46             an      201
       47             no      200
       48          could      200
       49            are      196
       50              -      192
       51          would      191
       52           very      190
       53            man      183
       54           your      177
       55           will      174
       56             do      168
       57            has      166
       58             us      159
       59            out      157
       60            her      151
       61            who      148
       62           when      143
       63         holmes      143
       64           some      140
       65         should      139
       66           more      134
       67            she      133
       68          henry      128
       69           then      123
       70             or      123
       71           moor      122
       72             up      116
       73           over      112
       74            did      111
       75           know      110
       76           into      110
       77             dr      109
       78           they      108
       79            can      108
       80           down      106
       81            see      104
       82          about      103
       83            may      101
       84           only       97
       85            how       96
       86    baskerville       93
       87            any       93
       88           than       88
       89           must       87
       90         before       87
       91            own       85
       92          might       85
       93             am       84
       94        charles       83
       95          where       81
       96          think       80
       97           them       79
       98            two       76
       99           come       76
      100      stapleton       75

Jim Mahoney | cs.bennington.college | MIT License | March 2021
"""
import sys

# File analyzed when no other name is supplied.
DEFAULT_FILENAME = 'hound.txt'

# Characters that are replaced by spaces before a line is split into words.
# (The misspelled name is kept so that existing references still resolve.)
puncuation = ['"', ';', '.', '?', "'"]


def line_to_words(line):
    """ convert a line of text into a list of lowercase words """
    for char in puncuation:
        line = line.replace(char, ' ')
    # split() with no argument ignores leading/trailing whitespace and
    # never yields empty strings, so no strip() or '' filter is needed.
    return [word.lower() for word in line.split()]


def all_words(filename):
    """ return a list of the words in the given file """
    words = []
    with open(filename) as file:
        for line in file:
            words.extend(line_to_words(line))
    return words


def count_words(words):
    """ return a dictionary of {word:count} given [word0, word1, ...] """
    count = {}
    for word in words:
        count[word] = count.get(word, 0) + 1
    return count


def flip(pairs):
    """ Convert pairs [(a0,b0), ...] to [(b0,a0), ...] """
    return [(pair[1], pair[0]) for pair in pairs]


def sort_counts(counts):
    """ Given {word:count}, return sorted [(word0,count0), ...]

        The result is ordered by descending count; words with equal
        counts appear in descending word order.
    """
    # Sorting (count, word) tuples compares the count first, then the word.
    count_word = flip(counts.items())
    count_word.sort(reverse=True)
    return flip(count_word)


def print_words(word_count, limit=100):
    """ print a table of words and their counts

        word_count is a list [(word0,count0), ...]; at most `limit`
        rows are printed, fewer if the list is shorter.
    """
    print(f" {'i':>4} {'word':>14} {'count':>8} ")
    print(f" {'-'*4:>4} {'-'*14:>14} {'-'*8:>8} ")
    # Slicing stops at the end of the list, so short inputs do not
    # raise IndexError.
    for i, (word, count) in enumerate(word_count[:limit], start=1):
        print(f" {i:>4} {word:>14} {count:>8} ")


def main(filename=DEFAULT_FILENAME):
    """ print a word count report for the given file """
    print(f"Analyzing words in '{filename}'.")
    words = all_words(filename)
    counts = count_words(words)
    print(f"Number words is {len(words)}.")
    print(f"First five are {words[:5]}.")
    print(f"Number of different ones is {len(counts)}.")
    print("Here are the top 100 : ")
    print_words(sort_counts(counts))


# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_FILENAME)