""" hashing.py Thinking about hash functions. Given a set of keys to be hashed, a good hash function will spread out that hash values in a fairly uniform way, thus reducing the number of potential collisions. Here we take a large list of words, take the hash function of each, place those numbers into a much smaller number of bins, and see how spread out they are across the bins. The hash functions examined are id python's built-in hash function dbj2 a classic 32bit hash function by Dan Bernstein ascii_sum a naive "sum the ascii values" hash function Here's what it looks like when it runs. $ python hashing.py -- id -- Looking at distribution of id() values ... Total number of words is 235887. Distributing id(word) among 1001 bins. Number of non-empty bins is 1001. Largest bin count is 245. Minimum bin count is 226. -- djb2 -- Looking at distribution of djb2() values ... Total number of words is 235887. Distributing djb2(word) among 1001 bins. Number of non-empty bins is 1001. Largest bin count is 290. Minimum bin count is 170. -- ascii_sum -- Looking at distribution of ascii_sum() values ... Total number of words is 235887. Distributing ascii_sum(word) among 1001 bins. Number of non-empty bins is 1001. Largest bin count is 630. Minimum bin count is 25. Jim Mahoney | cs.bennington.college | MIT License | March 2021 """ from collections import Counter all_words = open('/usr/share/dict/words').read().split('\n') def djb2(string): """ Dan Benstein's 32bit hash function """ # See for example http://www.cse.yorku.ca/~oz/hash.html # This is what you might call "bit munging"; # the sort of code you'd more typically find in C than in python. bit32 = 0xFFFFFFFF # 32 bits, all 1 hash = 5381 for c in string: hash = ((( hash << 5) + hash) + ord(c)) & bit32 # That is the same as this but (hopefully) faster ; # * the << operator is "bit shift left" # * the & operator is "bit and" # hash = (hash*33 + ord(char)) % 2**32 return hash def ascii_sum(string): """ return sum of ord(char) """ return sum([ord(char) for char in string]) def describe_hash_bins(hash_func, bins=1001, words=all_words): """ print an analysis of the given hash function """ hash_name = hash_func.__name__ hashes = [hash_func(word) % bins for word in words] counts = Counter(hashes) # A python library dict structure for counting print() print(f" -- {hash_name} --") print(f" Looking at distribution of {hash_name}() values ... ") print(f" Total number of words is {len(words)}.") print(f" Distributing {hash_name}(word) among {bins} bins.") print(f" Number of non-empty bins is {len(counts)}.") print(f" Largest bin count is {max(counts.values())}.") print(f" Minimum bin count is {min(counts.values())}.") describe_hash_bins(id) describe_hash_bins(djb2) describe_hash_bins(ascii_sum) print()