""" hash_table_again.py Another hash table implementation A *lot* faster than the hash_table_demo.py, though to be fair this one does use more memory as needed, in the "chain lists" if not in the hash table. $ python hash_table_again.py -- hash table test 1 get('cat') : meow get('dog') : woof get('not in table') : None -- hash table test 2 -- Found key 'abc', value is '1'. Did not find key 'def'. Found key 'bbb', value is '3'. -- hash table test 3 -- With n= 101, half each yes/no, elapsed= 0.000429. With n= 1001, half each yes/no, elapsed= 0.004252. With n=10001, half each yes/no, elapsed= 0.043250. -------------------- This one uses "separate chaining" to handle collisions, with a python list (a "chain") for each hash bucket. table [0] is [] [1] is [ ('a', 1) ] ... key ---> [hash] is [ ('b', 2), ('c', 3) ] chain One Jim Mahoney | cs.bennington.college | MIT License | March 2021 """ from utilities import random_words from time import time def hash_function(string): """ Given a string, return a (largish) integer hash value """ # This is one of the classic hash functions, by Dan Bernstein, # known as "djb2" # # See for example https://theartincode.stanis.me/008-djb2/ # The mod arithmetic keeps the hash value between 0 and 2**32, # in other words and unsigned 32 bit integer. # There are two "magic numbers" here, 5381 and 33, # which apparently give pretty good pseudo-random uniform # behavior for typical strings. hash = 5381 for letter in string: hash = (hash * 33 + ord(letter)) % 4294967296 return hash class HashTable: """ a hash table (python dictionary) with "get" and "set" methods """ def __init__(self, size=1001): """ create an initially empty hash table """ # Each entry in self.table is either # (a) [] # the empty list # (b) [(key1,value1), (key2,value2), ...] # self.size = size # number of bins (both filled & empty). self.entries = 0 # number of bins with stuff in 'em. self.table = [ None ] * self.size # Make each entry an empty list. for i in range(size): self.table[i] = [] def show(self): # return a string showing the whole table return f"" def _index_of(self, key): """ Look for the location of a given key. If the key is already in the table, return (hash, offset), such that self.table[hash][offset] is (key, value) If not, return (hash, -1). """ hash = hash_function(key) % self.size chain = self.table[hash] # [(key,val), ... ] with this hash chain_length = len(chain) for i in range(chain_length): (maybe_key, _) = chain[i] if maybe_key == key: return (hash, i) # found it. # didn't find it return (hash, -1) def set(self, key, value): """ add (or replace) a (key,value) in the hash table """ (hash, offset) = self._index_of(key) chain = self.table[hash] if offset == -1: chain.append( (key, value) ) self.entries += 1 def get(self, key): """ return the corresponding value or None if not found """ (hash, offset) = self._index_of(key) if offset == -1: return None else: chain = self.table[hash] (_, value) = chain[offset] return value def test1(): print("-- hash table test 1") smalltable = HashTable(7) smalltable.set('cat', 'meow') print(smalltable.show()) smalltable.set('dog', 'woof') print(smalltable.show()) print(" get('cat') : ", smalltable.get('cat')) print(" get('dog') : ", smalltable.get('dog')) print(" get('not in table') : ", smalltable.get('not in table')) def test2(): # Some sample data to put into the table. # (Note that 'bbb' has the same hash as 'abc'.) print("-- hash table test 2 --") hashtable = HashTable(5) pairs = [('abc', 1), ('abcd', 2), ('bbb', 3)] for (key, value) in pairs: hashtable.set(key, value) print(hashtable.show()) test_keys = ('abc', 'def', 'bbb') for key in test_keys: value = hashtable.get(key) if value: print(f"Found key '{key}', value is '{value}'. ") else: print(f"Did not find key '{key}'.") def time_test(n, yes_words, no_words): """ return time it takes to put in yes_words, check yes_words & no_words """ start = time() hash_table = HashTable(n) for word in yes_words: hash_table.set(word, len(word)) for word in yes_words: try: assert hash_table.get(word) == len(word) except AssertionError: print(f"Failed to hit word {word}; got {hash_table.get(word)}") for word in no_words: try: assert hash_table.get(word) == None except AssertionError: print(f"Failed to miss word {word}; got {hash_table.get(word)}") return time() - start def test3(): print("-- hash table test 3 --") for n in (101, 1001, 10001): words = random_words(n) yes_words = words[0:n//2] # words to be put in hashtable no_words = words[n//2:] # words not in hashtable elapsed = time_test(n, yes_words, no_words) print(f"With n={n:>5}, half each yes/no, elapsed={elapsed:>12.6f}.") def main(): test1() test2() test3() if __name__ == '__main__': main()