""" bigrams.py bigrams (pairs of letters) and conditional probability - an example See https://en.wikipedia.org/wiki/Bigram https://en.wikipedia.org/wiki/A_Mathematical_Theory_of_Communication $ python bigrams.py -v Trying: cp[(b,a)] # probabilty of 'b' given preceding 'a'; a* is (b,b,c,z) Expecting: 0.5 ok Trying: cp[(a,b)] # probability of 'a' given preceding 'b'; b* is (a) Expecting: 1.0 ok Trying: c['a'] # 8 chars, 4 are 'a' Expecting: 4 ok Trying: get_pairs('ababacaz') Expecting: ('ab', 'ba', 'ab', 'ba', 'ac', 'ca', 'az', 'za') ok Trying: p[a] # p(a) = 4/8 , since 8 'a' of 8 total chars Expecting: 0.5 ok Trying: sum(p.values()) # probabilities add to 1 Expecting: 1.0 ok Trying: p2[ab] # 2 times out of 8 pairs Expecting: 0.25 ok Trying: sum(p2.values()) # probabilities add to 1 Expecting: 1.0 ok Test passed. Jim Mahoney | cs.bennington.college | Oct 2020 | MIT License """ (a,b, ab) = ('a', 'b', 'ab') # just to save some typing def count(things): """ return dictionary of {thing:count} >>> c = count('ababacaz') >>> c['a'] # 8 chars, 4 are 'a' 4 """ counts = {} for thing in things: counts[thing] = 1 + counts.get(thing, 0) return counts def probability(things): """ return probabilities of {thing:p(thing)} >>> p = probability('ababacaz') >>> p[a] # p(a) = 4/8 , since 8 'a' of 8 total chars 0.5 >>> sum(p.values()) # probabilities add to 1 1.0 >>> p2 = probability(get_pairs('ababacaz')) >>> p2[ab] # 2 times out of 8 pairs 0.25 >>> sum(p2.values()) # probabilities add to 1 1.0 """ # These probabilities must sum to 1. counts = count(things) n = len(things) probs = {} for thing in counts: probs[thing] = counts[thing]/n return probs def get_pairs(text): """ return all pairs of sequential letters including wrap around >>> get_pairs('ababacaz') ('ab', 'ba', 'ab', 'ba', 'ac', 'ca', 'az', 'za') """ # The reason to wrap around is to fix for a subtle counting issue. # If we ask for the probability P(a) for a single character, # without wrap-around we'll get different answers depending on # whether that character is the first or second in a pair, since # not all characters in the text are in each category ... unless # we wrap around. For example, if we want P(a|z) to mean "the # probability of 'a' given the we just saw 'z'", and z only shows # up as the last character, then since nothing comes after the z, # this isn't even defined unless we wrap around to allow 'za' to # be one of the pairs. Without wrap around, we need to # distinguish P(a)_1st_char P(a)_2nd_char since the P(a)_1st_char # looks at text[:-1], while P(a)_2nd_char is based on # text[1:]. I'm choosing the wrap around complication to make the # other formulas simpler. pairs = [] for i in range(len(text)-1): pairs.append(text[i:i+2]) pairs.append(text[-1] + text[0]) return tuple(pairs) def conditional(text): """ return dictionary conds[(b,a)] = P(b|a) = probability of b given a for all sequential bigrams 'ab' in text, including wrap-around >>> cp = conditional('ababacaz') # conditional probability >>> cp[(b,a)] # probabilty of 'b' given preceding 'a'; a* is (b,b,c,z) 0.5 >>> cp[(a,b)] # probability of 'a' given preceding 'b'; b* is (a) 1.0 """ # Note that this won't include zero probability results, # since we're not counting things that haven't happened. # So for example in 'ababacaz', P(a|a) which is the probability # of an 'a' given that we just saw an 'a' which should be 0.0 # is not in this dictionary. That means that in code which uses # this, any values not found should be assumed to have probability 0. pairs = get_pairs(text) p1 = probability(text) # single letter probabilities p2 = probability(pairs) # double letter probabilities conds = {} for xy in pairs: (x, y) = (xy[0], xy[1]) # i.e. two letters in bigram "ab" conds[(y,x)] = p2[xy] / p1[x] # P(b|a) = P(a & b)/P(a) return conds if __name__ == '__main__': import doctest doctest.testmod()