"""
 bigrams.py

 bigrams (pairs of letters) and conditional probability - an example

 See 
   https://en.wikipedia.org/wiki/Bigram
   https://en.wikipedia.org/wiki/A_Mathematical_Theory_of_Communication

 $ python bigrams.py -v
 Trying:
     cp[(b,a)] # probabilty of 'b' given preceding 'a'; a* is (b,b,c,z)
 Expecting:
     0.5
 ok
 Trying:
     cp[(a,b)] # probability of 'a' given preceding 'b'; b* is (a)
 Expecting:
     1.0
 ok
 Trying:
     c['a']  # 8 chars, 4 are 'a'
 Expecting:
     4
 ok
 Trying:
     get_pairs('ababacaz')
 Expecting:
     ('ab', 'ba', 'ab', 'ba', 'ac', 'ca', 'az', 'za')
 ok
 Trying:
     p[a]  # p(a) = 4/8 , since 8 'a' of 8 total chars
 Expecting:
     0.5
 ok
 Trying:
     sum(p.values())  # probabilities add to 1
 Expecting:
     1.0
 ok
 Trying:
     p2[ab]   # 2 times out of 8 pairs
 Expecting:
     0.25
 ok
 Trying:
     sum(p2.values())  # probabilities add to 1
 Expecting:
     1.0
 ok
 Test passed.

 Jim Mahoney | cs.bennington.college | Oct 2020 | MIT License
"""

(a,b, ab) = ('a', 'b', 'ab') # just to save some typing

def count(things):
    """ return dictionary of {thing:count} 
        >>> c = count('ababacaz')
        >>> c['a']  # 8 chars, 4 are 'a'
        4
    """
    counts = {}
    for thing in things:
        counts[thing] = 1 + counts.get(thing, 0)
    return counts

def probability(things):
    """ return probabilities of {thing:p(thing)} 
        >>> p = probability('ababacaz')
        >>> p[a]  # p(a) = 4/8 , since 8 'a' of 8 total chars
        0.5
        >>> sum(p.values())  # probabilities add to 1
        1.0
        >>> p2 = probability(get_pairs('ababacaz'))
        >>> p2[ab]   # 2 times out of 8 pairs
        0.25
        >>> sum(p2.values())  # probabilities add to 1
        1.0
    """
    # These probabilities must sum to 1.
    counts = count(things)
    n = len(things)
    probs = {}
    for thing in counts:
        probs[thing] = counts[thing]/n
    return probs

def get_pairs(text):
    """ return all pairs of sequential letters including wrap around
        >>> get_pairs('ababacaz')
        ('ab', 'ba', 'ab', 'ba', 'ac', 'ca', 'az', 'za')
    """
    # The reason to wrap around is to fix for a subtle counting issue.
    # If we ask for the probability P(a) for a single character,
    # without wrap-around we'll get different answers depending on
    # whether that character is the first or second in a pair, since
    # not all characters in the text are in each category ... unless
    # we wrap around. For example, if we want P(a|z) to mean "the
    # probability of 'a' given the we just saw 'z'", and z only shows
    # up as the last character, then since nothing comes after the z,
    # this isn't even defined unless we wrap around to allow 'za' to
    # be one of the pairs.  Without wrap around, we need to
    # distinguish P(a)_1st_char P(a)_2nd_char since the P(a)_1st_char
    # looks at text[:-1], while P(a)_2nd_char is based on
    # text[1:]. I'm choosing the wrap around complication to make the
    # other formulas simpler.
    pairs = []
    for i in range(len(text)-1):
        pairs.append(text[i:i+2])
    pairs.append(text[-1] + text[0])
    return tuple(pairs)

def conditional(text):
    """ return dictionary conds[(b,a)] = P(b|a) = probability of b given a
        for all sequential bigrams 'ab' in text, including wrap-around
        >>> cp = conditional('ababacaz')  # conditional probability
        >>> cp[(b,a)] # probabilty of 'b' given preceding 'a'; a* is (b,b,c,z)
        0.5
        >>> cp[(a,b)] # probability of 'a' given preceding 'b'; b* is (a)
        1.0
    """
    # Note that this won't include zero probability results,
    # since we're not counting things that haven't happened.
    # So for example in 'ababacaz', P(a|a) which is the probability
    # of an 'a' given that we just saw an 'a' which should be 0.0
    # is not in this dictionary. That means that in code which uses
    # this, any values not found should be assumed to have probability 0.
    pairs = get_pairs(text)
    p1 = probability(text)   # single letter probabilities
    p2 = probability(pairs)  # double letter probabilities
    conds = {}
    for xy in pairs:
        (x, y) = (xy[0], xy[1])        # i.e. two letters in bigram "ab"
        conds[(y,x)] = p2[xy] / p1[x]  # P(b|a) = P(a & b)/P(a)
    return conds

if __name__ == '__main__':
    import doctest
    doctest.testmod()