recursive_descent.py

"""recursive_descent.py

 An example of a recursive descent parser using 
 simple grammar, "fully parenthesized expression" 
 as described at https://math.hws.edu/javanotes/c9/s5.html,
 where they say in BNF 

    <expression>  ::=  <number>  |
                       "(" <expression> <operator> <expression> ")"
                   
     <operator>  ::=  "+" | "-" | "*" | "/"

 I'll expand on this a bit, defining number explicitly 
 and using this notation :

     ::=        production rule
     "x"        terminal character
     name       nontermimal symbol
     |          alternative productions
     *          0 or any number of times of the preceding

 grammar : 

    expression ::=  "(" expression operator expression ")" | number
    operator   ::=  "+" | "-" | "*" | "/"
    number     ::=  digit digit*
    digit      ::=  "0" | "1" | "2" | "3" | "4" |
                    "5" | "6" | "7" | "8" | "9" 

 I've also changed the order of the alternatives in expression
 because it seemed simpler to recognize the "(" .

 examples :

    123
    (15-32)
    ((1+2)*(13/77))

 The generated parse tree is made of nodes that look like this :

    parse tree node : [name, part1, part2, part3, ...]

 which come from a rule like this

    name ::= part1 part2 part3

 and which form a subtree

        ___name______
       /      |      \
    part1  part2  part3
 
 For example, here are a few parse tree nodes :

    ["expression", "(", expr1, op1, expr2, ")"]
    ["operator", "+"]
    ["number", digit1, digit2]
    ["digit", "4"]

 And here are two full parse trees examples :

 "123"  => [EXP, [NUM, [DIG, "1"] [DIG, "2"], [DIG, "3"]]]

      expression
        |
       number
      /  |    \
   digit digit digit
    |     |     |
    1     2     3

 "(15*32)" => [EXP, "(",
                    [EXP, [NUM, [DIG, "1"], [DIG, "5"]]],
                    [OP, "-"],
                    [EXP, [NUM, [DIG, "3"], [DIG, "2"]]],
                    ")"]

       expression
      /  |  |  |  \
     (   |  -  |   )
         |     |
        num     num
       /  \     /  \
       dig dig dig  dig
       |   |   |    |
       1   5   3    2      

 In practice we wouldn't want to be quite this pendantic; we'd instead
 use a regular expression in the lexer to recognize the number, and
 not bother to put the intermediate DIGIT steps into the parser tree.

 For this simple example, I'll treat each token is just one character, 
 one of "(", ")", "+", "-", "*", "/", "0" ... "9" .

 Again, in practice we would use regular expressions to have 
 the lexer do more work, pulling out a multiple-character number
 as a single token.

 To run this and send output and errors to an output file :

   $ python recursive_descent.py > recursive_descent_out.txt 2>&1

 Jim Mahoney | Nov 18 2021 | cs.bennington.college | MIT License
"""
import pprint
pretty_print = pprint.PrettyPrinter().pprint

class LexerQueue:
    """ an API for moving through the tokens of the code string """
    def __init__(self, code):
        self.code = code  # the code string to analyze i.e. "(1+2)"
        self.offset = 0   # current character is self.code[self.offset]
    def peek(self):
        """ return the next token (character) 
            or the empty string if there are no more."""
        try:
            return self.code[self.offset]
        except:
            return ""
    def pop(self):
        """ return the next token and move on to the next """
        result = self.peek()
        self.offset += 1
        return result
    def is_empty(self):
        """ return True if there are no more tokens """
        return self.peek() == ""
    
# -- a LexerQueue test -----------------------------------------
lex_test = LexerQueue("(1+2)")
assert lex_test.peek() == "("      # first token ; don't move forward
assert lex_test.pop() == "("       # first token ; do move forward
assert lex_test.pop() == "1"       # next token should be this
while not lex_test.is_empty():     # slurp the remaining tokens
    lex_test.pop()
assert lex_test.is_empty()         # no more tokens left

# -- parser ---------

digit_tokens = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9')
operator_tokens = ('+', '-', '*', '/')

def expression(lq):
    if lq.peek() == "(":
        left = lq.pop()
        expr1 = expression(lq)
        op = operator(lq)
        expr2 = expression(lq)
        right = lq.pop()
        assert left == "(" and right == ")", \
            "OOPS - didn't find expected ( or )"
        return ["expression", left, expr1, op, expr2, right]
    else:
        return ["expression", number(lq)]

def operator(lq):
    op = lq.pop()
    assert op in operator_tokens, \
        "OOPS - failed to find expected operator"
    return ['operator', op]
        
def number(lq):
    result = ['number', digit(lq)]
    while lq.peek() in digit_tokens:
        result.append(digit(lq))
    return result

def digit(lq):
    value = lq.pop()
    assert value in digit_tokens, \
        "OOPS - didn't find expected digit"
    return ["digit", value]

# -- parser tests -------------

examples = [
    "1",
    "(1+2)",
    "((12+34)*(56-78))",
    "(1+"                  # should throw an error
    ]

for code in examples:
    print(f"code = '{code}'")
    pretty_print(expression(LexerQueue(code)))
    print()