This document describes a Pratt parser implementation in Python. It opens with an example arithmetic expression and the reasons for writing one's own parser, then gives an overview of Pratt parsing as an efficient, modular technique. The rest walks through the implementation, including the token and node classes, and defines the grammar through token binding powers rather than BNF.
4. Why write your own parser?
● It is not as big a task as it might seem
● More control over the implementation details/techniques
● Many of the existing Python parsing libraries are lacking in one or more areas
● Writing parsers is fun
5. What is a Pratt Parser and why use it?
● A parsing technique designed to handle operator precedence correctly
● First appeared in “Top Down Operator Precedence” by Vaughan Pratt (1973)
● A variation of a recursive descent parser, but
○ Efficient
○ Modular and flexible
○ Easy to implement and iterate upon
○ Beautiful
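To make "easy to implement" concrete, here is a minimal, self-contained sketch of the technique — not the implementation presented later in this deck; `MiniParser`, `tokenize`, and the `LBP` table are illustrative names:

```python
import re

# Core idea: every operator has a left binding power (lbp), and
# expression() keeps folding infix operators for as long as they
# bind tighter than the caller's right binding power (rbp).

def tokenize(src):
    # numbers and the two operators we support; '<end>' marks exhaustion
    return re.findall(r'\d+|[+*]', src) + ['<end>']

LBP = {'+': 10, '*': 20, '<end>': 0}

class MiniParser(object):
    def __init__(self, src):
        self.tokens = iter(tokenize(src))
        self.token = next(self.tokens)

    def advance(self):
        self.token = next(self.tokens)

    def expression(self, rbp=0):
        # "nud": a number in prefix position evaluates to itself
        left = int(self.token)
        self.advance()
        # "led": consume infix operators while they outrank rbp
        while LBP[self.token] > rbp:
            op = self.token
            self.advance()
            right = self.expression(LBP[op])
            left = left + right if op == '+' else left * right
        return left

print(MiniParser('2+3*4').expression())  # -> 14, because * binds tighter
```

The whole precedence mechanism is the single `while LBP[self.token] > rbp` comparison; everything the full implementation adds is packaging around that loop.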
6. Why isn’t it more popular?
“One may wonder why such an "obviously" utopian approach has not been generally
adopted already. I suspect the root cause of this kind of oversight is our universal
preoccupation with BNF grammars and their various offspring grammars [...] together
with their related automata and a large body of theorems. I am personally enamored
of automata theory per se, but I am not impressed with the extent to which it has so
far been successfully applied to the writing of compilers or interpreters. Nor do I see a
particularly promising future in this direction. Rather, I see automata theory as
holding back the development of ideas valuable to language design that are not
visibly in the domain of automata theory.”
Vaughan R. Pratt “Top Down Operator Precedence”
11. class Symbol(object):
    """Base class for all nodes"""
    id = None
    lbp = 0

    def __init__(self, parser, value=None):
        self.parser = parser
        self.value = value or self.id
        self.first = None
        self.second = None

    def nud(self):
        """Null denotation. Prefix/Nilfix symbol"""
        raise ParserError("Symbol action undefined for `%s'" % self.value)

    def led(self, left):
        """Left denotation. Infix/Postfix symbol"""
        raise ParserError("Infix action undefined for `%s'" % self.value)
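What the excerpt does not show is the driver that calls nud() and led(). A hypothetical sketch of that loop — `Parser` here is illustrative, not necessarily the repo's actual class — assuming tokens are Symbol-like objects and the stream ends with a zero-lbp end marker:

```python
class Parser(object):
    """Hypothetical driver loop: nud() handles a token in prefix
    position, led() in infix position, and lbp decides how far each
    led() recursion is allowed to reach."""
    def __init__(self, tokens):
        self.tokens = iter(tokens)
        self.token = next(self.tokens)

    def advance(self, expected=None):
        if expected is not None and self.token.value != expected:
            raise SyntaxError("Expected `%s'" % expected)
        self.token = next(self.tokens)

    def expression(self, rbp=0):
        t = self.token
        self.advance()
        left = t.nud()               # prefix/nilfix position
        while self.token.lbp > rbp:  # stop once operators bind looser
            t = self.token
            self.advance()
            left = t.led(left)       # infix/postfix position
        return left
```

Note how Symbol subclasses never inspect precedence themselves; they only call back into expression() with a binding power, and this loop does the rest.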
12. class Literal(Symbol):
    """Simple literal (a number or a variable/function name)
    just produces itself"""
    def nud(self):
        return self

class Prefix(Symbol):
    """Prefix operator.
    For the sake of simplicity has fixed right binding power"""
    def nud(self):
        self.first = self.parser.expression(80)
        return self
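The infix case is analogous but not shown on this slide. A hypothetical sketch — the `Symbol` stub below is a minimal stand-in for the base class from the previous slide, and the lbp values used in the tests are illustrative:

```python
class Symbol(object):
    """Minimal stand-in for the Symbol base class shown earlier."""
    lbp = 0
    def __init__(self, parser, value=None):
        self.parser = parser
        self.value = value
        self.first = None
        self.second = None

class Infix(Symbol):
    """Left-associative infix operator: the left operand is already
    parsed; recurse for the right operand at our own binding power."""
    def led(self, left):
        self.first = left
        self.second = self.parser.expression(self.lbp)
        return self

class InfixR(Symbol):
    """Right-associative variant (e.g. ^): recursing at lbp - 1 lets an
    operator of equal precedence on the right bind to us first."""
    def led(self, left):
        self.first = left
        self.second = self.parser.expression(self.lbp - 1)
        return self
```

The only difference between left and right associativity is the `- 1` in the recursive binding power, which is what makes the Pratt approach so compact.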
18. expr.define("<punct>")

@expr.define("(", 90)
class FunctionCall(Symbol):
    """Defining both function application and parenthesized expression"""
    def nud(self):
        e = self.parser.expression(0)
        self.parser.advance(")")
        return e

    def led(self, left):
        self.first = left
        args = self.second = []
        p = self.parser
        while p.token.value != ")":
            args.append(p.expression(0))
            if p.token.value != ",":
                break
            p.advance(",")
        p.advance(")")
        return self
19. But what about lexing?

import re

TOKENS = (
    ('ws', r'\s+'),
    ('name', r'[a-z][\w_]*'),
    ('infix', r'[-+*/^]'),
    ('punct', r'[(),]'),
    ('number', r'(?:\d*\.)?\d+'),
)
TOKEN_RE = '|'.join("(?P<%s>%s)" % t for t in TOKENS)
LEX_RE = re.compile(TOKEN_RE, re.UNICODE | re.IGNORECASE)

class Token(object):
    def __init__(self, token_type, value, pos):
        self.token_type = token_type
        self.value = value
        self.pos = pos
20. def lex(source, pat=LEX_RE):
    i = 0
    def error():
        raise LexerException(
            "Unexpected character at position %d: `%s`" % (i, source[i])
        )
    for m in pat.finditer(source):
        pos = m.start()
        if pos > i:
            error()
        i = m.end()
        name = m.lastgroup
        if name != "ws":
            token_type = "<%s>" % name
            yield Token(token_type, m.group(0), pos)
    if i < len(source):
        error()
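Putting the last two slides together, the lexer can be exercised on its own. This standalone sketch inlines the token table and simplifies the error to ValueError, since LexerException is defined elsewhere in the repo; it also yields plain tuples instead of Token objects to keep the example short:

```python
import re

TOKENS = (
    ('ws', r'\s+'),
    ('name', r'[a-z][\w_]*'),
    ('infix', r'[-+*/^]'),
    ('punct', r'[(),]'),
    ('number', r'(?:\d*\.)?\d+'),
)
LEX_RE = re.compile('|'.join('(?P<%s>%s)' % t for t in TOKENS),
                    re.UNICODE | re.IGNORECASE)

def lex(source, pat=LEX_RE):
    i = 0
    for m in pat.finditer(source):
        if m.start() > i:  # a gap means a character no pattern matched
            raise ValueError("Unexpected character at %d: %r" % (i, source[i]))
        i = m.end()
        if m.lastgroup != 'ws':  # whitespace advances i but emits nothing
            yield ('<%s>' % m.lastgroup, m.group(0))
    if i < len(source):
        raise ValueError("Unexpected character at %d: %r" % (i, source[i]))

print(list(lex('sin(x) + 1.5')))
# -> [('<name>', 'sin'), ('<punct>', '('), ('<name>', 'x'),
#     ('<punct>', ')'), ('<infix>', '+'), ('<number>', '1.5')]
```

The gap check is the whole error-handling strategy: finditer silently skips unmatchable characters, so any distance between one match's end and the next match's start pinpoints a lexing error.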
21. References

● Vaughan R. Pratt "Top Down Operator Precedence" (1973)
  https://tdop.github.io/
● Douglas Crockford "Top Down Operator Precedence" (2007)
  http://javascript.crockford.com/tdop/tdop.html
● Fredrik Lundh "Simple Top-Down Parsing in Python" (2008)
  http://effbot.org/zone/simple-top-down-parsing.htm

All code in this presentation can be found at:
https://github.com/percolate/pratt-parser
We are Percolate and we’re always hiring great engineers. Talk to us