pratt parser in python
TRANSCRIPT
Pratt Parser in Python
Maxim Eronin
1
An exercise in design and implementation
Use Case: Calculated Metrics
2
'2 + 3^4^5 * log2(8)'
'2', '+', '3', '^', '4', '^', '5', '*', 'log2', '(', '8', ')'
ws = '\s+'
name = '[a-z][\w_]*'
infix = '[+\-*/\^]'
punct = '[\(\),]'
number = '(\d*\.)?\d+'
3
Why write your own parser?
● It is not as big a task as it might seem
● More control over the implementation details/techniques
● Many of the existing Python parsing libraries are lacking in one or more areas
● Writing parsers is fun
4
What is a Pratt Parser and why use it?
● Parsing technique designed for parsing operator
precedence correctly● First appeared in “Top Down Operator Precedence” by
Vaughan Pratt (1973)● A variation of a recursive descent parser but
○ Efficient
○ Modular and flexible
○ Easy to implement and iterate upon
○ Beautiful
5
Why isn’t it more popular?
“One may wonder why such an "obviously" utopian approach has not been generally adopted already. I suspect the root cause of this kind of oversight is our universal preoccupation with BNF grammars and their various offspring grammars[...] together with their related automata and a large body of theorems. I am personally enamored of automata theory per se, but I am not impressed with the extent to which it has so far been successfully applied to the writing of compilers or interpreters. Nor do I see a particularly promising future in this direction. Rather, I see automata theory as holding back the development of ideas valuable to language design that are not visibly in the domain of automata theory.”
Vaughan R. Pratt “Top Down Operator Precedence”
6
Simple arithmetic expression grammar
expression ::= mul-expr ( ( '+' | '-' ) mul-expr )*
mul-expr ::= pow-expr ( ( '*' | '/' ) pow-expr )*
pow-expr ::= prefix-expr ['^' pow-expr]
prefix-expr ::= [ '-' ] primary
primary ::= '(' expr ')' | number | name [ '(' expr ( ',' expr )* ')' ]
7
Pratt parser: no grammar, only tokens
nilfix '<number>', '<name>'
infix '+', '-' 10
infix '*', '/' 20
infixr '^' 30
prefix '-' 40
infix '(' 50
8
# Grammar definition for the expression language: each `define` call
# registers a token id with its symbol class and left binding power (lbp).
# Fixed: `Prefix` was used by Minus below but missing from the import.
from expr_parser.lexer import lex
from expr_parser.parser import Parser, Symbol, Literal, Infix, InfixR, Prefix

expr = Parser(lex)

# Nilfix symbols: literals parse to themselves and bind nothing.
expr.define("<number>", 0, Literal)
expr.define("<name>", 0, Literal)

# Binary operators, higher lbp binds tighter.
expr.define("+", 50, Infix)
expr.define("*", 60, Infix)
expr.define("/", 60, Infix)
expr.define("^", 70, InfixR)  # right-associative


@expr.define("-", 50)
class Minus(Infix, Prefix):
    """This combines both Prefix' nud and Infix' led"""
    pass
9
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
10
class Symbol(object):
    """Base class for every node in the grammar / parse tree.

    Subclasses override `nud` and/or `led`; the base versions raise, so
    using a token in a position its symbol does not support is an error.
    """

    id = None   # token id this symbol class was registered under
    lbp = 0     # left binding power (0 = never binds as an infix)

    def __init__(self, parser, value=None):
        self.parser = parser
        # Concrete token text when supplied, otherwise the symbol's id.
        self.value = value if value else self.id
        # Operand slots, filled in by nud/led of concrete subclasses.
        self.first = None
        self.second = None

    def nud(self):
        """Null denotation. Prefix/Nilfix symbol"""
        raise ParserError("Symbol action undefined for `%s'" % self.value)

    def led(self, left):
        """Left denotation. Infix/Postfix symbol"""
        raise ParserError("Infix action undefined for `%s'" % self.value)
11
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
class Literal(Symbol):
    """A leaf node (number or variable/function name).

    Null denotation simply yields the node itself — no operands.
    """

    def nud(self):
        return self
class Prefix(Symbol):
    """Unary prefix operator.

    For simplicity the right binding power is the fixed constant 80:
    the operand is parsed with rbp=80 and stored in `first`.
    """

    def nud(self):
        self.first = self.parser.expression(80)
        return self
12
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
class Infix(Symbol):
    """Left-associative binary operator.

    `left` (already parsed) becomes `first`; the right operand is parsed
    with rbp equal to this operator's lbp, so equal-precedence operators
    associate to the left.
    """

    def led(self, left):
        self.first, self.second = left, self.parser.expression(self.lbp)
        return self
class InfixR(Infix):
    """Right-associative binary operator.

    Parsing the right side with rbp = lbp - 1 allows an operator of the
    same precedence to bind on the right, yielding right associativity.
    """

    def led(self, left):
        self.first, self.second = left, self.parser.expression(self.lbp - 1)
        return self
13
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
class Parser(object):
    """Main parser class. Contains both the grammar definition
    (the symbol table) and a pointer to the current token stream."""

    def __init__(self, lex=lexer.lex):
        self.lex = lex
        self.symbol_table = {}
        # "<end>" marks stream exhaustion; its lbp of 0 stops the
        # expression loop below.
        self.define("<end>")
        self.tokens = iter(())
        self.token = None

    def expression(self, rbp):
        """Parse and return one expression.

        `rbp` (right binding power) bounds what this call may consume:
        it keeps folding infix operators into `left` while the upcoming
        token binds more tightly than rbp.
        """
        tok = self.token
        self.advance()
        left = tok.nud()
        while rbp < self.token.lbp:
            tok = self.token
            self.advance()
            left = tok.led(left)
        # Fixed: transcript had the slide page number fused on ("left14").
        return left
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
def advance(self, value=None):
    """Consume the current token and fetch the next one from the stream.

    If `value` is given, first check that the *current* token matches it
    (by value or id) and raise ParserError otherwise.  The next token's
    value is looked up in the symbol table before its type, so specific
    operators win over generic token classes.  When the stream is
    exhausted the distinguished "<end>" symbol becomes the current token.
    Returns the new current token.
    """
    tok = self.token
    if value and value not in (tok.value, tok.id):
        raise ParserError(
            "Expected `%s'; got `%s' instead" % (value, tok.value))
    try:
        # Fixed: `.next()` is Python-2-only; builtin next() works in 2 and 3.
        tok = next(self.tokens)
        symbol_table = self.symbol_table
        # first look up token's value
        if tok.value in symbol_table:
            sym = symbol_table[tok.value]
        elif tok.token_type in symbol_table:
            # then token's type
            sym = symbol_table[tok.token_type]
        else:
            raise ParserError("Undefined token %s" % repr(tok))
        self.token = sym(self, tok.value)
    except StopIteration:
        self.token = self.symbol_table["<end>"](self)
    # Fixed: transcript had the slide page number fused on ("self.token15").
    return self.token
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
20
def define(self, sid, bp=0, symbol_class=Symbol):
    """Register a symbol under id `sid` with left binding power `bp`.

    A fresh subclass of `symbol_class` is generated and stored so each
    grammar entry can carry its own id/lbp.  The returned wrapper also
    lets `define` act as a class decorator: the decorated class replaces
    the generated one, inheriting its id and binding power.
    """
    table = self.symbol_table
    generated = type(
        symbol_class.__name__,
        (symbol_class,),
        {'id': sid, 'lbp': bp},
    )
    table[sid] = generated

    def wrapper(cls):
        cls.id = sid
        cls.lbp = generated.lbp
        table[sid] = cls
        return cls

    return wrapper
16
01
02
03
04
05
06
07
08
09
10
11
12
13
def parse(self, source):
    """Tokenize `source` and parse it into a single expression tree.

    The token stream and current token are reset afterwards — even if
    lexing or parsing raises — so the parser instance can be reused.
    """
    try:
        self.tokens = self.lex(source)
        self.advance()
        return self.expression(0)
    finally:
        # Always leave the parser in a clean, reusable state.
        self.tokens = iter(())
        self.token = None
17
01
02
03
04
05
06
07
08
# Bare punctuation (")" and ",") gets lbp 0 so it never binds as infix.
expr.define("<punct>")


@expr.define("(", 90)
class FunctionCall(Symbol):
    """Defining both function application and parenthesized expression"""

    def nud(self):
        # "(" in prefix position: a parenthesized sub-expression.
        e = self.parser.expression(0)
        self.parser.advance(")")
        return e

    def led(self, left):
        # "(" after an expression: a call. `left` is the callee; the
        # comma-separated arguments are collected into `second`.
        self.first = left
        args = self.second = []
        p = self.parser
        while p.token.value != ")":
            args.append(p.expression(0))
            if p.token.value != ",":
                break
            p.advance(",")
        p.advance(")")
        # Fixed: transcript had the slide page number fused on ("self18").
        return self
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
20
# Token definitions: (group name, regex). The alternatives are joined
# into one pattern; earlier entries win on ties.
TOKENS = (
    ('ws', r'\s+'),
    ('name', r'[a-z][\w_]*'),
    ('infix', r'[+\-*/\^]'),
    ('punct', r'[\(\),]'),
    # Fixed: non-capturing group is spelled (?:...), not (:?...).  The
    # original typo made the group match an optional literal ':' and so
    # accepted inputs like ":3.5" as numbers.
    ('number', r'(?:\d*\.)?\d+'),
)
TOKEN_RE = '|'.join("(?P<%s>%s)" % t for t in TOKENS)
LEX_RE = re.compile(TOKEN_RE, re.UNICODE | re.IGNORECASE)


class Token(object):
    """A single lexed token: its type name, source text, and position."""

    def __init__(self, token_type, value, pos):
        self.token_type = token_type
        self.value = value
        self.pos = pos
But what about lexing?
19
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
def lex(source, pat=LEX_RE):
    """Yield Token objects for `source` using pattern `pat`.

    Whitespace matches are consumed but not yielded.  Any character not
    covered by the pattern (a gap between matches, or a trailing
    remainder) raises LexerException.
    """
    i = 0

    def error():
        raise LexerException(
            "Unexpected character at position %d: `%s`" % (i, source[i])
        )

    for m in pat.finditer(source):
        pos = m.start()
        if pos > i:
            # A gap between consecutive matches means unlexable input.
            error()
        i = m.end()
        name = m.lastgroup
        if name != "ws":
            token_type = "<%s>" % name
            yield Token(token_type, m.group(0), pos)
    if i < len(source):
        # Fixed: transcript had the slide page number fused on ("error()20").
        error()
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
● https://tdop.github.io/ Vaughan R. Pratt "Top Down Operator Precedence" (1973)
● http://javascript.crockford.com/tdop/tdop.html — Douglas Crockford "Top Down Operator Precedence" (2007)
● http://effbot.org/zone/simple-top-down-parsing.htm — Fredrik Lundh "Simple Top-Down Parsing in Python" (2008)
All code in this presentation can be found at: https://github.com/percolate/pratt-parser
References
21
We are Percolate and we’re always hiring great engineers. Talk to us