pratt parser in python
TRANSCRIPT
Pratt Parser in Python
Maxim Eronin
1
An exercise in design and implementation
Use Case: Calculated Metrics
2
'2 + 3^4^5 * log2(8)'
'2', '+', '3', '^', '4', '^', '5', '*', 'log2', '(', '8', ')'
ws = '\s+'
name = '[a-z][\w_]*'
infix = '[+\-*/\^]'
punct = '[\(\),]'
number = '(\d*\.)?\d+'
3
Why write your own parser?
● It is not as big a task as it might seem
● More control over the implementation details/techniques
● Many of the existing Python parsing libraries are lacking in one or more areas
● Writing parsers is fun
4
What is a Pratt Parser and why use it?
● Parsing technique designed for parsing operator
precedence correctly● First appeared in “Top Down Operator Precedence” by
Vaughan Pratt (1973)● A variation of a recursive descent parser but
○ Efficient
○ Modular and flexible
○ Easy to implement and iterate upon
○ Beautiful
5
Why isn’t it more popular?
“One may wonder why such an "obviously" utopian approach has not been generally adopted already. I suspect the root cause of this kind of oversight is our universal preoccupation with BNF grammars and their various offspring grammars[...] together with their related automata and a large body of theorems. I am personally enamored of automata theory per se, but I am not impressed with the extent to which it has so far been successfully applied to the writing of compilers or interpreters. Nor do I see a particularly promising future in this direction. Rather, I see automata theory as holding back the development of ideas valuable to language design that are not visibly in the domain of automata theory.”
Vaughan R. Pratt “Top Down Operator Precedence”
6
Simple arithmetic expression grammar
expression ::= mul-expr ( ( '+' | '-' ) mul-expr )*
mul-expr ::= pow-expr ( ( '*' | '/' ) pow-expr )*
pow-expr ::= prefix-expr ['^' pow-expr]
prefix-expr ::= [ '-' ] primary
primary ::= '(' expr ')' | number | name [ '(' expr ( ',' expr )* ')' ]
7
Pratt parser: no grammar, only tokens
nilfix '<number>', '<name>'
infix '+', '-' 10
infix '*', '/' 20
infixr '^' 30
prefix '-' 40
infix '(' 50
8
# Grammar definition for the expression language: each `define` call
# registers a token id with its symbol class and left binding power (lbp).
# Fixed: `Prefix` was used by Minus below but missing from the import.
from expr_parser.lexer import lex
from expr_parser.parser import Parser, Symbol, Literal, Infix, InfixR, Prefix

expr = Parser(lex)

# Nilfix symbols: literals parse to themselves and bind nothing.
expr.define("<number>", 0, Literal)
expr.define("<name>", 0, Literal)

# Binary operators, higher lbp binds tighter.
expr.define("+", 50, Infix)
expr.define("*", 60, Infix)
expr.define("/", 60, Infix)
expr.define("^", 70, InfixR)  # right-associative


@expr.define("-", 50)
class Minus(Infix, Prefix):
    """This combines both Prefix' nud and Infix' led"""
    pass
9
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
10
class Symbol(object):
    """Base class for every node in the grammar / parse tree.

    Subclasses override `nud` and/or `led`; the base versions raise, so
    using a token in a position its symbol does not support is an error.
    """

    id = None   # token id this symbol class was registered under
    lbp = 0     # left binding power (0 = never binds as an infix)

    def __init__(self, parser, value=None):
        self.parser = parser
        # Concrete token text when supplied, otherwise the symbol's id.
        self.value = value if value else self.id
        # Operand slots, filled in by nud/led of concrete subclasses.
        self.first = None
        self.second = None

    def nud(self):
        """Null denotation. Prefix/Nilfix symbol"""
        raise ParserError("Symbol action undefined for `%s'" % self.value)

    def led(self, left):
        """Left denotation. Infix/Postfix symbol"""
        raise ParserError("Infix action undefined for `%s'" % self.value)
11
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
class Literal(Symbol):
    """A leaf node (number or variable/function name).

    Null denotation simply yields the node itself — no operands.
    """

    def nud(self):
        return self
class Prefix(Symbol):
    """Unary prefix operator.

    For simplicity the right binding power is the fixed constant 80:
    the operand is parsed with rbp=80 and stored in `first`.
    """

    def nud(self):
        self.first = self.parser.expression(80)
        return self
12
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
class Infix(Symbol):
    """Left-associative binary operator.

    `left` (already parsed) becomes `first`; the right operand is parsed
    with rbp equal to this operator's lbp, so equal-precedence operators
    associate to the left.
    """

    def led(self, left):
        self.first, self.second = left, self.parser.expression(self.lbp)
        return self
class InfixR(Infix):
    """Right-associative binary operator.

    Parsing the right side with rbp = lbp - 1 allows an operator of the
    same precedence to bind on the right, yielding right associativity.
    """

    def led(self, left):
        self.first, self.second = left, self.parser.expression(self.lbp - 1)
        return self
13
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
class Parser(object):
    """Main parser class. Contains both the grammar definition
    (the symbol table) and a pointer to the current token stream."""

    def __init__(self, lex=lexer.lex):
        self.lex = lex
        self.symbol_table = {}
        # "<end>" marks stream exhaustion; its lbp of 0 stops the
        # expression loop below.
        self.define("<end>")
        self.tokens = iter(())
        self.token = None

    def expression(self, rbp):
        """Parse and return one expression.

        `rbp` (right binding power) bounds what this call may consume:
        it keeps folding infix operators into `left` while the upcoming
        token binds more tightly than rbp.
        """
        tok = self.token
        self.advance()
        left = tok.nud()
        while rbp < self.token.lbp:
            tok = self.token
            self.advance()
            left = tok.led(left)
        # Fixed: transcript had the slide page number fused on ("left14").
        return left
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
def advance(self, value=None):
    """Consume the current token and fetch the next one from the stream.

    If `value` is given, first check that the *current* token matches it
    (by value or id) and raise ParserError otherwise.  The next token's
    value is looked up in the symbol table before its type, so specific
    operators win over generic token classes.  When the stream is
    exhausted the distinguished "<end>" symbol becomes the current token.
    Returns the new current token.
    """
    tok = self.token
    if value and value not in (tok.value, tok.id):
        raise ParserError(
            "Expected `%s'; got `%s' instead" % (value, tok.value))
    try:
        # Fixed: `.next()` is Python-2-only; builtin next() works in 2 and 3.
        tok = next(self.tokens)
        symbol_table = self.symbol_table
        # first look up token's value
        if tok.value in symbol_table:
            sym = symbol_table[tok.value]
        elif tok.token_type in symbol_table:
            # then token's type
            sym = symbol_table[tok.token_type]
        else:
            raise ParserError("Undefined token %s" % repr(tok))
        self.token = sym(self, tok.value)
    except StopIteration:
        self.token = self.symbol_table["<end>"](self)
    # Fixed: transcript had the slide page number fused on ("self.token15").
    return self.token
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
20
def define(self, sid, bp=0, symbol_class=Symbol):
    """Register a symbol under id `sid` with left binding power `bp`.

    A fresh subclass of `symbol_class` is generated and stored so each
    grammar entry can carry its own id/lbp.  The returned wrapper also
    lets `define` act as a class decorator: the decorated class replaces
    the generated one, inheriting its id and binding power.
    """
    table = self.symbol_table
    generated = type(
        symbol_class.__name__,
        (symbol_class,),
        {'id': sid, 'lbp': bp},
    )
    table[sid] = generated

    def wrapper(cls):
        cls.id = sid
        cls.lbp = generated.lbp
        table[sid] = cls
        return cls

    return wrapper
16
01
02
03
04
05
06
07
08
09
10
11
12
13
def parse(self, source):
    """Tokenize `source` and parse it into a single expression tree.

    The token stream and current token are reset afterwards — even if
    lexing or parsing raises — so the parser instance can be reused.
    """
    try:
        self.tokens = self.lex(source)
        self.advance()
        return self.expression(0)
    finally:
        # Always leave the parser in a clean, reusable state.
        self.tokens = iter(())
        self.token = None
17
01
02
03
04
05
06
07
08
# Bare punctuation (")" and ",") gets lbp 0 so it never binds as infix.
expr.define("<punct>")


@expr.define("(", 90)
class FunctionCall(Symbol):
    """Defining both function application and parenthesized expression"""

    def nud(self):
        # "(" in prefix position: a parenthesized sub-expression.
        e = self.parser.expression(0)
        self.parser.advance(")")
        return e

    def led(self, left):
        # "(" after an expression: a call. `left` is the callee; the
        # comma-separated arguments are collected into `second`.
        self.first = left
        args = self.second = []
        p = self.parser
        while p.token.value != ")":
            args.append(p.expression(0))
            if p.token.value != ",":
                break
            p.advance(",")
        p.advance(")")
        # Fixed: transcript had the slide page number fused on ("self18").
        return self
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
20
# Token definitions: (group name, regex). The alternatives are joined
# into one pattern; earlier entries win on ties.
TOKENS = (
    ('ws', r'\s+'),
    ('name', r'[a-z][\w_]*'),
    ('infix', r'[+\-*/\^]'),
    ('punct', r'[\(\),]'),
    # Fixed: non-capturing group is spelled (?:...), not (:?...).  The
    # original typo made the group match an optional literal ':' and so
    # accepted inputs like ":3.5" as numbers.
    ('number', r'(?:\d*\.)?\d+'),
)
TOKEN_RE = '|'.join("(?P<%s>%s)" % t for t in TOKENS)
LEX_RE = re.compile(TOKEN_RE, re.UNICODE | re.IGNORECASE)


class Token(object):
    """A single lexed token: its type name, source text, and position."""

    def __init__(self, token_type, value, pos):
        self.token_type = token_type
        self.value = value
        self.pos = pos
But what about lexing?
19
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
def lex(source, pat=LEX_RE):
    """Yield Token objects for `source` using pattern `pat`.

    Whitespace matches are consumed but not yielded.  Any character not
    covered by the pattern (a gap between matches, or a trailing
    remainder) raises LexerException.
    """
    i = 0

    def error():
        raise LexerException(
            "Unexpected character at position %d: `%s`" % (i, source[i])
        )

    for m in pat.finditer(source):
        pos = m.start()
        if pos > i:
            # A gap between consecutive matches means unlexable input.
            error()
        i = m.end()
        name = m.lastgroup
        if name != "ws":
            token_type = "<%s>" % name
            yield Token(token_type, m.group(0), pos)
    if i < len(source):
        # Fixed: transcript had the slide page number fused on ("error()20").
        error()
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
● https://tdop.github.io/ Vaughan R. Pratt "Top Down Operator Precedence" (1973)
● http://javascript.crockford.com/tdop/tdop.html — Douglas Crockford "Top Down Operator Precedence" (2007)
● http://effbot.org/zone/simple-top-down-parsing.htm — Fredrik Lundh "Simple Top-Down Parsing in Python" (2008)
All code in this presentation can be found at: https://github.com/percolate/pratt-parser
References
21
We are Percolate and we’re always hiring great engineers. Talk to us