Python Parser Combinator

Fork

Details

Discourse (0)

Algorithms

Logic

Parsing

Strings

Higher-order Functions

Functions

Control Flow

Basic Language Features

Fundamentals

Description

Seeing how there aren't any parsing libraries available in any language i use besides Haskell, i wrote up this small parser combinator library in Python based on the ReadS type from Haskell. The idea is simliar to PyParsing but the combinators proviceded are based off Parsec.

The parser type P<a> is a light wrapper around a function of type String, Int -> [(String, Int, a)], which, given an input string and the current index, returns a generator of all possible matching tuples of (input string, next index, matched item).

The combinator that matches a character can be written as:

def char(chr):
  def inner(str, idx):
    if idx >= len(str): return
    if str[idx] != chr: return
    yield (str, idx + 1, chr)
  return P(inner)

Using char, the combinator that matches a sequence of characters could be written by combining multiple parsers:

def sequence(x, y, z):
  return P.char(x).then(P.char(y)).then(P.char(z)).replace(x + y + z)
# or alternatively...
def sequence(x, y, z):
  return P.char(x) >> P.char(y) >> P.char(z) ** (x + y + z)
# or even...
def sequence(*xs):
  return P.seq(*map(P.char, xs)) ** xs
# sequence('a', 'b', 'c') should match 'abc'

Code

from functools import *
from itertools import *
import re

M = staticmethod

class P:
  # f :: String, Int -> [(String, Int, a)]
  __init__ = lambda self, f=None: setattr(self, 'f', f)
  # pass forward a parser, similar to PyParsing's Forward()
  forward = lambda self, f: [self, setattr(self, 'f', f)][0]
  
  # same as stuff in Haskell

  # pure = return
  pure = M(lambda a: P(lambda s, i: iter([(s, i, a)])))
  # bind = (>>=)
  bind = lambda self, f: P(lambda s, i: (x for xs in
    map(lambda x: f(x[2]).f(x[0], x[1]), self.f(s, i)) for x in xs))
  # fmap = (<$>)
  fmap = lambda self, f: P(lambda s, i: ((x, y, f(z)) for x, y, z in self.f(s, i)))
  # replace = ($>)
  replace = lambda self, v: P(lambda s, i: ((x, y, v) for x, y, _ in self.f(s, i)))
  # apply = (<*>)
  apply = lambda self, f: self.bind(lambda x: f.bind(lambda y: P.pure(y(x))))
  # then = (>>)
  then = lambda self, p: self.bind(lambda _: p)
  # before = (<<)
  before = lambda self, p: self.bind(lambda x: p.replace(x))
  # alter = (<|>)
  # N.B. the choices are left biased
  #      place the more likely matches on the left as an optimization
  alter = lambda self, p: P(lambda s, i: chain(self.f(s, i), p.f(s, i)))
  # many = zeroOrMore
  many = lambda self: self.some().alter(P.pure(iter([])))
  # some = oneOrMore
  some = lambda self: P.fix(lambda p: self.bind(lambda x:
    p.alter(P.pure(iter([]))).fmap(lambda ys: chain([x], ys))))
  # seq = sequence
  # N.B. this returns a generator, so do parser.fmap(list) to be able to index it
  seq = M(lambda *xs: reduce(lambda p, x: x.bind(lambda a: p.fmap(
    lambda b: chain([a], b))), list(xs)[::-1], P.pure(iter([]))))

  # non Haskell stuff

  # advance the index
  creep = lambda self, n: P(lambda s, i: ((x, y + n, z) for x, y, z in self.f(s, i)))
  # skip n or more whitespace chars 
  lex = lambda self, n=0: self.before([P.many, P.some][n](P.char(str.isspace)))
  # possibly successful parse based on a predicate
  # f :: String, Int -> Either () (Int, a)
  pred = M(lambda f:P(lambda s,i:iter([[(s,i+b[0],b[1])]if b else[]for b in[f(s,i)]][0])))
  # same as pred but must not be at eof
  pred1 = M(lambda f: P.pred(lambda s, i: f(s, i) if i<len(s) else []))
  # parse a single char
  # f :: Either Char (Char -> Bool)
  char = M(lambda f: P.pred1(lambda s,i: [1,s[i]]*(f if callable(f)else lambda c:f==c)(s[i])))
  # parser that always fails
  fail = M(lambda: P(lambda x, y: iter([])))
  # parse a string
  string = M(lambda s: P.seq(*map(lambda c: P.char(lambda x: x == c), s)).replace(s))
  # choose from a list of alternatives
  choice = M(lambda *xs: reduce(lambda p, x: p.alter(x), xs, P.fail()))
  # parse a regex
  # N.B. this only gives either 0 or 1 result, so no backtracking if you use (x|y)
  regex = M(lambda pattern, group=None, flags=0: P.pred(lambda s, i:
    [[m.end() - i, m.group(group) if group is not None else m] if m is not None else []
    for m in [re.search(r'(?<=^.{{{}}}){}'.format(i, pattern), s, flags)]][0]))
  # parse end of input
  eof = M(lambda: P.pred(lambda s, i: [0,None]*(i>=len(s))))
  # helper function to avoid having to use .forward()
  # f :: Parser a -> Any
  fix = M(lambda f: [p.forward(q.f) for p in [P(None)] for q in [f(p)]][0])

  # run the parser
  # set just=True to yield only the match result
  # N.B. this returns a generator of all possible matches,
  #   with the first usually being the longest match.
  #   use parser.then(P.eof()) if you want the full match
  parse = lambda self, s, i=0, just=False: (x[2] if just else x for x in self.f(s, i))

  __pow__,__pos__,__invert__,__or__,__rshift__,__lshift__=replace,some,many,alter,then,before

# usually i'd copy paste this condensed version if i use this library in a kata
# exec(__import__('gzip').decompress(__import__('codecs').decode(b'H4sIAN/LOV4C/41WQXObOhA+m1+xt4hUZewemfB6661vfNfwPIJIsd4IUCSc4mby37srIMU2mfZiYKX9dvfbTytr3zWgT23dd50NYBrX+R7uE0120yt/ZZ9evEq+QwGhl72pG9Ufu8ektjIE2OfJ5nAwrekPB9xhZVM9SgjKag66+LdrVY5f6Nd7Nlrv9B0upclGd/6H9I83XjmI8W3VrxTbMtm4k1fo+J1NrjKH/fweOJg8FsMEow8OMi3TFENWpl2Ld+XKBsDcYEAe2mSzaaSb14ccNBvElzLN6LktOQxiV6Y8YqGNANJ0dEdvxKCwGiH+IiwbOJxxhf2cIeL3TwK6wEdIr5yVtbpGffkQ9WUJeViDlM7Z80qacR8xt2ThwnDGqBm1hJ3ZkEaw/qjaayy3gnXIwVFjFCan/sYBg7tsqp6CYd4WW73iekVEfZSmZcuicdey/ka2V+VP4UPXKJZmMQ6bCh3lNaqK1q8d95k2w5zAeh0oLfcRaKYXqjuHOXsxoOLOYaS49krdyKr9QADwCVqU0h91ZdVwA1ls5/xjl5jYZ8QVxyKp9FK0JVaAGeJR7X1mQnDYnJEa9bw8pvcDluLV4wl7N3PDqaXDBTeSehwZQI4mW/XOgUQOKkTnYE3oGR4xkeefdyWHGxoxA4fhlinofMGPyce9NCfMp4oOdEXnuTQaKlA2KFESYRVSJYglk5Y0gN6Rd5fQlAFaLxswsQuIaR6sQg2mERooRWwj8raKslvmCWLHgzBlec80IdXSWllZxXQasaatda6Lok4ZbY1JamnsAn2hDhLBPCYnHffetE/LXALlgj1k9ws51mSM3f59JFE0BdQ0B1G78+kMY3mdqS9m9UcamA/DgDBDoG5S8mwad09RmO8gDm8G5VHST747uXjR4LCy8imQWlfagEoSoskUqiyFz3QpNFn0ZfE3tie+gQnQdj0Q5NinpqTF5nZB4E1E1xittSC8Qq6kr4/M37GvD8V/2evr69vbW/r6dpfhtkb2DONOqRNZU8qzqJKN6vRlt1b0JLacUkAlmH+KUU9jq81wpSPhsumSZc+ZHs++i6nuGUGkZTQ9R5NmbsoDpS19uJnGFL3Ycvj/FPrim0QC6KrEy5DYIeNIyvD79rsYLvQ/wXU/DgdOzxCfpn3BvxzxtfPx4cPR6NFip9di0hOnYcPj5IlC4XTH8HEmJckvI3Ry/9oIAAA=','base64')).decode('utf-8'))

Test Cases

# some examples:

# parsing a greeting
parser = P.string('hello') >> +P.char(str.isspace) >> P.char(str.isalnum).some().fmap(''.join) << P.eof()
test.assert_equals('world', next(parser.parse('hello world', just=True)))
test.assert_equals('thelegend27', next(parser.parse('hello \t thelegend27', just=True)))
test.assert_equals([], list(parser.parse('helloworld!', just=True)))

# arithmetic evaluator
from operator import *
expression = P(None)
operators = {'+':add, '-':sub, '*':mul, '/':truediv, '%':mod}
operatorset = lambda xs: P.choice(*map(lambda o: P.char(o).lex()**operators[o], xs))
# similar to chainl1 in Haskell
# Dyad ::= monad Operator monad (monad in this context can be Monad or another dyad
dyad = lambda monad, op: P.seq(monad, ~P.seq(op, monad)).fmap(list).fmap(
  lambda ys: reduce(lambda v, i: next(i)(v, next(i)), ys[1], ys[0]))
# Monad ::= Number | '(' Expression ')'
monad = ( P.regex('-?(\d+(\.\d+)?|\.\d+)', 0).fmap(float).lex()
  | P.char('(').lex() >> expression << P.char(')').lex() )
# operator precedence defined by nesting dyads and moads
expression.forward(dyad(dyad(monad, operatorset('*/%')), operatorset('+-')).f)
parser = expression << P.eof()
test.assert_equals(4.6, next(parser.parse('1.2 + 3.4', just=True)))
test.assert_equals(12, next(parser.parse('5 - 6 + 7 + 2 * 3', just=True)))
test.assert_equals(-0.5, next(parser.parse('1 + (2 * (3 / 4) - 3)', just=True)))

View Run Output

Output:

All
{{group.name}} ({{group.count}})

Kumite

Move History