Move History

Fork Selected
  • Algorithms
    Logic
    Parsing
    Strings
    Higher-order Functions
    Functions
    Control Flow
    Basic Language Features
    Fundamentals
    Description

    Seeing how there aren't any parsing libraries available in any language i use besides Haskell, i wrote up this small parser combinator library in Python based on the ReadS type from Haskell. The idea is simliar to PyParsing but the combinators proviceded are based off Parsec.

    The parser type P<a> is a light wrapper around a function of type String, Int -> [(String, Int, a)], which, given an input string and the current index, returns a generator of all possible matching tuples of (input string, next index, matched item).

    The combinator that matches a character can be written as:

    def char(chr):
      def inner(str, idx):
        if idx >= len(str): return
        if str[idx] != chr: return
        yield (str, idx + 1, chr)
      return P(inner)
    

    Using char, the combinator that matches a sequence of characters could be written by combining multiple parsers:

    def sequence(x, y, z):
      return P.char(x).then(P.char(y)).then(P.char(z)).replace(x + y + z)
    # or alternatively...
    def sequence(x, y, z):
      return P.char(x) >> P.char(y) >> P.char(z) ** (x + y + z)
    # or even...
    def sequence(*xs):
      return P.seq(*map(P.char, xs)) ** xs
    # sequence('a', 'b', 'c') should match 'abc'
    
    Code
    from functools import *
    from itertools import *
    import re
    
    M = staticmethod
    
    class P:
      # f :: String, Int -> [(String, Int, a)]
      __init__ = lambda self, f=None: setattr(self, 'f', f)
      # pass forward a parser, similar to PyParsing's Forward()
      forward = lambda self, f: [self, setattr(self, 'f', f)][0]
      
      # same as stuff in Haskell
    
      # pure = return
      pure = M(lambda a: P(lambda s, i: iter([(s, i, a)])))
      # bind = (>>=)
      bind = lambda self, f: P(lambda s, i: (x for xs in
        map(lambda x: f(x[2]).f(x[0], x[1]), self.f(s, i)) for x in xs))
      # fmap = (<$>)
      fmap = lambda self, f: P(lambda s, i: ((x, y, f(z)) for x, y, z in self.f(s, i)))
      # replace = ($>)
      replace = lambda self, v: P(lambda s, i: ((x, y, v) for x, y, _ in self.f(s, i)))
      # apply = (<*>)
      apply = lambda self, f: self.bind(lambda x: f.bind(lambda y: P.pure(y(x))))
      # then = (>>)
      then = lambda self, p: self.bind(lambda _: p)
      # before = (<<)
      before = lambda self, p: self.bind(lambda x: p.replace(x))
      # alter = (<|>)
      # N.B. the choices are left biased
      #      place the more likely matches on the left as an optimization
      alter = lambda self, p: P(lambda s, i: chain(self.f(s, i), p.f(s, i)))
      # many = zeroOrMore
      many = lambda self: self.some().alter(P.pure(iter([])))
      # some = oneOrMore
      some = lambda self: P.fix(lambda p: self.bind(lambda x:
        p.alter(P.pure(iter([]))).fmap(lambda ys: chain([x], ys))))
      # seq = sequence
      # N.B. this returns a generator, so do parser.fmap(list) to be able to index it
      seq = M(lambda *xs: reduce(lambda p, x: x.bind(lambda a: p.fmap(
        lambda b: chain([a], b))), list(xs)[::-1], P.pure(iter([]))))
    
      # non Haskell stuff
    
      # advance the index
      creep = lambda self, n: P(lambda s, i: ((x, y + n, z) for x, y, z in self.f(s, i)))
      # skip n or more whitespace chars 
      lex = lambda self, n=0: self.before([P.many, P.some][n](P.char(str.isspace)))
      # possibly successful parse based on a predicate
      # f :: String, Int -> Either () (Int, a)
      pred = M(lambda f:P(lambda s,i:iter([[(s,i+b[0],b[1])]if b else[]for b in[f(s,i)]][0])))
      # same as pred but must not be at eof
      pred1 = M(lambda f: P.pred(lambda s, i: f(s, i) if i<len(s) else []))
      # parse a single char
      # f :: Either Char (Char -> Bool)
      char = M(lambda f: P.pred1(lambda s,i: [1,s[i]]*(f if callable(f)else lambda c:f==c)(s[i])))
      # parser that always fails
      fail = M(lambda: P(lambda x, y: iter([])))
      # parse a string
      string = M(lambda s: P.seq(*map(lambda c: P.char(lambda x: x == c), s)).replace(s))
      # choose from a list of alternatives
      choice = M(lambda *xs: reduce(lambda p, x: p.alter(x), xs, P.fail()))
      # parse a regex
      # N.B. this only gives either 0 or 1 result, so no backtracking if you use (x|y)
      regex = M(lambda pattern, group=None, flags=0: P.pred(lambda s, i:
        [[m.end() - i, m.group(group) if group is not None else m] if m is not None else []
        for m in [re.search(r'(?<=^.{{{}}}){}'.format(i, pattern), s, flags)]][0]))
      # parse end of input
      eof = M(lambda: P.pred(lambda s, i: [0,None]*(i>=len(s))))
      # helper function to avoid having to use .forward()
      # f :: Parser a -> Any
      fix = M(lambda f: [p.forward(q.f) for p in [P(None)] for q in [f(p)]][0])
    
      # run the parser
      # set just=True to yield only the match result
      # N.B. this returns a generator of all possible matches,
      #   with the first usually being the longest match.
      #   use parser.then(P.eof()) if you want the full match
      parse = lambda self, s, i=0, just=False: (x[2] if just else x for x in self.f(s, i))
    
      __pow__,__pos__,__invert__,__or__,__rshift__,__lshift__=replace,some,many,alter,then,before
    
    # usually i'd copy paste this condensed version if i use this library in a kata
    # exec(__import__('gzip').decompress(__import__('codecs').decode(b'H4sIAN/LOV4C/41WQXObOhA+m1+xt4hUZewemfB6661vfNfwPIJIsd4IUCSc4mby37srIMU2mfZiYKX9dvfbTytr3zWgT23dd50NYBrX+R7uE0120yt/ZZ9evEq+QwGhl72pG9Ufu8ektjIE2OfJ5nAwrekPB9xhZVM9SgjKag66+LdrVY5f6Nd7Nlrv9B0upclGd/6H9I83XjmI8W3VrxTbMtm4k1fo+J1NrjKH/fweOJg8FsMEow8OMi3TFENWpl2Ld+XKBsDcYEAe2mSzaaSb14ccNBvElzLN6LktOQxiV6Y8YqGNANJ0dEdvxKCwGiH+IiwbOJxxhf2cIeL3TwK6wEdIr5yVtbpGffkQ9WUJeViDlM7Z80qacR8xt2ThwnDGqBm1hJ3ZkEaw/qjaayy3gnXIwVFjFCan/sYBg7tsqp6CYd4WW73iekVEfZSmZcuicdey/ka2V+VP4UPXKJZmMQ6bCh3lNaqK1q8d95k2w5zAeh0oLfcRaKYXqjuHOXsxoOLOYaS49krdyKr9QADwCVqU0h91ZdVwA1ls5/xjl5jYZ8QVxyKp9FK0JVaAGeJR7X1mQnDYnJEa9bw8pvcDluLV4wl7N3PDqaXDBTeSehwZQI4mW/XOgUQOKkTnYE3oGR4xkeefdyWHGxoxA4fhlinofMGPyce9NCfMp4oOdEXnuTQaKlA2KFESYRVSJYglk5Y0gN6Rd5fQlAFaLxswsQuIaR6sQg2mERooRWwj8raKslvmCWLHgzBlec80IdXSWllZxXQasaatda6Lok4ZbY1JamnsAn2hDhLBPCYnHffetE/LXALlgj1k9ws51mSM3f59JFE0BdQ0B1G78+kMY3mdqS9m9UcamA/DgDBDoG5S8mwad09RmO8gDm8G5VHST747uXjR4LCy8imQWlfagEoSoskUqiyFz3QpNFn0ZfE3tie+gQnQdj0Q5NinpqTF5nZB4E1E1xittSC8Qq6kr4/M37GvD8V/2evr69vbW/r6dpfhtkb2DONOqRNZU8qzqJKN6vRlt1b0JLacUkAlmH+KUU9jq81wpSPhsumSZc+ZHs++i6nuGUGkZTQ9R5NmbsoDpS19uJnGFL3Ycvj/FPrim0QC6KrEy5DYIeNIyvD79rsYLvQ/wXU/DgdOzxCfpn3BvxzxtfPx4cPR6NFip9di0hOnYcPj5IlC4XTH8HEmJckvI3Ry/9oIAAA=','base64')).decode('utf-8'))
    
    Test Cases
    # some examples:
    
    # parsing a greeting
    parser = P.string('hello') >> +P.char(str.isspace) >> P.char(str.isalnum).some().fmap(''.join) << P.eof()
    test.assert_equals('world', next(parser.parse('hello world', just=True)))
    test.assert_equals('thelegend27', next(parser.parse('hello \t thelegend27', just=True)))
    test.assert_equals([], list(parser.parse('helloworld!', just=True)))
    
    # arithmetic evaluator
    from operator import *
    expression = P(None)
    operators = {'+':add, '-':sub, '*':mul, '/':truediv, '%':mod}
    operatorset = lambda xs: P.choice(*map(lambda o: P.char(o).lex()**operators[o], xs))
    # similar to chainl1 in Haskell
    # Dyad ::= monad Operator monad (monad in this context can be Monad or another dyad
    dyad = lambda monad, op: P.seq(monad, ~P.seq(op, monad)).fmap(list).fmap(
      lambda ys: reduce(lambda v, i: next(i)(v, next(i)), ys[1], ys[0]))
    # Monad ::= Number | '(' Expression ')'
    monad = ( P.regex('-?(\d+(\.\d+)?|\.\d+)', 0).fmap(float).lex()
      | P.char('(').lex() >> expression << P.char(')').lex() )
    # operator precedence defined by nesting dyads and moads
    expression.forward(dyad(dyad(monad, operatorset('*/%')), operatorset('+-')).f)
    parser = expression << P.eof()
    test.assert_equals(4.6, next(parser.parse('1.2 + 3.4', just=True)))
    test.assert_equals(12, next(parser.parse('5 - 6 + 7 + 2 * 3', just=True)))
    test.assert_equals(-0.5, next(parser.parse('1 + (2 * (3 / 4) - 3)', just=True)))