diff --git a/muppet/parser_combinator.py b/muppet/parser_combinator.py index 6f132a4dd4090b1ce71c684dcce5a326e52a6abb..119108841149645355adbb27e87c04ae21e6bbc8 100644 --- a/muppet/parser_combinator.py +++ b/muppet/parser_combinator.py @@ -4,38 +4,24 @@ YAPC - Yet Another Parser Combinator. This module implements a simple, and probably rather bad parser combinator. -At its core is the ``ParserCombinator`` class, which is the driver for -the whole process. A sample usage is as follows: +There are two "core" parts in the system. The +:class:`ParserCombinator` class, which holds the the current state, +and :data:`Parser`, which are the rules declaring what to parse. .. code-block:: python + :caption: A basic example, "parsing" everything in a file filename = 'example.ext' with open(filename, 'r') as f: data = f.read() - parser = ParserCombinator(data, file=filename) + parse_context = ParserCombinator(data, file=filename) - parser.get(grammar) + parser = many(char) -Where ``grammar`` is a parser parser combinator rule. + parse_context.get(parser) -All rules stem from the compound type ``Items`` (TODO which should be -renamed). The types of valid rules are: - -- string literals, which match themselves -- ``None``, which matches nothing, and return nothing -- ParseDirective instances, see below -- Nullary functions, returing parser instances. This is purely for - lazy evaluation. - - .. _sequence: - -- Sequences of the above, interpreted as all the items after each - other. Note that if earlier elements succeed, but later fails, then - an exception will still be thrown, and the earlier parts discarded. - See also: :ref:`and <and>` - -``ParseDirective`` objects are the extension mechanism for this parser. +See :data:`Parser` for information about how parsers are constructed. Rules ===== @@ -58,123 +44,8 @@ treated as non fatal. For exmaple will be parsed by first trynig the integer parser, which SHOULD fail if the input isn't a valid integer, and then try a string literal. If neither matches, then the whole (``literal``) expression SHOULD throw. - -Built in parsers -================ - -However, usually when constructing parser you instead want to build -from the "primitives" provided by this module. These include: - -``optional(P)`` - Either matches the given parser, or matches nothing. - -``count(P, n)`` - Matches exactly *n* instances of the parser *P*. - -``discard(P)`` - Matches the parser *P*, but returns no matches. - -``char`` - Matches any single character. This is the most basic parser possible. - -``nop`` - Matches nothing. This is only useful as part of more complex parser. - -``many(P)`` - Consume tokens until the given parser doesn't match any more, this - means 0 or more. - -``many1`` - Like ``many``, but at least one. Equivalent to ``P & many(P)`` - -``complement(P)`` - Parses anything except the given parser. Mainly useful together - with ``all_``. - -``delimited(P1, P2)`` - Parses a list of *P2*:s, delimited by the parser *P1*. A good - usage could be: - - .. code-block:: python - - delimited(ws & "," & ws, integer) - - which would match a list of integers, separated by commas (and - surrounding space). - -``digit`` - Matches any one digit. - -``hexdig`` - Matches any one hexadecimal digit. - -``space`` - Matches a single ASCII whitespace character. - -``ws`` - Matches any sequence of ``space``. - -``all_(P ...)`` - Starting from the same point, run each parser in order. If all of - them succeed, return the result of the last ran parser. - -Meta commands -------------- - -Besides these, a few "meta" commands exists, these include: - -``s(any)`` - Which wraps its argument in a parser combinator object. Primarily - useful for the operators (see below). - -``name(name, P)`` - Attaches a name to the given parser. This causes the ``repr`` of - the given object to the the given name. - -``tag(tag, P)`` - Attaches the given tag to the parsed object. This is usefull to - quickly figure out what kind of object was parsed. The following - example is a good idea of how to implement an integer parser. - - .. code-block:: python - - integer = tag("integer", many1(number)) - - When combining ``tag`` and ``name``, it's recommended to have - ``name`` as the outermost, since otherwise the tag's ``repr`` - would still be visible. - - -Operators ---------- - -Finally, the base parser combinator class defines a few operators -(which mostly also are available as methods). These are: - -.. _and: -``P1 & P2`` or ``and_(P1, P2)`` - Creates a new combinator, which matches if *P1* first matches, - and *P2* matches wherever *P1* ends. - - Note that this is an all or nothing deal, in contrast to sequences - from lists, which may fail partway through. - - See also: :ref:`sequence <sequence>` - -``P1 | P2`` or ``or_(P1, P2)``) - Creates a new combinator, which matches either *P1* or *P2*, - returning the relut from the one matching. - -``P @ f`` - Matches (or fails) *P* as usuall. However, if it matches, then the - result will be passed through the procedure *f*, and its result - will become part of the parsed tree instead of the initial result - of *P*. - -``~ P`` or ``not_(P)`` - Creates a combinator which matches the complement of *P*, meaning - that this parser will be successfull if *P* fails. """ +from __future__ import annotations # import html from dataclasses import dataclass, field @@ -243,7 +114,7 @@ class ParseError(Exception): pos: Optional[int] = None src: Optional[str] = None - stk: list['Items'] = field(default_factory=list) + stk: list['Parser'] = field(default_factory=list) def __str__(self) -> str: s = '' @@ -275,9 +146,75 @@ class ParseError(Exception): class ParseDirective: """ - Common type for special parsing directives. + Abstract base class for custom parsers. + + This is the base class for all "advanced" parsing directives. This + can be thought of as the procedural escape hatch to the purely + functional parser combinators. + + Each instance of (a child of) this class is a parser in its own + right. The recommended usage is to subclass this class into + something which looks like a regural function. For example, see + :class:`and_`. This since these instances are used like functions + when constructing compound parsers, and they technically being + classes is more of an implementation detail. + + And number of operators are also available. These can be + overridden by individual children, if care is taken. For example, + :class:`and_` overrides ``__and__`` to merge adjacent `and`s into + one large `and`. + + :operator &: + + .. code-block:: python + + P3 = P1 & P2 + + Creates a new combinator, which matches if *P1* first matches, + and *P2* matches wherever *P1* ends. + + Note that this is an all or nothing deal, in contrast to sequences + from lists, which may fail partway through. + + See :class:`and_` and :ref:`sequence <sequence>` + + :operator |: + + .. code-block:: python + + P3 = P1 | P2 + + Creates a new combinator, which matches either *P1* or *P2*, + returning the relut from the one matching. - This is used for optional parsers, alternative parsers, and the like. + See :class:`or_` + + :operator @: + + .. code-block:: python + + P2 = P @ HANDLER + + Matches (or fails) *P* as usuall. However, if it matches, then the + result will be passed through the procedure *f*, and its result + will become part of the parsed tree instead of the initial result + of *P*. + + See :func:`fmap` + + :operator ~: + + .. code-block:: python + + P2 = ~ P + + Creates a combinator which matches the complement of *P*, meaning + that this parser will be successfull if *P* fails. + + See :class:`not_` + + :param handler: + Optional return value handler, see `@` above. """ handler: Optional[Callable[[list[Any]], list[Any]]] = None @@ -286,6 +223,10 @@ class ParseDirective: """ Execute this directive. + As a procedural escape hatch to parser combinators, the "raw" + parser object is directly available. See the top of file, + under "Rules" for information. + :param parser: The formatter which holds the context to use. """ @@ -305,26 +246,48 @@ class ParseDirective: return not_(self) -# TODO rename this -Items: TypeAlias = Union[ +Parser: TypeAlias = Union[ str, None, ParseDirective, - Callable[[], 'Items'], - Sequence['Items'] + Callable[[], 'Parser'], + Sequence['Parser'] ] +""" +The core parser type. + +A parser is a rule for how a string can be parsed. + +The "uninteresting" types are + +- Strings; which match their contents literally +- None; which matches nothing + + .. _sequence: + +- Sequences of other parsers; which matches each one in order, but + still consumes output if it fails partway through. + Prefer `&` chaining (see :class:`and <and_>`) +- Procedures, which are just lazy evaluation for other parsers + +Then finally there is :class:`ParseDirective`, where the real fun +happens. These allow for arbitrary rules. To construct new rules, +either extend the class (not recommended), or build compound parsers +from the fundamental building blocks (recommended). +""" @dataclass class s(ParseDirective): """ - Wraps a thingy in a parser directive. + Construct a propper ParserCombinator object from a loose Parser. - This directive exactly matches the given string, and is mostly - here to wrap strings to allow infix operators. + On it's own this is a no-op, but it allows for strings to be + "promoted" into parser combinator objects, which allows combining + operators to work on it. See :class:`ParseDirective`. """ - s: Items + s: Parser def run(self, parser: 'ParserCombinator') -> list[MatchObject]: # noqa: D102 return parser.get(self.s) @@ -337,7 +300,7 @@ class s(ParseDirective): class not_(ParseDirective): """Succeeds if the given directive fails.""" - form: Items + form: Parser def run(self, parser: 'ParserCombinator') -> list[MatchObject]: # noqa: D102 snapshot = parser.snapshot() @@ -378,7 +341,7 @@ class name(ParseDirective): """ name: str - form: 'Items' + form: 'Parser' def run(self, parser: 'ParserCombinator') -> list[MatchObject]: # noqa: D102 return parser.get(self.form) @@ -388,7 +351,7 @@ class name(ParseDirective): def optional(parser: ParseDirective) -> ParseDirective: - """Optionally parse parameter.""" + """Either match the given parser, or match nothing.""" return parser | nop @@ -396,9 +359,9 @@ def optional(parser: ParseDirective) -> ParseDirective: class and_(ParseDirective): """Parse a group in order.""" - items: list['Items'] + items: list['Parser'] - def __init__(self, *items: 'Items'): + def __init__(self, *items: 'Parser'): self.items = list(items) def run(self, parser: 'ParserCombinator') -> list[MatchObject]: # noqa: D102 @@ -436,9 +399,9 @@ class or_(ParseDirective): (Alias Variant) """ - alternatives: list['Items'] + alternatives: list['Parser'] - def __init__(self, *alternatives: Items): + def __init__(self, *alternatives: Parser): """Retrun a new OptionParse object.""" self.alternatives = list(alternatives) @@ -466,38 +429,30 @@ class or_(ParseDirective): @dataclass class count(ParseDirective): - """Run the given parser between min and max times.""" + """ + "Run" a given parser between `min` and `max` times. + + Constructs a new parser applying the given parser a number of + times. After the given parser, either a maximum count can be given + (in which case the minimum becomes 0), an explicit minimum and + maximum, or `min` and `max` can be passed as keyword arguments. + + .. code-block:: python + + count(parser, max) + count(parser, min, max) + count(parser, min=0, max=10) + """ min: int max: int - parser: Items + parser: Parser def __init__(self, - parser: Items, + parser: Parser, arg: int, *args: int, **kwargs: int): - """ - Run parser between ``min`` and ``max`` times. - - If only one numeric argument is given, then that's used as the - maximum count, but if two values are given, then the minimum count - preceeds the maximum. - - Both max and min can also be given as keyword arguments. - - :param parser: - :param min: - :param max: - - .. code-block:: python - - count(p, min, max) - - .. code-block:: python - - count(p, max) - """ min_ = 0 max_ = 1 @@ -541,13 +496,13 @@ class count(ParseDirective): return f'count({repr(self.parser)}, {self.min}, {self.max})' -def discard(parser: Items) -> ParseDirective: +def discard(parser: Parser) -> ParseDirective: """Run parser, but discard the result.""" return s(parser) @ (lambda _: []) @dataclass -class CharParser(ParseDirective): +class _CharParser(ParseDirective): """Parse a single character.""" def run(self, parser: 'ParserCombinator') -> list[MatchObject]: # noqa: D102 @@ -557,7 +512,8 @@ class CharParser(ParseDirective): return 'char' -char = CharParser() +char = _CharParser() +"""Uncodnitionally reads (parser) a single char.""" @dataclass @@ -565,14 +521,24 @@ class all_(ParseDirective): """ Run each parser in succession from the same point, returning the final result. + This can be used to implement and and xor between parsers. + .. code-block:: python + :caption: Parse any non-space character, a ⊻ b all_(~space, char) + + .. code-block:: python + :caption: Roundabout way of parsing A-F, a & b + + all_(hex_digit, uppercase) + + # Note that uppercase isn't implemented here """ - parsers: list[Items] + parsers: list[Parser] - def __init__(self, *parsers: Items) -> None: + def __init__(self, *parsers: Parser) -> None: self.parsers = list(parsers) def run(self, parser: 'ParserCombinator') -> list[MatchObject]: # noqa: D102 @@ -589,7 +555,7 @@ class all_(ParseDirective): @dataclass -class NopParser(ParseDirective): +class _NopParser(ParseDirective): """Parses nothing.""" def run(self, parser: 'ParserCombinator') -> list[MatchObject]: # noqa: D102 @@ -599,14 +565,27 @@ class NopParser(ParseDirective): return 'nop' -nop = NopParser() +nop = _NopParser() +""" +Null Parser, and always succeeds. + +This is mostyl useful for different compound parsers. Either as a +'null' element to build from, or for implementing optional parsers, +and probably much more. + +.. code-block:: python + :caption: Implementation of optional + + def optional(parser): + return parser | nop +""" # @dataclass # class many(ParseDirective): # """Many a parser as many times as possible.""" # -# parser: 'Items' +# parser: 'Parser' # # def run(self, parser: 'ParserCombinator') -> list[MatchObject]: # noqa: D102 # out = [] @@ -629,20 +608,35 @@ nop = NopParser() # probably blow up the call stack. -def many(parser: Items) -> ParseDirective: - """Match a parser as many times as possible.""" +def many(parser: Parser) -> ParseDirective: + """ + Match a parser as many times as possible. + + Consume tokens until the given parser doesn't match any more, this + means 0 or more. + """ return (s(parser) & name('<rest>', lambda: many(parser))) @ (lambda xs: [xs[0], *xs[1:]]) \ | nop def many1(parser: ParseDirective) -> ParseDirective: - """Parse between 1 any many.""" + """Like ``many``, but at least one. Equivalent to ``P & many(P)``.""" return parser & many(parser) @dataclass class complement(ParseDirective): - """Parses any character which isn't given.""" + """ + Parse any character not in the given set. + + .. code-block:: python + :caption: A simple string parser + + str_parser = s('"') & many(complement('"')) & '"' + + :param chars: + A list of characters which should not match. + """ chars: str @@ -662,10 +656,37 @@ class complement(ParseDirective): @dataclass class tag(ParseDirective): - """Tag value returned by parser.""" + """ + Tag value returned by parser. + + This returns a new parser which works exactly like the given + parser, but wraps the given parsers result in a MatchCompound + object, attaching `tag` as metadata. + + This allows the user to know what type of object was matched, for + example + + .. code-block:: python + + # Parses the word "if" + p1 = "if" + + # Also parses the word "if", but marks it as a keyword for + # handling later. + p2 = tag("keyword", "if") + + This is NOT to be confused with :class:`name`, which just changes + the debug output of objects. It's recommended to use `name` + outside `tag` to reduce clutter in debug prints. + + :param tag: + String to tag the parser's result with. + :param parser: + The actual parser. + """ tag: str - parser: 'Items' + parser: 'Parser' def run(self, parser: 'ParserCombinator') -> list[MatchObject]: # noqa: D102 result = parser.get(self.parser) @@ -675,31 +696,47 @@ class tag(ParseDirective): return f'{self.tag}({self.parser!r})' -def delimited(delim: Items, parser: Items) -> ParseDirective: +def delimited(delim: Parser, parser: Parser) -> ParseDirective: """ Read an infix delimited list of items. - If a optional trailing "comma" is wanted: - .. code-block:: python + :caption: Usage, with optional trailing comma - and_(delimited(and_(ws, ',', ws), - ITEM), - optional(ws, ',')) + (delimited(ws & ',' & ws, # The delimiter + ITEM) # The item to parse + & optional(ws, ',') # Trailing comma """ return s(parser) & many(s(delim) & s(parser)) digit = name('digit', or_(*(chr(x + ord('0')) for x in range(0, 10)))) +"""Parser for a single base 10 digit.""" hexdig = name('hexdig', digit | or_(*(chr(x + ord('A')) for x in range(0, 6))) | or_(*(chr(x + ord('a')) for x in range(0, 6)))) +""" +Parser for a single hexadecimal digit. + +Both upper and lower case are supported +""" space = s(' ') | '\t' | '\n' | '\r' +""" +Parses a single whitespace token. + +A whitespace token here is defined as space, tab, newline, or carriage return. +""" ws = name('ws', many(space)) +""" +Parses a whitespace sequence. + +A whitespace sequence is any (including 0) number of var:`space` +characters after one another. +""" def line_comment(start: str) -> ParseDirective: @@ -715,16 +752,38 @@ class ParserCombinatorSnapshot: can only be used with the parser formatter which created it. A snapshot can be restored multiple times. + + See :func:`ParserCombinator.snapshot` and + :func:`ParserCombinator.restore` for usage. """ - def __init__(self, seek: int, creator: 'ParserCombinator'): + _seek: int + _creator: 'ParserCombinator' + + @classmethod + def _construct(cls, seek: int, creator: 'ParserCombinator') -> ParserCombinatorSnapshot: + """ + Private constructor. + + Python doesn't support private constructors, but this one + should be, so we do it ourself. + """ + self = cls() self._seek = seek self._creator = creator + return self class ParserCombinator: """ - A basic parser combinator for Python. + The state container for running a parser. + + This class represents the current state of a running parser, with + each call to `get` modifying the state. + + As an aside, the only actual piece of state inside this class is + the offset into the string, meaning that a (syntax supported) + state monad would trivially allow us to be non-mutating. :param source: The string which should be parsed. @@ -734,10 +793,12 @@ class ParserCombinator: def __init__(self, source: str, file: Optional[str] = None): self.__source = source + # TODO possibly hide this, since it REALLY shouldn't be + # manually editable self.seek = 0 self.file = file - def peek(self, item: Items) -> list[MatchObject]: + def peek(self, item: Parser) -> list[MatchObject]: """Run the parser without updating the state.""" # TODO on error? snapshot = self.snapshot() @@ -767,16 +828,23 @@ class ParserCombinator: """Return remaining, unparsed, string.""" return self.__source[self.seek:] - def get(self, item: Items) -> list[Any]: + def get(self, item: Parser) -> list[Any]: """ Try parsing the next item in stream as the passed on parser. :param item: Parser to run next. + :returns: If the parsing suceeded, the list of matched objects are returned, and the interal stream is updated. + The reason for this type being `Any` is that any rule can + modify its own return type through :func:`fmap`, meaning + that we have no idea what we get. However, if no item does + that, then the return type will be a a list of + :class:`MatchCompound` objects, and strings. + :throws ParseError: If parsing failed, then a ParseError is thrown. However, if any tokens had been parsed that far then the internal @@ -873,7 +941,7 @@ class ParserCombinator: def snapshot(self) -> ParserCombinatorSnapshot: """Create a snapshot of the parsers current state.""" - return ParserCombinatorSnapshot( + return ParserCombinatorSnapshot._construct( seek=self.seek, creator=self) @@ -911,6 +979,20 @@ class ParserCombinator: T = TypeVar('T') +# The infix operation is the "primitive" version, due to the +# ParseDirective needing to own the parser due to how Python is +# structured. This is however an implementation detail. +def fmap(proc: Callable[[Any], Any], parser: ParseDirective) -> ParseDirective: + """ + Apply proc to the eventual result of parser. + + This returns a new parser, which behaves just as the given parser, + but it's result will be passed through proc, and that will be the + result of the new parser. + """ + return parser @ proc + + def const(x: T) -> T: """ Return the value directly.