Source code for textparser

# A text parser.

import collections.abc
import re
import typing
from dataclasses import dataclass
from enum import Enum, auto
from operator import itemgetter

__author__ = 'Erik Moqvist'
from .version import __version__  # noqa: F401


class _Mismatch(Enum):
    MISMATCH = auto()

MISMATCH = _Mismatch.MISMATCH
"""Returned by :func:`~textparser.Pattern.match()` on mismatch.

"""

MismatchSingleton = typing.Literal[_Mismatch.MISMATCH]

[docs] @dataclass(slots=True, frozen=True, eq=True) class Token: kind: str value: str | None offset: int
class _Tokens: def __init__(self, tokens: list[Token]): self._tokens = tokens self._pos = 0 self._max_pos = -1 self._stack: list[int] = [] def get_value(self) -> Token | str: pos = self._pos self._pos += 1 return self._tokens[pos] def peek(self) -> Token: return self._tokens[self._pos] def peek_max(self) -> Token: pos = self._pos pos = max(pos, self._max_pos) if pos >= len(self._tokens): return self._tokens[-1] else: return self._tokens[pos] def save(self) -> None: self._stack.append(self._pos) def restore(self) -> None: self._pos = self._stack.pop() def update(self) -> None: self._stack[-1] = self._pos def mark_max_restore(self) -> None: self._max_pos = max(self._max_pos, self._pos) self._pos = self._stack.pop() def mark_max_load(self) -> None: self._max_pos = max(self._max_pos, self._pos) self._pos = self._stack[-1] def drop(self) -> None: self._stack.pop() def __repr__(self) -> str: return str(self._tokens[self._pos:self._pos + 2]) MatchObject = list["MatchObject"] | dict[str, list["MatchObject"]] | tuple[str,"MatchObject"] | Token | str
[docs] class Pattern: """Base class of all patterns. """
[docs] def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: """Returns :data:`~textparser.MISMATCH` on mismatch, and anything else on match. """ raise NotImplementedError('To be implemented by subclasses.')
class _String(Pattern): """Matches a specific token kind. """ def __init__(self, kind: str) -> None: self.kind = kind def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: if self.kind == tokens.peek().kind: return tokens.get_value() else: return MISMATCH class _StringTokens(_Tokens): def get_value(self) -> Token | str: pos = self._pos self._pos += 1 return typing.cast('str', self._tokens[pos].value) def _wrap_string(item: Pattern | str) -> Pattern: if isinstance(item, str): item = _String(item) return item def _wrap_strings(items: collections.abc.Sequence[Pattern | str]) -> list[Pattern]: return [_wrap_string(item) for item in items] def _format_invalid_syntax(text: str, offset: int) -> str: return f'Invalid syntax at line {line(text, offset)}, column {column(text, offset)}: "{markup_line(text, offset)}"'
[docs] class Error(Exception): """General textparser exception. """
[docs] class TokenizeError(Error): """This exception is raised when the text cannot be converted into tokens. """ def __init__(self, text: str, offset: int) -> None: self._text = text self._offset = offset message = _format_invalid_syntax(text, offset) super().__init__(message) @property def text(self) -> str: """The input text to the tokenizer. """ return self._text @property def offset(self) -> int: """Offset into the text where the tokenizer failed. """ return self._offset
[docs] class GrammarError(Error): """This exception is raised when the tokens cannot be converted into a parse tree. """ def __init__(self, offset: int) -> None: self._offset = offset message = f'Invalid syntax at offset {offset}.' super().__init__(message) @property def offset(self) -> int: """Offset into the text where the parser failed. """ return self._offset
[docs] class ParseError(Error): """This exception is raised when the parser fails to parse the text. """ def __init__(self, text: str, offset: int): self._text = text self._offset = offset self._line = line(text, offset) self._column = column(text, offset) message = _format_invalid_syntax(text, offset) super().__init__(message) @property def text(self) -> str: """The input text to the parser. """ return self._text @property def offset(self) -> int: """Offset into the text where the parser failed. """ return self._offset @property def line(self) -> int: """Line where the parser failed. """ return self._line @property def column(self) -> int: """Column where the parser failed. """ return self._column def __reduce__(self) -> tuple[typing.Any, ...]: """Adds pickling support.""" return type(self), (self._text, self._offset), {}
[docs] class Sequence(Pattern): """Matches a sequence of patterns. Becomes a list in the parse tree. """ def __init__(self, *patterns: Pattern | str) -> None: self.patterns = _wrap_strings(patterns) def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: matched: list[MatchObject] = [] for pattern in self.patterns: mo = pattern.match(tokens) if mo is MISMATCH: return MISMATCH matched.append(mo) return matched
[docs] class Choice(Pattern): """Matches any of given ordered patterns `patterns`. The first pattern in the list has highest priority, and the last lowest. """ def __init__(self, *patterns: Pattern | str) -> None: self._patterns = _wrap_strings(patterns) def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: tokens.save() for pattern in self._patterns: tokens.mark_max_load() mo = pattern.match(tokens) if mo is not MISMATCH: tokens.drop() return mo tokens.restore() return MISMATCH
[docs] class Tag(Pattern): """Tags any matched `pattern` with name `name`. Becomes a two-tuple of `name` and match in the parse tree. """ def __init__(self, name: str, pattern: Pattern | str) -> None: self._name = name self._pattern = _wrap_string(pattern) @property def pattern(self) -> Pattern: return self._pattern def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: mo = self._pattern.match(tokens) if mo is not MISMATCH: return (self._name, mo) else: return MISMATCH
[docs] class Forward(Pattern): """Forward declaration of a pattern. .. code-block:: python >>> foo = Forward() >>> foo <<= Sequence('NUMBER') """ def __init__(self) -> None: self._pattern: Pattern | None = None @property def pattern(self) -> Pattern | None: return self._pattern def __ilshift__(self, other: Pattern | str) -> "Forward": self._pattern = _wrap_string(other) return self def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: if self._pattern is not None: return self._pattern.match(tokens) return MISMATCH
[docs] class ChoiceDict(Pattern): """Matches any of given patterns. The first token kind of all patterns must be unique, otherwise and :class:`~textparser.Error` exception is raised. This class is faster than :class:`~textparser.Choice`, and should be used if the grammar allows it. """ def __init__(self, *patterns: Pattern | str) -> None: self._patterns_map: dict[str, Pattern] = {} wrapped_patterns = _wrap_strings(patterns) for pattern in wrapped_patterns: self._check_pattern(pattern, pattern) @property def patterns_map(self) -> dict[str, Pattern]: return self._patterns_map def _check_pattern(self, inner: Pattern, outer: Pattern) -> None: if isinstance(inner, _String): self._add_pattern(inner.kind, outer) elif isinstance(inner, Sequence): self._check_pattern(inner.patterns[0], outer) elif isinstance(inner, (Tag, Forward)): if inner.pattern is None: raise Error( f'No inner pattern defined for {type(inner)}.') self._check_pattern(inner.pattern, outer) elif isinstance(inner, ChoiceDict): for pattern in inner.patterns_map.values(): self._check_pattern(pattern, outer) else: raise Error( f'Unsupported pattern type {type(inner)}.') def _add_pattern(self, kind: str, pattern: Pattern) -> None: if kind in self._patterns_map: raise Error( f"First token kind must be unique, but {kind} isn't.") self._patterns_map[kind] = pattern def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: kind = tokens.peek().kind if kind in self._patterns_map: return self._patterns_map[kind].match(tokens) else: return MISMATCH
[docs] class Repeated(Pattern): """Matches `pattern` at least `minimum` times. Any match becomes a list in the parse tree. """ def __init__(self, pattern: Pattern | str, minimum: int=0) -> None: self._pattern = _wrap_string(pattern) self._minimum = minimum def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: matched = [] tokens.save() while True: mo = self._pattern.match(tokens) if mo is MISMATCH: tokens.mark_max_restore() break matched.append(mo) tokens.update() if len(matched) >= self._minimum: return matched else: return MISMATCH
[docs] class RepeatedDict(Repeated): """Same as :class:`~textparser.Repeated`, but becomes a dictionary instead of a list in the parse tree. `key` is a function taking the match as input and returning the dictionary key. By default the first element in the match is used as key. """ def __init__(self, pattern: Pattern | str, minimum: int=0, key: typing.Callable[[MatchObject], str] | None=None) -> None: super().__init__(pattern, minimum) if key is None: key = typing.cast('typing.Callable[[MatchObject], str]', itemgetter(0)) self._key = key def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: matched: dict[str, list[MatchObject]] = {} tokens.save() while True: mo = self._pattern.match(tokens) if mo is MISMATCH: tokens.mark_max_restore() break key = self._key(mo) try: matched[key].append(mo) except KeyError: matched[key] = [mo] tokens.update() if len(matched) >= self._minimum: return matched else: return MISMATCH
[docs] class ZeroOrMore(Repeated): """Matches `pattern` zero or more times. See :class:`~textparser.Repeated` for more details. """ def __init__(self, pattern: Pattern | str) -> None: super().__init__(pattern, 0)
[docs] class ZeroOrMoreDict(RepeatedDict): """Matches `pattern` zero or more times. See :class:`~textparser.RepeatedDict` for more details. """ def __init__(self, pattern: Pattern | str, key: typing.Callable[[MatchObject], str] | None=None) -> None: super().__init__(pattern, 0, key)
[docs] class OneOrMore(Repeated): """Matches `pattern` one or more times. See :class:`~textparser.Repeated` for more details. """ def __init__(self, pattern: Pattern | str) -> None: super().__init__(pattern, 1)
[docs] class OneOrMoreDict(RepeatedDict): """Matches `pattern` one or more times. See :class:`~textparser.RepeatedDict` for more details. """ def __init__(self, pattern: Pattern | str, key: typing.Callable[[MatchObject], str] | None=None) -> None: super().__init__(pattern, 1, key)
[docs] class DelimitedList(Pattern): """Matches a delimented list of `pattern` separated by `delim`. `pattern` must be matched at least once. Any match becomes a list in the parse tree, excluding the delimiters. """ def __init__(self, pattern: Pattern | str, delim: str=',') -> None: self._pattern = _wrap_string(pattern) self._delim = _wrap_string(delim) def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: # First pattern. mo = self._pattern.match(tokens) if mo is MISMATCH: return MISMATCH matched = [mo] tokens.save() while True: # Discard the delimiter. mo = self._delim.match(tokens) if mo is MISMATCH: break # Pattern. mo = self._pattern.match(tokens) if mo is MISMATCH: break matched.append(mo) tokens.update() tokens.restore() return matched
[docs] class Optional(Pattern): """Matches `pattern` zero or one times. Becomes a list in the parse tree, empty on mismatch. """ def __init__(self, pattern: Pattern | str) -> None: self._pattern = _wrap_string(pattern) def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: tokens.save() mo = self._pattern.match(tokens) if mo is MISMATCH: tokens.mark_max_restore() return [] else: tokens.drop() return [mo]
[docs] class Any(Pattern): """Matches any token. """ def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: if tokens.peek().kind == '__EOF__': return MISMATCH else: return tokens.get_value()
[docs] class AnyUntil(Pattern): """Matches any token until given pattern is found. Becomes a list in the parse tree, not including the given pattern match. """ def __init__(self, pattern: Pattern | str) -> None: self._pattern = _wrap_string(pattern) def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: matched: list[MatchObject] = [] while True: tokens.save() mo = self._pattern.match(tokens) if mo is not MISMATCH: break tokens.restore() matched.append(tokens.get_value()) tokens.restore() return matched
[docs] class And(Pattern): """Matches `pattern`, without consuming any tokens. Any match becomes an empty list in the parse tree. """ def __init__(self, pattern: Pattern | str) -> None: self._pattern = _wrap_string(pattern) def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: tokens.save() mo = self._pattern.match(tokens) tokens.restore() if mo is MISMATCH: return MISMATCH else: return []
[docs] class Not(Pattern): """Matches if `pattern` does not match. Any match becomes an empty list in the parse tree. Just like :class:`~textparser.And`, no tokens are consumed. """ def __init__(self, pattern: Pattern | str) -> None: self._pattern = _wrap_string(pattern) def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: tokens.save() mo = self._pattern.match(tokens) tokens.restore() if mo is MISMATCH: return [] else: return MISMATCH
[docs] class NoMatch(Pattern): """Never matches anything. """ def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: return MISMATCH
[docs] class Grammar: """Creates a tree of given tokens using the grammar `grammar`. """ def __init__(self, grammar: Pattern | str) -> None: self._root: Pattern if isinstance(grammar, str): self._root = _wrap_string(grammar) else: self._root = grammar def parse(self, token_list: list[Token], token_tree: bool=False) -> MatchObject: if token_tree: tokens = _Tokens(token_list) else: tokens = _StringTokens(token_list) parsed = self._root.match(tokens) if parsed is not MISMATCH and tokens.peek_max().kind == '__EOF__': return parsed else: raise GrammarError(tokens.peek_max().offset)
[docs] def choice(*patterns: Pattern | str) -> Choice | ChoiceDict: """Returns an instance of the fastest choice class for given patterns `patterns`. It is recommended to use this function instead of instantiate :class:`~textparser.Choice` or :class:`~textparser.ChoiceDict` directly. """ try: return ChoiceDict(*patterns) except Error: return Choice(*patterns)
[docs] def markup_line(text: str, offset: int, marker: str='>>!<<') -> str: """Insert `marker` at `offset` into `text`, and return the marked line. .. code-block:: python >>> markup_line('0\\n1234\\n56', 3) 1>>!<<234 """ begin = text.rfind('\n', 0, offset) begin += 1 end = text.find('\n', offset) if end == -1: end = len(text) return text[begin:offset] + marker + text[offset:end]
def line(text: str, offset: int) -> int: return text[:offset].count('\n') + 1 def column(text: str, offset: int) -> int: line_start = text.rfind('\n', 0, offset) return offset - line_start
[docs] def tokenize_init(spec: collections.abc.Sequence[tuple[str, str] | tuple[str, str, int]]) -> tuple[list[Token], str]: """Initialize a tokenizer. Should only be called by the :func:`~textparser.Parser.tokenize` method in the parser. """ tokens = [Token('__SOF__', '__SOF__', 0)] re_token = '|'.join([ f'(?P<{token_spec[0]}>{token_spec[1]})' for token_spec in spec ]) return tokens, re_token
[docs] class Parser: """The abstract base class of all text parsers. .. code-block:: python >>> from textparser import Parser, Sequence >>> class MyParser(Parser): ... def token_specs(self): ... return [ ... ('SKIP', r'[ \\r\\n\\t]+'), ... ('WORD', r'\\w+'), ... ('EMARK', '!', r'!'), ... ('COMMA', ',', r','), ... ('MISMATCH', r'.') ... ] ... def grammar(self): ... return Sequence('WORD', ',', 'WORD', '!') """ def _unpack_token_specs(self) -> tuple[dict[str, str], list[tuple[str,str]]]: names = {} specs = [] for spec in self.token_specs(): if len(spec) == 2: specs.append(spec) else: specs.append((spec[0], spec[2])) names[spec[0]] = spec[1] return names, specs
[docs] def keywords(self) -> set[str]: """A set of keywords in the text. .. code-block:: python def keywords(self): return set(['if', 'else']) """ return set()
[docs] def token_specs(self) -> list[tuple[str, str] | tuple[str, str, str]]: """The token specifications with token name, regular expression, and optionally a user friendly name. Two token specification forms are available; ``(kind, re)`` or ``(kind, name, re)``. If the second form is used, the grammar should use `name` instead of `kind`. See :class:`~textparser.Parser` for an example usage. """ return [ ('SKIP', r'[ \r\n\t]+'), ('NUMBER', r'-?\d+(\.\d+)?([eE][+-]?\d+)?'), ('WORD', r'[A-Za-z0-9_]+'), ('ESCAPED_STRING', r'"(\\"|[^"])*?"'), ('MISMATCH', r'.') ]
[docs] def tokenize(self, text: str) -> list[Token]: """Tokenize given string `text`, and return a list of tokens. Raises :class:`~textparser.TokenizeError` on failure. This method should only be called by :func:`~textparser.Parser.parse()`, but may very well be overridden if the default implementation does not match the parser needs. """ names, specs = self._unpack_token_specs() keywords = self.keywords() tokens, re_token = tokenize_init(specs) for mo in re.finditer(re_token, text, re.DOTALL): kind = mo.lastgroup assert isinstance(kind, str) if kind == 'SKIP': pass elif kind != 'MISMATCH': value = mo.group(kind) if value in keywords: kind = value if kind in names: kind = names[kind] tokens.append(Token(kind, value, mo.start())) else: raise TokenizeError(text, mo.start()) return tokens
[docs] def grammar(self) -> Grammar: """The text grammar is used to create a parse tree out of a list of tokens. See :class:`~textparser.Parser` for an example usage. """ raise NotImplementedError('No grammar defined.')
[docs] def parse(self, text: str, token_tree: bool=False, match_sof:bool=False) -> MatchObject | MismatchSingleton: """Parse given string `text` and return the parse tree. Raises :class:`~textparser.ParseError` on failure. Returns a parse tree of tokens if `token_tree` is ``True``. .. code-block:: python >>> MyParser().parse('Hello, World!') ['Hello', ',', 'World', '!'] >>> tree = MyParser().parse('Hello, World!', token_tree=True) >>> from pprint import pprint >>> pprint(tree) [Token(kind='WORD', value='Hello', offset=0), Token(kind=',', value=',', offset=5), Token(kind='WORD', value='World', offset=7), Token(kind='!', value='!', offset=12)] """ try: tokens = self.tokenize(text) if len(tokens) == 0 or tokens[-1].kind != '__EOF__': tokens.append(Token('__EOF__', '__EOF__', len(text))) if not match_sof: if len(tokens) > 0 and tokens[0].kind == '__SOF__': del tokens[0] grammar = self.grammar() if isinstance(grammar, Grammar): return grammar.parse(tokens, token_tree) else: # used for compatibility with old user code from the # pre-type hints era... return Grammar(grammar).parse(tokens, token_tree) except (TokenizeError, GrammarError) as e: raise ParseError(text, e.offset) from e
def replace_blocks(string: str, start: str='{', end: str='}') -> str: """Replace all blocks starting with `start` and ending with `end` with spaces (not including `start` and `end`). """ chunks = [] begin = 0 depth = 0 start_length = len(start) pattern = rf'({re.escape(start)}|{re.escape(end)})' for mo in re.finditer(pattern, string): pos = mo.start() if mo.group() == start: if depth == 0: chunks.append(string[begin:pos + start_length]) begin = (pos + start_length) depth += 1 elif depth > 0: depth -= 1 if depth == 0: for chunk in string[begin:pos].split('\n'): chunks.append(' ' * len(chunk)) chunks.append('\n') chunks.pop() begin = pos chunks.append(string[begin:]) return ''.join(chunks)