# A text parser.
import collections.abc
import re
import typing
from dataclasses import dataclass
from enum import Enum, auto
from operator import itemgetter
__author__ = 'Erik Moqvist'
from .version import __version__ # noqa: F401
class _Mismatch(Enum):
MISMATCH = auto()
MISMATCH = _Mismatch.MISMATCH
"""Returned by :func:`~textparser.Pattern.match()` on mismatch.
"""
MismatchSingleton = typing.Literal[_Mismatch.MISMATCH]
[docs]
@dataclass(slots=True, frozen=True, eq=True)
class Token:
kind: str
value: str | None
offset: int
class _Tokens:
def __init__(self, tokens: list[Token]):
self._tokens = tokens
self._pos = 0
self._max_pos = -1
self._stack: list[int] = []
def get_value(self) -> Token | str:
pos = self._pos
self._pos += 1
return self._tokens[pos]
def peek(self) -> Token:
return self._tokens[self._pos]
def peek_max(self) -> Token:
pos = self._pos
pos = max(pos, self._max_pos)
if pos >= len(self._tokens):
return self._tokens[-1]
else:
return self._tokens[pos]
def save(self) -> None:
self._stack.append(self._pos)
def restore(self) -> None:
self._pos = self._stack.pop()
def update(self) -> None:
self._stack[-1] = self._pos
def mark_max_restore(self) -> None:
self._max_pos = max(self._max_pos, self._pos)
self._pos = self._stack.pop()
def mark_max_load(self) -> None:
self._max_pos = max(self._max_pos, self._pos)
self._pos = self._stack[-1]
def drop(self) -> None:
self._stack.pop()
def __repr__(self) -> str:
return str(self._tokens[self._pos:self._pos + 2])
MatchObject = list["MatchObject"] | dict[str, list["MatchObject"]] | tuple[str,"MatchObject"] | Token | str
[docs]
class Pattern:
"""Base class of all patterns.
"""
[docs]
def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton:
"""Returns :data:`~textparser.MISMATCH` on mismatch, and anything else
on match.
"""
raise NotImplementedError('To be implemented by subclasses.')
class _String(Pattern):
"""Matches a specific token kind.
"""
def __init__(self, kind: str) -> None:
self.kind = kind
def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton:
if self.kind == tokens.peek().kind:
return tokens.get_value()
else:
return MISMATCH
class _StringTokens(_Tokens):
def get_value(self) -> Token | str:
pos = self._pos
self._pos += 1
return typing.cast('str', self._tokens[pos].value)
def _wrap_string(item: Pattern | str) -> Pattern:
if isinstance(item, str):
item = _String(item)
return item
def _wrap_strings(items: collections.abc.Sequence[Pattern | str]) -> list[Pattern]:
return [_wrap_string(item) for item in items]
def _format_invalid_syntax(text: str, offset: int) -> str:
return f'Invalid syntax at line {line(text, offset)}, column {column(text, offset)}: "{markup_line(text, offset)}"'
[docs]
class Error(Exception):
"""General textparser exception.
"""
[docs]
class TokenizeError(Error):
"""This exception is raised when the text cannot be converted into
tokens.
"""
def __init__(self, text: str, offset: int) -> None:
self._text = text
self._offset = offset
message = _format_invalid_syntax(text, offset)
super().__init__(message)
@property
def text(self) -> str:
"""The input text to the tokenizer.
"""
return self._text
@property
def offset(self) -> int:
"""Offset into the text where the tokenizer failed.
"""
return self._offset
[docs]
class GrammarError(Error):
"""This exception is raised when the tokens cannot be converted into a
parse tree.
"""
def __init__(self, offset: int) -> None:
self._offset = offset
message = f'Invalid syntax at offset {offset}.'
super().__init__(message)
@property
def offset(self) -> int:
"""Offset into the text where the parser failed.
"""
return self._offset
[docs]
class ParseError(Error):
"""This exception is raised when the parser fails to parse the text.
"""
def __init__(self, text: str, offset: int):
self._text = text
self._offset = offset
self._line = line(text, offset)
self._column = column(text, offset)
message = _format_invalid_syntax(text, offset)
super().__init__(message)
@property
def text(self) -> str:
"""The input text to the parser.
"""
return self._text
@property
def offset(self) -> int:
"""Offset into the text where the parser failed.
"""
return self._offset
@property
def line(self) -> int:
"""Line where the parser failed.
"""
return self._line
@property
def column(self) -> int:
"""Column where the parser failed.
"""
return self._column
def __reduce__(self) -> tuple[typing.Any, ...]:
"""Adds pickling support."""
return type(self), (self._text, self._offset), {}
[docs]
class Sequence(Pattern):
"""Matches a sequence of patterns. Becomes a list in the parse tree.
"""
def __init__(self, *patterns: Pattern | str) -> None:
self.patterns = _wrap_strings(patterns)
def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton:
matched: list[MatchObject] = []
for pattern in self.patterns:
mo = pattern.match(tokens)
if mo is MISMATCH:
return MISMATCH
matched.append(mo)
return matched
[docs]
class Choice(Pattern):
"""Matches any of given ordered patterns `patterns`. The first pattern
in the list has highest priority, and the last lowest.
"""
def __init__(self, *patterns: Pattern | str) -> None:
self._patterns = _wrap_strings(patterns)
def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton:
tokens.save()
for pattern in self._patterns:
tokens.mark_max_load()
mo = pattern.match(tokens)
if mo is not MISMATCH:
tokens.drop()
return mo
tokens.restore()
return MISMATCH
[docs]
class Tag(Pattern):
"""Tags any matched `pattern` with name `name`. Becomes a two-tuple of
`name` and match in the parse tree.
"""
def __init__(self, name: str, pattern: Pattern | str) -> None:
self._name = name
self._pattern = _wrap_string(pattern)
@property
def pattern(self) -> Pattern:
return self._pattern
def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton:
mo = self._pattern.match(tokens)
if mo is not MISMATCH:
return (self._name, mo)
else:
return MISMATCH
[docs]
class Forward(Pattern):
"""Forward declaration of a pattern.
.. code-block:: python
>>> foo = Forward()
>>> foo <<= Sequence('NUMBER')
"""
def __init__(self) -> None:
self._pattern: Pattern | None = None
@property
def pattern(self) -> Pattern | None:
return self._pattern
def __ilshift__(self, other: Pattern | str) -> "Forward":
self._pattern = _wrap_string(other)
return self
def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton:
if self._pattern is not None:
return self._pattern.match(tokens)
return MISMATCH
[docs]
class ChoiceDict(Pattern):
"""Matches any of given patterns. The first token kind of all patterns
must be unique, otherwise and :class:`~textparser.Error` exception
is raised.
This class is faster than :class:`~textparser.Choice`, and should
be used if the grammar allows it.
"""
def __init__(self, *patterns: Pattern | str) -> None:
self._patterns_map: dict[str, Pattern] = {}
wrapped_patterns = _wrap_strings(patterns)
for pattern in wrapped_patterns:
self._check_pattern(pattern, pattern)
@property
def patterns_map(self) -> dict[str, Pattern]:
return self._patterns_map
def _check_pattern(self, inner: Pattern, outer: Pattern) -> None:
if isinstance(inner, _String):
self._add_pattern(inner.kind, outer)
elif isinstance(inner, Sequence):
self._check_pattern(inner.patterns[0], outer)
elif isinstance(inner, (Tag, Forward)):
if inner.pattern is None:
raise Error(
f'No inner pattern defined for {type(inner)}.')
self._check_pattern(inner.pattern, outer)
elif isinstance(inner, ChoiceDict):
for pattern in inner.patterns_map.values():
self._check_pattern(pattern, outer)
else:
raise Error(
f'Unsupported pattern type {type(inner)}.')
def _add_pattern(self, kind: str, pattern: Pattern) -> None:
if kind in self._patterns_map:
raise Error(
f"First token kind must be unique, but {kind} isn't.")
self._patterns_map[kind] = pattern
def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton:
kind = tokens.peek().kind
if kind in self._patterns_map:
return self._patterns_map[kind].match(tokens)
else:
return MISMATCH
[docs]
class Repeated(Pattern):
"""Matches `pattern` at least `minimum` times. Any match becomes a
list in the parse tree.
"""
def __init__(self, pattern: Pattern | str, minimum: int=0) -> None:
self._pattern = _wrap_string(pattern)
self._minimum = minimum
def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton:
matched = []
tokens.save()
while True:
mo = self._pattern.match(tokens)
if mo is MISMATCH:
tokens.mark_max_restore()
break
matched.append(mo)
tokens.update()
if len(matched) >= self._minimum:
return matched
else:
return MISMATCH
[docs]
class RepeatedDict(Repeated):
"""Same as :class:`~textparser.Repeated`, but becomes a dictionary
instead of a list in the parse tree.
`key` is a function taking the match as input and returning the
dictionary key. By default the first element in the match is used
as key.
"""
def __init__(self, pattern: Pattern | str, minimum: int=0, key: typing.Callable[[MatchObject], str] | None=None) -> None:
super().__init__(pattern, minimum)
if key is None:
key = typing.cast('typing.Callable[[MatchObject], str]', itemgetter(0))
self._key = key
def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton:
matched: dict[str, list[MatchObject]] = {}
tokens.save()
while True:
mo = self._pattern.match(tokens)
if mo is MISMATCH:
tokens.mark_max_restore()
break
key = self._key(mo)
try:
matched[key].append(mo)
except KeyError:
matched[key] = [mo]
tokens.update()
if len(matched) >= self._minimum:
return matched
else:
return MISMATCH
[docs]
class ZeroOrMore(Repeated):
"""Matches `pattern` zero or more times.
See :class:`~textparser.Repeated` for more details.
"""
def __init__(self, pattern: Pattern | str) -> None:
super().__init__(pattern, 0)
[docs]
class ZeroOrMoreDict(RepeatedDict):
"""Matches `pattern` zero or more times.
See :class:`~textparser.RepeatedDict` for more details.
"""
def __init__(self, pattern: Pattern | str, key: typing.Callable[[MatchObject], str] | None=None) -> None:
super().__init__(pattern, 0, key)
[docs]
class OneOrMore(Repeated):
"""Matches `pattern` one or more times.
See :class:`~textparser.Repeated` for more details.
"""
def __init__(self, pattern: Pattern | str) -> None:
super().__init__(pattern, 1)
[docs]
class OneOrMoreDict(RepeatedDict):
"""Matches `pattern` one or more times.
See :class:`~textparser.RepeatedDict` for more details.
"""
def __init__(self, pattern: Pattern | str, key: typing.Callable[[MatchObject], str] | None=None) -> None:
super().__init__(pattern, 1, key)
[docs]
class DelimitedList(Pattern):
"""Matches a delimented list of `pattern` separated by
`delim`. `pattern` must be matched at least once. Any match
becomes a list in the parse tree, excluding the delimiters.
"""
def __init__(self, pattern: Pattern | str, delim: str=',') -> None:
self._pattern = _wrap_string(pattern)
self._delim = _wrap_string(delim)
def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton:
# First pattern.
mo = self._pattern.match(tokens)
if mo is MISMATCH:
return MISMATCH
matched = [mo]
tokens.save()
while True:
# Discard the delimiter.
mo = self._delim.match(tokens)
if mo is MISMATCH:
break
# Pattern.
mo = self._pattern.match(tokens)
if mo is MISMATCH:
break
matched.append(mo)
tokens.update()
tokens.restore()
return matched
[docs]
class Optional(Pattern):
"""Matches `pattern` zero or one times. Becomes a list in the parse
tree, empty on mismatch.
"""
def __init__(self, pattern: Pattern | str) -> None:
self._pattern = _wrap_string(pattern)
def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton:
tokens.save()
mo = self._pattern.match(tokens)
if mo is MISMATCH:
tokens.mark_max_restore()
return []
else:
tokens.drop()
return [mo]
[docs]
class Any(Pattern):
"""Matches any token.
"""
def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton:
if tokens.peek().kind == '__EOF__':
return MISMATCH
else:
return tokens.get_value()
[docs]
class AnyUntil(Pattern):
"""Matches any token until given pattern is found. Becomes a list in
the parse tree, not including the given pattern match.
"""
def __init__(self, pattern: Pattern | str) -> None:
self._pattern = _wrap_string(pattern)
def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton:
matched: list[MatchObject] = []
while True:
tokens.save()
mo = self._pattern.match(tokens)
if mo is not MISMATCH:
break
tokens.restore()
matched.append(tokens.get_value())
tokens.restore()
return matched
[docs]
class And(Pattern):
"""Matches `pattern`, without consuming any tokens. Any match becomes
an empty list in the parse tree.
"""
def __init__(self, pattern: Pattern | str) -> None:
self._pattern = _wrap_string(pattern)
def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton:
tokens.save()
mo = self._pattern.match(tokens)
tokens.restore()
if mo is MISMATCH:
return MISMATCH
else:
return []
[docs]
class Not(Pattern):
"""Matches if `pattern` does not match. Any match becomes an empty
list in the parse tree.
Just like :class:`~textparser.And`, no tokens are consumed.
"""
def __init__(self, pattern: Pattern | str) -> None:
self._pattern = _wrap_string(pattern)
def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton:
tokens.save()
mo = self._pattern.match(tokens)
tokens.restore()
if mo is MISMATCH:
return []
else:
return MISMATCH
[docs]
class NoMatch(Pattern):
"""Never matches anything.
"""
def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton:
return MISMATCH
[docs]
class Grammar:
"""Creates a tree of given tokens using the grammar `grammar`.
"""
def __init__(self, grammar: Pattern | str) -> None:
self._root: Pattern
if isinstance(grammar, str):
self._root = _wrap_string(grammar)
else:
self._root = grammar
def parse(self, token_list: list[Token], token_tree: bool=False) -> MatchObject:
if token_tree:
tokens = _Tokens(token_list)
else:
tokens = _StringTokens(token_list)
parsed = self._root.match(tokens)
if parsed is not MISMATCH and tokens.peek_max().kind == '__EOF__':
return parsed
else:
raise GrammarError(tokens.peek_max().offset)
[docs]
def choice(*patterns: Pattern | str) -> Choice | ChoiceDict:
"""Returns an instance of the fastest choice class for given patterns
`patterns`. It is recommended to use this function instead of
instantiate :class:`~textparser.Choice` or
:class:`~textparser.ChoiceDict` directly.
"""
try:
return ChoiceDict(*patterns)
except Error:
return Choice(*patterns)
[docs]
def markup_line(text: str, offset: int, marker: str='>>!<<') -> str:
"""Insert `marker` at `offset` into `text`, and return the marked
line.
.. code-block:: python
>>> markup_line('0\\n1234\\n56', 3)
1>>!<<234
"""
begin = text.rfind('\n', 0, offset)
begin += 1
end = text.find('\n', offset)
if end == -1:
end = len(text)
return text[begin:offset] + marker + text[offset:end]
def line(text: str, offset: int) -> int:
return text[:offset].count('\n') + 1
def column(text: str, offset: int) -> int:
line_start = text.rfind('\n', 0, offset)
return offset - line_start
[docs]
def tokenize_init(spec: collections.abc.Sequence[tuple[str, str] | tuple[str, str, int]]) -> tuple[list[Token], str]:
"""Initialize a tokenizer. Should only be called by the
:func:`~textparser.Parser.tokenize` method in the parser.
"""
tokens = [Token('__SOF__', '__SOF__', 0)]
re_token = '|'.join([
f'(?P<{token_spec[0]}>{token_spec[1]})' for token_spec in spec
])
return tokens, re_token
[docs]
class Parser:
"""The abstract base class of all text parsers.
.. code-block:: python
>>> from textparser import Parser, Sequence
>>> class MyParser(Parser):
... def token_specs(self):
... return [
... ('SKIP', r'[ \\r\\n\\t]+'),
... ('WORD', r'\\w+'),
... ('EMARK', '!', r'!'),
... ('COMMA', ',', r','),
... ('MISMATCH', r'.')
... ]
... def grammar(self):
... return Sequence('WORD', ',', 'WORD', '!')
"""
def _unpack_token_specs(self) -> tuple[dict[str, str],
list[tuple[str,str]]]:
names = {}
specs = []
for spec in self.token_specs():
if len(spec) == 2:
specs.append(spec)
else:
specs.append((spec[0], spec[2]))
names[spec[0]] = spec[1]
return names, specs
[docs]
def keywords(self) -> set[str]:
"""A set of keywords in the text.
.. code-block:: python
def keywords(self):
return set(['if', 'else'])
"""
return set()
[docs]
def token_specs(self) -> list[tuple[str, str] | tuple[str, str, str]]:
"""The token specifications with token name, regular expression, and
optionally a user friendly name.
Two token specification forms are available; ``(kind, re)`` or
``(kind, name, re)``. If the second form is used, the grammar
should use `name` instead of `kind`.
See :class:`~textparser.Parser` for an example usage.
"""
return [
('SKIP', r'[ \r\n\t]+'),
('NUMBER', r'-?\d+(\.\d+)?([eE][+-]?\d+)?'),
('WORD', r'[A-Za-z0-9_]+'),
('ESCAPED_STRING', r'"(\\"|[^"])*?"'),
('MISMATCH', r'.')
]
[docs]
def tokenize(self, text: str) -> list[Token]:
"""Tokenize given string `text`, and return a list of tokens. Raises
:class:`~textparser.TokenizeError` on failure.
This method should only be called by
:func:`~textparser.Parser.parse()`, but may very well be
overridden if the default implementation does not match the
parser needs.
"""
names, specs = self._unpack_token_specs()
keywords = self.keywords()
tokens, re_token = tokenize_init(specs)
for mo in re.finditer(re_token, text, re.DOTALL):
kind = mo.lastgroup
assert isinstance(kind, str)
if kind == 'SKIP':
pass
elif kind != 'MISMATCH':
value = mo.group(kind)
if value in keywords:
kind = value
if kind in names:
kind = names[kind]
tokens.append(Token(kind, value, mo.start()))
else:
raise TokenizeError(text, mo.start())
return tokens
[docs]
def grammar(self) -> Grammar:
"""The text grammar is used to create a parse tree out of a list of
tokens.
See :class:`~textparser.Parser` for an example usage.
"""
raise NotImplementedError('No grammar defined.')
[docs]
def parse(self, text: str, token_tree: bool=False, match_sof:bool=False) -> MatchObject | MismatchSingleton:
"""Parse given string `text` and return the parse tree. Raises
:class:`~textparser.ParseError` on failure.
Returns a parse tree of tokens if `token_tree` is ``True``.
.. code-block:: python
>>> MyParser().parse('Hello, World!')
['Hello', ',', 'World', '!']
>>> tree = MyParser().parse('Hello, World!', token_tree=True)
>>> from pprint import pprint
>>> pprint(tree)
[Token(kind='WORD', value='Hello', offset=0),
Token(kind=',', value=',', offset=5),
Token(kind='WORD', value='World', offset=7),
Token(kind='!', value='!', offset=12)]
"""
try:
tokens = self.tokenize(text)
if len(tokens) == 0 or tokens[-1].kind != '__EOF__':
tokens.append(Token('__EOF__', '__EOF__', len(text)))
if not match_sof:
if len(tokens) > 0 and tokens[0].kind == '__SOF__':
del tokens[0]
grammar = self.grammar()
if isinstance(grammar, Grammar):
return grammar.parse(tokens, token_tree)
else:
# used for compatibility with old user code from the
# pre-type hints era...
return Grammar(grammar).parse(tokens, token_tree)
except (TokenizeError, GrammarError) as e:
raise ParseError(text, e.offset) from e
def replace_blocks(string: str, start: str='{', end: str='}') -> str:
"""Replace all blocks starting with `start` and ending with `end` with
spaces (not including `start` and `end`).
"""
chunks = []
begin = 0
depth = 0
start_length = len(start)
pattern = rf'({re.escape(start)}|{re.escape(end)})'
for mo in re.finditer(pattern, string):
pos = mo.start()
if mo.group() == start:
if depth == 0:
chunks.append(string[begin:pos + start_length])
begin = (pos + start_length)
depth += 1
elif depth > 0:
depth -= 1
if depth == 0:
for chunk in string[begin:pos].split('\n'):
chunks.append(' ' * len(chunk))
chunks.append('\n')
chunks.pop()
begin = pos
chunks.append(string[begin:])
return ''.join(chunks)