# A text parser.
import re
from collections import namedtuple
from operator import itemgetter
__author__ = 'Erik Moqvist'
__version__ = '0.21.1'
class _Mismatch(object):
pass
MISMATCH = _Mismatch()
"""Returned by :func:`~textparser.Pattern.match()` on mismatch.
"""
class _String(object):
"""Matches a specific token kind.
"""
def __init__(self, kind):
self.kind = kind
def match(self, tokens):
if self.kind == tokens.peek().kind:
return tokens.get_value()
else:
return MISMATCH
class _Tokens(object):
def __init__(self, tokens):
self._tokens = tokens
self._pos = 0
self._max_pos = -1
self._stack = []
def get_value(self):
pos = self._pos
self._pos += 1
return self._tokens[pos]
def peek(self):
return self._tokens[self._pos]
def peek_max(self):
pos = self._pos
if self._max_pos > pos:
pos = self._max_pos
if pos >= len(self._tokens):
return self._tokens[-1]
else:
return self._tokens[pos]
def save(self):
self._stack.append(self._pos)
def restore(self):
self._pos = self._stack.pop()
def update(self):
self._stack[-1] = self._pos
def mark_max_restore(self):
if self._pos > self._max_pos:
self._max_pos = self._pos
self._pos = self._stack.pop()
def mark_max_load(self):
if self._pos > self._max_pos:
self._max_pos = self._pos
self._pos = self._stack[-1]
def drop(self):
self._stack.pop()
def __repr__(self):
return str(self._tokens[self._pos:self._pos + 2])
class _StringTokens(_Tokens):
    """Token stream whose :meth:`get_value` yields token values instead of
    whole tokens.

    """

    def get_value(self):
        value = self._tokens[self._pos].value
        self._pos += 1

        return value
def _wrap_string(item):
if isinstance(item, str):
item = _String(item)
return item
def _wrap_strings(items):
return [_wrap_string(item) for item in items]
def _format_invalid_syntax(text, offset):
    """Build the standard invalid-syntax message for position `offset` in
    `text`, with line, column and a marked-up copy of the failing line.

    """

    message = 'Invalid syntax at line {}, column {}: "{}"'

    return message.format(line(text, offset),
                          column(text, offset),
                          markup_line(text, offset))
class Error(Exception):
    """General textparser exception; base class of all exceptions raised
    by this module.

    """
class TokenizeError(Error):
    """Raised when the text cannot be converted into tokens.

    """

    def __init__(self, text, offset):
        self._text = text
        self._offset = offset
        super(TokenizeError, self).__init__(
            _format_invalid_syntax(text, offset))

    @property
    def text(self):
        """The input text to the tokenizer."""

        return self._text

    @property
    def offset(self):
        """Offset into the text where the tokenizer failed."""

        return self._offset
class GrammarError(Error):
    """Raised when the tokens cannot be converted into a parse tree.

    """

    def __init__(self, offset):
        self._offset = offset
        super(GrammarError, self).__init__(
            'Invalid syntax at offset {}.'.format(offset))

    @property
    def offset(self):
        """Offset into the text where the parser failed."""

        return self._offset
class ParseError(Error):
    """Raised when the parser fails to parse the text.

    """

    def __init__(self, text, offset):
        self._text = text
        self._offset = offset
        # Pre-compute the human friendly location of the failure.
        self._line = line(text, offset)
        self._column = column(text, offset)
        super(ParseError, self).__init__(
            _format_invalid_syntax(text, offset))

    @property
    def text(self):
        """The input text to the parser."""

        return self._text

    @property
    def offset(self):
        """Offset into the text where the parser failed."""

        return self._offset

    @property
    def line(self):
        """Line where the parser failed."""

        return self._line

    @property
    def column(self):
        """Column where the parser failed."""

        return self._column
Token = namedtuple('Token', ['kind', 'value', 'offset'])
class Pattern(object):
    """Base class of all patterns.

    """

    def match(self, tokens):
        """Try to match `tokens`. Returns :data:`~textparser.MISMATCH` on
        mismatch, and anything else on match.

        """

        raise NotImplementedError('To be implemented by subclasses.')
class Sequence(Pattern):
    """Matches a sequence of patterns. Becomes a list in the parse tree.

    """

    def __init__(self, *patterns):
        self.patterns = _wrap_strings(patterns)

    def match(self, tokens):
        matched = []

        for pattern in self.patterns:
            result = pattern.match(tokens)

            if result is MISMATCH:
                return MISMATCH

            matched.append(result)

        return matched
class Choice(Pattern):
    """Matches any of given ordered patterns `patterns`. The first pattern
    in the list has highest priority, and the last lowest.

    """

    def __init__(self, *patterns):
        self._patterns = _wrap_strings(patterns)

    def match(self, tokens):
        tokens.save()

        for pattern in self._patterns:
            # Rewind to the saved position before trying each alternative.
            tokens.mark_max_load()
            result = pattern.match(tokens)

            if result is not MISMATCH:
                tokens.drop()

                return result

        tokens.restore()

        return MISMATCH
class ChoiceDict(Pattern):
    """Matches any of given patterns. The first token kind of all patterns
    must be unique, otherwise an :class:`~textparser.Error` exception
    is raised.

    This class is faster than :class:`~textparser.Choice`, and should
    be used if the grammar allows it.

    """

    def __init__(self, *patterns):
        self._patterns_map = {}

        for pattern in _wrap_strings(patterns):
            self._check_pattern(pattern, pattern)

    @property
    def patterns_map(self):
        return self._patterns_map

    def _check_pattern(self, inner, outer):
        # Walk into wrapper patterns until the leading token kind is found.
        if isinstance(inner, _String):
            self._add_pattern(inner.kind, outer)
        elif isinstance(inner, Sequence):
            self._check_pattern(inner.patterns[0], outer)
        elif isinstance(inner, (Tag, Forward)):
            self._check_pattern(inner.pattern, outer)
        elif isinstance(inner, ChoiceDict):
            for pattern in inner.patterns_map.values():
                self._check_pattern(pattern, outer)
        else:
            raise Error(
                'Unsupported pattern type {}.'.format(type(inner)))

    def _add_pattern(self, kind, pattern):
        if kind in self._patterns_map:
            raise Error(
                "First token kind must be unique, but {} isn't.".format(
                    kind))

        self._patterns_map[kind] = pattern

    def match(self, tokens):
        # Dispatch directly on the kind of the next token.
        pattern = self._patterns_map.get(tokens.peek().kind)

        if pattern is None:
            return MISMATCH

        return pattern.match(tokens)
class Repeated(Pattern):
    """Matches `pattern` at least `minimum` times. Any match becomes a
    list in the parse tree.

    """

    def __init__(self, pattern, minimum=0):
        self._pattern = _wrap_string(pattern)
        self._minimum = minimum

    def match(self, tokens):
        matched = []
        tokens.save()

        while True:
            mo = self._pattern.match(tokens)

            if mo is MISMATCH:
                tokens.mark_max_restore()
                break

            matched.append(mo)
            # Commit the position after every successful repetition.
            tokens.update()

        if len(matched) >= self._minimum:
            return matched

        return MISMATCH
class RepeatedDict(Repeated):
    """Same as :class:`~textparser.Repeated`, but becomes a dictionary
    instead of a list in the parse tree.

    `key` is a function taking the match as input and returning the
    dictionary key. By default the first element in the match is used
    as key.

    """

    def __init__(self, pattern, minimum=0, key=None):
        super(RepeatedDict, self).__init__(pattern, minimum)
        self._key = itemgetter(0) if key is None else key

    def match(self, tokens):
        matched = {}
        tokens.save()

        while True:
            mo = self._pattern.match(tokens)

            if mo is MISMATCH:
                tokens.mark_max_restore()
                break

            # Group all matches with the same key into one list.
            matched.setdefault(self._key(mo), []).append(mo)
            # Commit the position after every successful repetition.
            tokens.update()

        # Note: the minimum is compared to the number of distinct keys.
        if len(matched) >= self._minimum:
            return matched

        return MISMATCH
class ZeroOrMore(Repeated):
    """Matches `pattern` zero or more times.

    See :class:`~textparser.Repeated` for more details.

    """

    def __init__(self, pattern):
        super(ZeroOrMore, self).__init__(pattern, minimum=0)
class ZeroOrMoreDict(RepeatedDict):
    """Matches `pattern` zero or more times.

    See :class:`~textparser.RepeatedDict` for more details.

    """

    def __init__(self, pattern, key=None):
        super(ZeroOrMoreDict, self).__init__(pattern, minimum=0, key=key)
class OneOrMore(Repeated):
    """Matches `pattern` one or more times.

    See :class:`~textparser.Repeated` for more details.

    """

    def __init__(self, pattern):
        super(OneOrMore, self).__init__(pattern, minimum=1)
class OneOrMoreDict(RepeatedDict):
    """Matches `pattern` one or more times.

    See :class:`~textparser.RepeatedDict` for more details.

    """

    def __init__(self, pattern, key=None):
        super(OneOrMoreDict, self).__init__(pattern, minimum=1, key=key)
class DelimitedList(Pattern):
    """Matches a delimited list of `pattern` separated by
    `delim`. `pattern` must be matched at least once. Any match
    becomes a list in the parse tree, excluding the delimiters.

    """

    def __init__(self, pattern, delim=','):
        self._pattern = _wrap_string(pattern)
        self._delim = _wrap_string(delim)

    def match(self, tokens):
        # The first element is mandatory.
        mo = self._pattern.match(tokens)

        if mo is MISMATCH:
            return MISMATCH

        matched = [mo]
        tokens.save()

        while True:
            # Discard the delimiter.
            if self._delim.match(tokens) is MISMATCH:
                break

            # The element following the delimiter.
            mo = self._pattern.match(tokens)

            if mo is MISMATCH:
                break

            matched.append(mo)
            tokens.update()

        # Undo any trailing delimiter without a following element.
        tokens.restore()

        return matched
class Optional(Pattern):
    """Matches `pattern` zero or one times. Becomes a list in the parse
    tree, empty on mismatch.

    """

    def __init__(self, pattern):
        self._pattern = _wrap_string(pattern)

    def match(self, tokens):
        tokens.save()
        mo = self._pattern.match(tokens)

        if mo is not MISMATCH:
            tokens.drop()

            return [mo]

        tokens.mark_max_restore()

        return []
class Any(Pattern):
    """Matches any single token.

    """

    def match(self, tokens):
        return tokens.get_value()
class AnyUntil(Pattern):
    """Matches any token until given pattern is found. Becomes a list in
    the parse tree, not including the given pattern match.

    """

    def __init__(self, pattern):
        self._pattern = _wrap_string(pattern)

    def match(self, tokens):
        matched = []

        while True:
            tokens.save()

            if self._pattern.match(tokens) is not MISMATCH:
                break

            tokens.restore()
            matched.append(tokens.get_value())

        # Undo the tokens consumed by the terminating pattern.
        tokens.restore()

        return matched
class And(Pattern):
    """Matches `pattern`, without consuming any tokens. Any match becomes
    an empty list in the parse tree.

    """

    def __init__(self, pattern):
        self._pattern = _wrap_string(pattern)

    def match(self, tokens):
        # Look ahead; always rewind to the starting position.
        tokens.save()
        mo = self._pattern.match(tokens)
        tokens.restore()

        if mo is not MISMATCH:
            return []

        return MISMATCH
class Not(Pattern):
    """Matches if `pattern` does not match. Any match becomes an empty
    list in the parse tree.

    Just like :class:`~textparser.And`, no tokens are consumed.

    """

    def __init__(self, pattern):
        self._pattern = _wrap_string(pattern)

    def match(self, tokens):
        # Look ahead; always rewind to the starting position.
        tokens.save()
        mo = self._pattern.match(tokens)
        tokens.restore()

        if mo is not MISMATCH:
            return MISMATCH

        return []
class NoMatch(Pattern):
    """Never matches anything.

    """

    def match(self, tokens):
        return MISMATCH
class Tag(Pattern):
    """Tags any matched `pattern` with name `name`. Becomes a two-tuple of
    `name` and match in the parse tree.

    """

    def __init__(self, name, pattern):
        self._name = name
        self._pattern = _wrap_string(pattern)

    @property
    def pattern(self):
        return self._pattern

    def match(self, tokens):
        mo = self._pattern.match(tokens)

        if mo is MISMATCH:
            return MISMATCH

        return (self._name, mo)
class Forward(Pattern):
    """Forward declaration of a pattern, assigned later with ``<<=``.

    .. code-block:: python

       >>> foo = Forward()
       >>> foo <<= Sequence('NUMBER')

    """

    def __init__(self):
        self._pattern = None

    @property
    def pattern(self):
        return self._pattern

    def __ilshift__(self, other):
        self._pattern = _wrap_string(other)

        return self

    def match(self, tokens):
        return self._pattern.match(tokens)
class Grammar(object):
    """Creates a tree of given tokens using the grammar `grammar`.

    """

    def __init__(self, grammar):
        # _wrap_string() leaves non-string patterns untouched, so no
        # explicit isinstance check is needed here.
        self._root = _wrap_string(grammar)

    def parse(self, tokens, token_tree=False):
        wrapper = _Tokens if token_tree else _StringTokens
        tokens = wrapper(tokens)
        parsed = self._root.match(tokens)

        # The whole input must be consumed, up to the end-of-file token.
        if parsed is MISMATCH or tokens.peek_max().kind != '__EOF__':
            raise GrammarError(tokens.peek_max().offset)

        return parsed
def choice(*patterns):
    """Returns an instance of the fastest choice class for given patterns
    `patterns`. It is recommended to use this function instead of
    instantiating :class:`~textparser.Choice` or
    :class:`~textparser.ChoiceDict` directly.

    """

    try:
        return ChoiceDict(*patterns)
    except Error:
        # Fall back on the slower, but more general, ordered choice.
        return Choice(*patterns)
def markup_line(text, offset, marker='>>!<<'):
    """Insert `marker` at `offset` into `text`, and return the marked
    line.

    .. code-block:: python

       >>> markup_line('0\\n1234\\n56', 3)
       '1>>!<<234'

    """

    # Start of the line containing offset (0 if there is no newline).
    begin = text.rfind('\n', 0, offset) + 1
    end = text.find('\n', offset)

    if end == -1:
        end = len(text)

    return ''.join([text[begin:offset], marker, text[offset:end]])
def line(text, offset):
    """Return the 1-based line number of `offset` in `text`."""

    return 1 + text[:offset].count('\n')
def column(text, offset):
    """Return the 1-based column number of `offset` in `text`."""

    # rfind() returns -1 when offset is on the first line, which makes
    # the subtraction below yield a 1-based column in all cases.
    last_newline = text.rfind('\n', 0, offset)

    return offset - last_newline
def tokenize_init(spec):
    """Initialize a tokenizer. Should only be called by the
    :func:`~textparser.Parser.tokenize` method in the parser.

    Returns a token list seeded with the start-of-file token, and a
    combined regular expression with one named group per token kind.

    """

    tokens = [Token('__SOF__', '__SOF__', 0)]
    re_token = '|'.join(
        '(?P<{}>{})'.format(name, regex) for name, regex in spec)

    return tokens, re_token
class Parser(object):
    """The abstract base class of all text parsers. Subclasses implement
    :meth:`token_specs` and :meth:`grammar`.

    .. code-block:: python

       >>> from textparser import Parser, Sequence
       >>> class MyParser(Parser):
       ...     def token_specs(self):
       ...         return [
       ...             ('SKIP',          r'[ \\r\\n\\t]+'),
       ...             ('WORD',          r'\\w+'),
       ...             ('EMARK',    '!', r'!'),
       ...             ('COMMA',    ',', r','),
       ...             ('MISMATCH',      r'.')
       ...         ]
       ...     def grammar(self):
       ...         return Sequence('WORD', ',', 'WORD', '!')

    """

    def _unpack_token_specs(self):
        # Split three-element specs into (kind, regex) plus a mapping
        # from kind to the user friendly name.
        names = {}
        specs = []

        for spec in self.token_specs():
            if len(spec) == 2:
                specs.append(spec)
            else:
                kind, name, regex = spec
                specs.append((kind, regex))
                names[kind] = name

        return names, specs

    def keywords(self):
        """A set of keywords in the text.

        .. code-block:: python

           def keywords(self):
               return set(['if', 'else'])

        """

        return set()

    def token_specs(self):
        """The token specifications with token name, regular expression, and
        optionally a user friendly name.

        Two token specification forms are available; ``(kind, re)`` or
        ``(kind, name, re)``. If the second form is used, the grammar
        should use `name` instead of `kind`.

        See :class:`~textparser.Parser` for an example usage.

        """

        return [
            ('SKIP',                r'[ \r\n\t]+'),
            ('NUMBER',              r'-?\d+(\.\d+)?([eE][+-]?\d+)?'),
            ('WORD',                r'[A-Za-z0-9_]+'),
            ('ESCAPED_STRING',      r'"(\\"|[^"])*?"'),
            ('MISMATCH',            r'.')
        ]

    def tokenize(self, text):
        """Tokenize given string `text`, and return a list of tokens. Raises
        :class:`~textparser.TokenizeError` on failure.

        This method should only be called by
        :func:`~textparser.Parser.parse()`, but may very well be
        overridden if the default implementation does not match the
        parser needs.

        """

        names, specs = self._unpack_token_specs()
        keywords = self.keywords()
        tokens, re_token = tokenize_init(specs)

        for mo in re.finditer(re_token, text, re.DOTALL):
            kind = mo.lastgroup

            if kind == 'SKIP':
                continue

            if kind == 'MISMATCH':
                raise TokenizeError(text, mo.start())

            value = mo.group(kind)

            # A matched keyword becomes its own token kind.
            if value in keywords:
                kind = value

            # Map the kind to its user friendly name, if any.
            if kind in names:
                kind = names[kind]

            tokens.append(Token(kind, value, mo.start()))

        return tokens

    def grammar(self):
        """The text grammar is used to create a parse tree out of a list of
        tokens.

        See :class:`~textparser.Parser` for an example usage.

        """

        raise NotImplementedError('No grammar defined.')

    def parse(self, text, token_tree=False, match_sof=False):
        """Parse given string `text` and return the parse tree. Raises
        :class:`~textparser.ParseError` on failure.

        Returns a parse tree of tokens if `token_tree` is ``True``.

        .. code-block:: python

           >>> MyParser().parse('Hello, World!')
           ['Hello', ',', 'World', '!']
           >>> tree = MyParser().parse('Hello, World!', token_tree=True)
           >>> from pprint import pprint
           >>> pprint(tree)
           [Token(kind='WORD', value='Hello', offset=0),
            Token(kind=',', value=',', offset=5),
            Token(kind='WORD', value='World', offset=7),
            Token(kind='!', value='!', offset=12)]

        """

        try:
            tokens = self.tokenize(text)

            # Ensure the grammar can always match an end-of-file token.
            if not tokens or tokens[-1].kind != '__EOF__':
                tokens.append(Token('__EOF__', '__EOF__', len(text)))

            # Drop the start-of-file token unless the grammar wants it.
            if not match_sof and tokens and tokens[0].kind == '__SOF__':
                del tokens[0]

            return Grammar(self.grammar()).parse(tokens, token_tree)
        except (TokenizeError, GrammarError) as e:
            raise ParseError(text, e.offset)