From 0f6cfcdd12adc3f9ded3dd967b868582c549a2ce Mon Sep 17 00:00:00 2001
From: "Jeremy G. Siek"
Date: Sat, 16 Nov 2024 13:41:22 -0500
Subject: [PATCH] remove lark subdir, back to asking user to install

---
 index.md | 5 -
 lark/__init__.py | 38 -
 lark/__pyinstaller/__init__.py | 6 -
 lark/__pyinstaller/hook-lark.py | 14 -
 lark/ast_utils.py | 59 -
 lark/common.py | 86 --
 lark/exceptions.py | 292 -----
 lark/grammar.py | 130 ---
 lark/grammars/__init__.py | 0
 lark/grammars/common.lark | 59 -
 lark/grammars/lark.lark | 62 -
 lark/grammars/python.lark | 302 -----
 lark/grammars/unicode.lark | 7 -
 lark/indenter.py | 143 ---
 lark/lark.py | 660 -----------
 lark/lexer.py | 678 -----------
 lark/load_grammar.py | 1428 -----------------------
 lark/parse_tree_builder.py | 391 -------
 lark/parser_frontends.py | 257 ----
 lark/parsers/__init__.py | 0
 lark/parsers/cyk.py | 340 ------
 lark/parsers/earley.py | 312 -----
 lark/parsers/earley_common.py | 42 -
 lark/parsers/earley_forest.py | 802 -------------
 lark/parsers/grammar_analysis.py | 203 ----
 lark/parsers/lalr_analysis.py | 332 ------
 lark/parsers/lalr_interactive_parser.py | 158 ---
 lark/parsers/lalr_parser.py | 122 --
 lark/parsers/lalr_parser_state.py | 110 --
 lark/parsers/xearley.py | 166 ---
 lark/py.typed | 0
 lark/reconstruct.py | 107 --
 lark/tools/__init__.py | 70 --
 lark/tools/nearley.py | 202 ----
 lark/tools/serialize.py | 32 -
 lark/tools/standalone.py | 196 ----
 lark/tree.py | 267 -----
 lark/tree_matcher.py | 186 ---
 lark/tree_templates.py | 180 ---
 lark/utils.py | 346 ------
 lark/visitors.py | 596 ----------
 lib/List.pf | 17 +
 lib/Nat.pf | 6 +
 rec_desc_parser.py | 26 +-
 44 files changed, 43 insertions(+), 9392 deletions(-)
 delete mode 100644 lark/__init__.py
 delete mode 100644 lark/__pyinstaller/__init__.py
 delete mode 100644 lark/__pyinstaller/hook-lark.py
 delete mode 100644 lark/ast_utils.py
 delete mode 100644 lark/common.py
 delete mode 100644 lark/exceptions.py
 delete mode 100644 lark/grammar.py
 delete mode 100644 lark/grammars/__init__.py
 delete mode 100644 lark/grammars/common.lark
 delete mode 100644 lark/grammars/lark.lark
 delete mode 100644 lark/grammars/python.lark
 delete mode 100644 lark/grammars/unicode.lark
 delete mode 100644 lark/indenter.py
 delete mode 100644 lark/lark.py
 delete mode 100644 lark/lexer.py
 delete mode 100644 lark/load_grammar.py
 delete mode 100644 lark/parse_tree_builder.py
 delete mode 100644 lark/parser_frontends.py
 delete mode 100644 lark/parsers/__init__.py
 delete mode 100644 lark/parsers/cyk.py
 delete mode 100644 lark/parsers/earley.py
 delete mode 100644 lark/parsers/earley_common.py
 delete mode 100644 lark/parsers/earley_forest.py
 delete mode 100644 lark/parsers/grammar_analysis.py
 delete mode 100644 lark/parsers/lalr_analysis.py
 delete mode 100644 lark/parsers/lalr_interactive_parser.py
 delete mode 100644 lark/parsers/lalr_parser.py
 delete mode 100644 lark/parsers/lalr_parser_state.py
 delete mode 100644 lark/parsers/xearley.py
 delete mode 100644 lark/py.typed
 delete mode 100644 lark/reconstruct.py
 delete mode 100644 lark/tools/__init__.py
 delete mode 100644 lark/tools/nearley.py
 delete mode 100644 lark/tools/serialize.py
 delete mode 100644 lark/tools/standalone.py
 delete mode 100644 lark/tree.py
 delete mode 100644 lark/tree_matcher.py
 delete mode 100644 lark/tree_templates.py
 delete mode 100644 lark/utils.py
 delete mode 100644 lark/visitors.py

diff --git a/index.md b/index.md
index 72ae7cc..52074e6 100644
--- a/index.md
+++ b/index.md
@@ -77,10 +77,6 @@ Here are some
[instructions](https://wiki.python.org/moin/BeginnersGuide/Download) and links to the download for various systems. - ## Getting Started diff --git a/lark/__init__.py b/lark/__init__.py deleted file mode 100644 index d22cc2d..0000000 --- a/lark/__init__.py +++ /dev/null @@ -1,38 +0,0 @@ -from .exceptions import ( - GrammarError, - LarkError, - LexError, - ParseError, - UnexpectedCharacters, - UnexpectedEOF, - UnexpectedInput, - UnexpectedToken, -) -from .lark import Lark -from .lexer import Token -from .tree import ParseTree, Tree -from .utils import logger -from .visitors import Discard, Transformer, Transformer_NonRecursive, Visitor, v_args - -__version__: str = "1.2.2" - -__all__ = ( - "GrammarError", - "LarkError", - "LexError", - "ParseError", - "UnexpectedCharacters", - "UnexpectedEOF", - "UnexpectedInput", - "UnexpectedToken", - "Lark", - "Token", - "ParseTree", - "Tree", - "logger", - "Discard", - "Transformer", - "Transformer_NonRecursive", - "Visitor", - "v_args", -) diff --git a/lark/__pyinstaller/__init__.py b/lark/__pyinstaller/__init__.py deleted file mode 100644 index 9da62a3..0000000 --- a/lark/__pyinstaller/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# For usage of lark with PyInstaller. See https://pyinstaller-sample-hook.readthedocs.io/en/latest/index.html - -import os - -def get_hook_dirs(): - return [os.path.dirname(__file__)] diff --git a/lark/__pyinstaller/hook-lark.py b/lark/__pyinstaller/hook-lark.py deleted file mode 100644 index cf3d8e3..0000000 --- a/lark/__pyinstaller/hook-lark.py +++ /dev/null @@ -1,14 +0,0 @@ -#----------------------------------------------------------------------------- -# Copyright (c) 2017-2020, PyInstaller Development Team. -# -# Distributed under the terms of the GNU General Public License (version 2 -# or later) with exception for distributing the bootloader. -# -# The full license is in the file COPYING.txt, distributed with this software. -# -# SPDX-License-Identifier: (GPL-2.0-or-later WITH Bootloader-exception) -#----------------------------------------------------------------------------- - -from PyInstaller.utils.hooks import collect_data_files - -datas = collect_data_files('lark') diff --git a/lark/ast_utils.py b/lark/ast_utils.py deleted file mode 100644 index a5460f3..0000000 --- a/lark/ast_utils.py +++ /dev/null @@ -1,59 +0,0 @@ -""" - Module of utilities for transforming a lark.Tree into a custom Abstract Syntax Tree (AST defined in classes) -""" - -import inspect, re -import types -from typing import Optional, Callable - -from lark import Transformer, v_args - -class Ast: - """Abstract class - - Subclasses will be collected by `create_transformer()` - """ - pass - -class AsList: - """Abstract class - - Subclasses will be instantiated with the parse results as a single list, instead of as arguments. - """ - -class WithMeta: - """Abstract class - - Subclasses will be instantiated with the Meta instance of the tree. (see ``v_args`` for more detail) - """ - pass - -def camel_to_snake(name): - return re.sub(r'(? Transformer: - """Collects `Ast` subclasses from the given module, and creates a Lark transformer that builds the AST. - - For each class, we create a corresponding rule in the transformer, with a matching name. - CamelCase names will be converted into snake_case. Example: "CodeBlock" -> "code_block". - - Classes starting with an underscore (`_`) will be skipped. - - Parameters: - ast_module: A Python module containing all the subclasses of ``ast_utils.Ast`` - transformer (Optional[Transformer]): An initial transformer. 
Its attributes may be overwritten. - decorator_factory (Callable): An optional callable accepting two booleans, inline, and meta, - and returning a decorator for the methods of ``transformer``. (default: ``v_args``). - """ - t = transformer or Transformer() - - for name, obj in inspect.getmembers(ast_module): - if not name.startswith('_') and inspect.isclass(obj): - if issubclass(obj, Ast): - wrapper = decorator_factory(inline=not issubclass(obj, AsList), meta=issubclass(obj, WithMeta)) - obj = wrapper(obj).__get__(t) - setattr(t, camel_to_snake(name), obj) - - return t diff --git a/lark/common.py b/lark/common.py deleted file mode 100644 index 71b6a4c..0000000 --- a/lark/common.py +++ /dev/null @@ -1,86 +0,0 @@ -from copy import deepcopy -import sys -from types import ModuleType -from typing import Callable, Collection, Dict, Optional, TYPE_CHECKING, List - -if TYPE_CHECKING: - from .lark import PostLex - from .lexer import Lexer - from .grammar import Rule - from typing import Union, Type - from typing import Literal - if sys.version_info >= (3, 10): - from typing import TypeAlias - else: - from typing_extensions import TypeAlias - -from .utils import Serialize -from .lexer import TerminalDef, Token - -###{standalone - -_ParserArgType: 'TypeAlias' = 'Literal["earley", "lalr", "cyk", "auto"]' -_LexerArgType: 'TypeAlias' = 'Union[Literal["auto", "basic", "contextual", "dynamic", "dynamic_complete"], Type[Lexer]]' -_LexerCallback = Callable[[Token], Token] -ParserCallbacks = Dict[str, Callable] - -class LexerConf(Serialize): - __serialize_fields__ = 'terminals', 'ignore', 'g_regex_flags', 'use_bytes', 'lexer_type' - __serialize_namespace__ = TerminalDef, - - terminals: Collection[TerminalDef] - re_module: ModuleType - ignore: Collection[str] - postlex: 'Optional[PostLex]' - callbacks: Dict[str, _LexerCallback] - g_regex_flags: int - skip_validation: bool - use_bytes: bool - lexer_type: Optional[_LexerArgType] - strict: bool - - def __init__(self, terminals: Collection[TerminalDef], re_module: ModuleType, ignore: Collection[str]=(), postlex: 'Optional[PostLex]'=None, - callbacks: Optional[Dict[str, _LexerCallback]]=None, g_regex_flags: int=0, skip_validation: bool=False, use_bytes: bool=False, strict: bool=False): - self.terminals = terminals - self.terminals_by_name = {t.name: t for t in self.terminals} - assert len(self.terminals) == len(self.terminals_by_name) - self.ignore = ignore - self.postlex = postlex - self.callbacks = callbacks or {} - self.g_regex_flags = g_regex_flags - self.re_module = re_module - self.skip_validation = skip_validation - self.use_bytes = use_bytes - self.strict = strict - self.lexer_type = None - - def _deserialize(self): - self.terminals_by_name = {t.name: t for t in self.terminals} - - def __deepcopy__(self, memo=None): - return type(self)( - deepcopy(self.terminals, memo), - self.re_module, - deepcopy(self.ignore, memo), - deepcopy(self.postlex, memo), - deepcopy(self.callbacks, memo), - deepcopy(self.g_regex_flags, memo), - deepcopy(self.skip_validation, memo), - deepcopy(self.use_bytes, memo), - ) - -class ParserConf(Serialize): - __serialize_fields__ = 'rules', 'start', 'parser_type' - - rules: List['Rule'] - callbacks: ParserCallbacks - start: List[str] - parser_type: _ParserArgType - - def __init__(self, rules: List['Rule'], callbacks: ParserCallbacks, start: List[str]): - assert isinstance(start, list) - self.rules = rules - self.callbacks = callbacks - self.start = start - -###} diff --git a/lark/exceptions.py b/lark/exceptions.py deleted file mode 
100644 index e099d59..0000000 --- a/lark/exceptions.py +++ /dev/null @@ -1,292 +0,0 @@ -from .utils import logger, NO_VALUE -from typing import Mapping, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set, Optional, Collection, TYPE_CHECKING - -if TYPE_CHECKING: - from .lexer import Token - from .parsers.lalr_interactive_parser import InteractiveParser - from .tree import Tree - -###{standalone - -class LarkError(Exception): - pass - - -class ConfigurationError(LarkError, ValueError): - pass - - -def assert_config(value, options: Collection, msg='Got %r, expected one of %s'): - if value not in options: - raise ConfigurationError(msg % (value, options)) - - -class GrammarError(LarkError): - pass - - -class ParseError(LarkError): - pass - - -class LexError(LarkError): - pass - -T = TypeVar('T') - -class UnexpectedInput(LarkError): - """UnexpectedInput Error. - - Used as a base class for the following exceptions: - - - ``UnexpectedCharacters``: The lexer encountered an unexpected string - - ``UnexpectedToken``: The parser received an unexpected token - - ``UnexpectedEOF``: The parser expected a token, but the input ended - - After catching one of these exceptions, you may call the following helper methods to create a nicer error message. - """ - line: int - column: int - pos_in_stream = None - state: Any - _terminals_by_name = None - interactive_parser: 'InteractiveParser' - - def get_context(self, text: str, span: int=40) -> str: - """Returns a pretty string pinpointing the error in the text, - with span amount of context characters around it. - - Note: - The parser doesn't hold a copy of the text it has to parse, - so you have to provide it again - """ - assert self.pos_in_stream is not None, self - pos = self.pos_in_stream - start = max(pos - span, 0) - end = pos + span - if not isinstance(text, bytes): - before = text[start:pos].rsplit('\n', 1)[-1] - after = text[pos:end].split('\n', 1)[0] - return before + after + '\n' + ' ' * len(before.expandtabs()) + '^\n' - else: - before = text[start:pos].rsplit(b'\n', 1)[-1] - after = text[pos:end].split(b'\n', 1)[0] - return (before + after + b'\n' + b' ' * len(before.expandtabs()) + b'^\n').decode("ascii", "backslashreplace") - - def match_examples(self, parse_fn: 'Callable[[str], Tree]', - examples: Union[Mapping[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]], - token_type_match_fallback: bool=False, - use_accepts: bool=True - ) -> Optional[T]: - """Allows you to detect what's wrong in the input text by matching - against example errors. - - Given a parser instance and a dictionary mapping some label with - some malformed syntax examples, it'll return the label for the - example that bests matches the current error. The function will - iterate the dictionary until it finds a matching error, and - return the corresponding value. - - For an example usage, see `examples/error_reporting_lalr.py` - - Parameters: - parse_fn: parse function (usually ``lark_instance.parse``) - examples: dictionary of ``{'example_string': value}``. - use_accepts: Recommended to keep this as ``use_accepts=True``. 
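Once lark is installed from PyPI, as this patch intends, the ``match_examples`` helper documented above is typically driven like this. A minimal sketch; the toy grammar and the two error classes are invented for illustration:

    from lark import Lark, UnexpectedInput

    parser = Lark(r'''
        start: "(" ")"
        %ignore " "
    ''', parser='lalr')

    class MissingClosingParen(SyntaxError):
        pass

    class MissingOpeningParen(SyntaxError):
        pass

    text = "("
    try:
        parser.parse(text)
    except UnexpectedInput as u:
        # Replay known-bad inputs and pick the label whose failure state matches ours.
        exc_class = u.match_examples(parser.parse, {
            MissingClosingParen: ['(', '(('],
            MissingOpeningParen: [')'],
        })
        if exc_class:
            raise exc_class("unbalanced parentheses in %r" % text)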
- """ - assert self.state is not None, "Not supported for this exception" - - if isinstance(examples, Mapping): - examples = examples.items() - - candidate = (None, False) - for i, (label, example) in enumerate(examples): - assert not isinstance(example, str), "Expecting a list" - - for j, malformed in enumerate(example): - try: - parse_fn(malformed) - except UnexpectedInput as ut: - if ut.state == self.state: - if ( - use_accepts - and isinstance(self, UnexpectedToken) - and isinstance(ut, UnexpectedToken) - and ut.accepts != self.accepts - ): - logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % - (self.state, self.accepts, ut.accepts, i, j)) - continue - if ( - isinstance(self, (UnexpectedToken, UnexpectedEOF)) - and isinstance(ut, (UnexpectedToken, UnexpectedEOF)) - ): - if ut.token == self.token: # Try exact match first - logger.debug("Exact Match at example [%s][%s]" % (i, j)) - return label - - if token_type_match_fallback: - # Fallback to token types match - if (ut.token.type == self.token.type) and not candidate[-1]: - logger.debug("Token Type Fallback at example [%s][%s]" % (i, j)) - candidate = label, True - - if candidate[0] is None: - logger.debug("Same State match at example [%s][%s]" % (i, j)) - candidate = label, False - - return candidate[0] - - def _format_expected(self, expected): - if self._terminals_by_name: - d = self._terminals_by_name - expected = [d[t_name].user_repr() if t_name in d else t_name for t_name in expected] - return "Expected one of: \n\t* %s\n" % '\n\t* '.join(expected) - - -class UnexpectedEOF(ParseError, UnexpectedInput): - """An exception that is raised by the parser, when the input ends while it still expects a token. - """ - expected: 'List[Token]' - - def __init__(self, expected, state=None, terminals_by_name=None): - super(UnexpectedEOF, self).__init__() - - self.expected = expected - self.state = state - from .lexer import Token - self.token = Token("", "") # , line=-1, column=-1, pos_in_stream=-1) - self.pos_in_stream = -1 - self.line = -1 - self.column = -1 - self._terminals_by_name = terminals_by_name - - - def __str__(self): - message = "Unexpected end-of-input. " - message += self._format_expected(self.expected) - return message - - -class UnexpectedCharacters(LexError, UnexpectedInput): - """An exception that is raised by the lexer, when it cannot match the next - string of characters to any of its terminals. 
- """ - - allowed: Set[str] - considered_tokens: Set[Any] - - def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, - terminals_by_name=None, considered_rules=None): - super(UnexpectedCharacters, self).__init__() - - # TODO considered_tokens and allowed can be figured out using state - self.line = line - self.column = column - self.pos_in_stream = lex_pos - self.state = state - self._terminals_by_name = terminals_by_name - - self.allowed = allowed - self.considered_tokens = considered_tokens - self.considered_rules = considered_rules - self.token_history = token_history - - if isinstance(seq, bytes): - self.char = seq[lex_pos:lex_pos + 1].decode("ascii", "backslashreplace") - else: - self.char = seq[lex_pos] - self._context = self.get_context(seq) - - - def __str__(self): - message = "No terminal matches '%s' in the current parser context, at line %d col %d" % (self.char, self.line, self.column) - message += '\n\n' + self._context - if self.allowed: - message += self._format_expected(self.allowed) - if self.token_history: - message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in self.token_history) - return message - - -class UnexpectedToken(ParseError, UnexpectedInput): - """An exception that is raised by the parser, when the token it received - doesn't match any valid step forward. - - Parameters: - token: The mismatched token - expected: The set of expected tokens - considered_rules: Which rules were considered, to deduce the expected tokens - state: A value representing the parser state. Do not rely on its value or type. - interactive_parser: An instance of ``InteractiveParser``, that is initialized to the point of failure, - and can be used for debugging and error handling. - - Note: These parameters are available as attributes of the instance. - """ - - expected: Set[str] - considered_rules: Set[str] - - def __init__(self, token, expected, considered_rules=None, state=None, interactive_parser=None, terminals_by_name=None, token_history=None): - super(UnexpectedToken, self).__init__() - - # TODO considered_rules and expected can be figured out using state - self.line = getattr(token, 'line', '?') - self.column = getattr(token, 'column', '?') - self.pos_in_stream = getattr(token, 'start_pos', None) - self.state = state - - self.token = token - self.expected = expected # XXX deprecate? 
`accepts` is better - self._accepts = NO_VALUE - self.considered_rules = considered_rules - self.interactive_parser = interactive_parser - self._terminals_by_name = terminals_by_name - self.token_history = token_history - - - @property - def accepts(self) -> Set[str]: - if self._accepts is NO_VALUE: - self._accepts = self.interactive_parser and self.interactive_parser.accepts() - return self._accepts - - def __str__(self): - message = ("Unexpected token %r at line %s, column %s.\n%s" - % (self.token, self.line, self.column, self._format_expected(self.accepts or self.expected))) - if self.token_history: - message += "Previous tokens: %r\n" % self.token_history - - return message - - - -class VisitError(LarkError): - """VisitError is raised when visitors are interrupted by an exception - - It provides the following attributes for inspection: - - Parameters: - rule: the name of the visit rule that failed - obj: the tree-node or token that was being processed - orig_exc: the exception that cause it to fail - - Note: These parameters are available as attributes - """ - - obj: 'Union[Tree, Token]' - orig_exc: Exception - - def __init__(self, rule, obj, orig_exc): - message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) - super(VisitError, self).__init__(message) - - self.rule = rule - self.obj = obj - self.orig_exc = orig_exc - - -class MissingVariableError(LarkError): - pass - -###} diff --git a/lark/grammar.py b/lark/grammar.py deleted file mode 100644 index 1d226d9..0000000 --- a/lark/grammar.py +++ /dev/null @@ -1,130 +0,0 @@ -from typing import Optional, Tuple, ClassVar, Sequence - -from .utils import Serialize - -###{standalone -TOKEN_DEFAULT_PRIORITY = 0 - - -class Symbol(Serialize): - __slots__ = ('name',) - - name: str - is_term: ClassVar[bool] = NotImplemented - - def __init__(self, name: str) -> None: - self.name = name - - def __eq__(self, other): - assert isinstance(other, Symbol), other - return self.is_term == other.is_term and self.name == other.name - - def __ne__(self, other): - return not (self == other) - - def __hash__(self): - return hash(self.name) - - def __repr__(self): - return '%s(%r)' % (type(self).__name__, self.name) - - fullrepr = property(__repr__) - - def renamed(self, f): - return type(self)(f(self.name)) - - -class Terminal(Symbol): - __serialize_fields__ = 'name', 'filter_out' - - is_term: ClassVar[bool] = True - - def __init__(self, name, filter_out=False): - self.name = name - self.filter_out = filter_out - - @property - def fullrepr(self): - return '%s(%r, %r)' % (type(self).__name__, self.name, self.filter_out) - - def renamed(self, f): - return type(self)(f(self.name), self.filter_out) - - -class NonTerminal(Symbol): - __serialize_fields__ = 'name', - - is_term: ClassVar[bool] = False - - -class RuleOptions(Serialize): - __serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'template_source', 'empty_indices' - - keep_all_tokens: bool - expand1: bool - priority: Optional[int] - template_source: Optional[str] - empty_indices: Tuple[bool, ...] 
- - def __init__(self, keep_all_tokens: bool=False, expand1: bool=False, priority: Optional[int]=None, template_source: Optional[str]=None, empty_indices: Tuple[bool, ...]=()) -> None: - self.keep_all_tokens = keep_all_tokens - self.expand1 = expand1 - self.priority = priority - self.template_source = template_source - self.empty_indices = empty_indices - - def __repr__(self): - return 'RuleOptions(%r, %r, %r, %r)' % ( - self.keep_all_tokens, - self.expand1, - self.priority, - self.template_source - ) - - -class Rule(Serialize): - """ - origin : a symbol - expansion : a list of symbols - order : index of this expansion amongst all rules of the same name - """ - __slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash') - - __serialize_fields__ = 'origin', 'expansion', 'order', 'alias', 'options' - __serialize_namespace__ = Terminal, NonTerminal, RuleOptions - - origin: NonTerminal - expansion: Sequence[Symbol] - order: int - alias: Optional[str] - options: RuleOptions - _hash: int - - def __init__(self, origin: NonTerminal, expansion: Sequence[Symbol], - order: int=0, alias: Optional[str]=None, options: Optional[RuleOptions]=None): - self.origin = origin - self.expansion = expansion - self.alias = alias - self.order = order - self.options = options or RuleOptions() - self._hash = hash((self.origin, tuple(self.expansion))) - - def _deserialize(self): - self._hash = hash((self.origin, tuple(self.expansion))) - - def __str__(self): - return '<%s : %s>' % (self.origin.name, ' '.join(x.name for x in self.expansion)) - - def __repr__(self): - return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options) - - def __hash__(self): - return self._hash - - def __eq__(self, other): - if not isinstance(other, Rule): - return False - return self.origin == other.origin and self.expansion == other.expansion - - -###} diff --git a/lark/grammars/__init__.py b/lark/grammars/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/lark/grammars/common.lark b/lark/grammars/common.lark deleted file mode 100644 index d2e86d1..0000000 --- a/lark/grammars/common.lark +++ /dev/null @@ -1,59 +0,0 @@ -// Basic terminals for common use - - -// -// Numbers -// - -DIGIT: "0".."9" -HEXDIGIT: "a".."f"|"A".."F"|DIGIT - -INT: DIGIT+ -SIGNED_INT: ["+"|"-"] INT -DECIMAL: INT "." INT? | "." INT - -// float = /-?\d+(\.\d+)?([eE][+-]?\d+)?/ -_EXP: ("e"|"E") SIGNED_INT -FLOAT: INT _EXP | DECIMAL _EXP? -SIGNED_FLOAT: ["+"|"-"] FLOAT - -NUMBER: FLOAT | INT -SIGNED_NUMBER: ["+"|"-"] NUMBER - -// -// Strings -// -_STRING_INNER: /.*?/ -_STRING_ESC_INNER: _STRING_INNER /(? ignore - | "%import" import_path ["->" name] -> import - | "%import" import_path name_list -> multi_import - | "%override" rule -> override_rule - | "%declare" name+ -> declare - -!import_path: "."? name ("." name)* -name_list: "(" name ("," name)* ")" - -?expansions: alias (_VBAR alias)* - -?alias: expansion ["->" RULE] - -?expansion: expr* - -?expr: atom [OP | "~" NUMBER [".." NUMBER]] - -?atom: "(" expansions ")" - | "[" expansions "]" -> maybe - | value - -?value: STRING ".." STRING -> literal_range - | name - | (REGEXP | STRING) -> literal - | name "{" value ("," value)* "}" -> template_usage - -name: RULE - | TOKEN - -_VBAR: _NL? "|" -OP: /[+*]|[?](?![a-z])/ -RULE: /!?[_?]?[a-z][_a-z0-9]*/ -TOKEN: /_?[A-Z][_A-Z0-9]*/ -STRING: _STRING "i"? 
-REGEXP: /\/(?!\/)(\\\/|\\\\|[^\/])*?\/[imslux]*/ -_NL: /(\r?\n)+\s*/ - -%import common.ESCAPED_STRING -> _STRING -%import common.SIGNED_INT -> NUMBER -%import common.WS_INLINE - -COMMENT: /\s*/ "//" /[^\n]/* | /\s*/ "#" /[^\n]/* - -%ignore WS_INLINE -%ignore COMMENT diff --git a/lark/grammars/python.lark b/lark/grammars/python.lark deleted file mode 100644 index 8a75966..0000000 --- a/lark/grammars/python.lark +++ /dev/null @@ -1,302 +0,0 @@ -// Python 3 grammar for Lark - -// This grammar should parse all python 3.x code successfully. - -// Adapted from: https://docs.python.org/3/reference/grammar.html - -// Start symbols for the grammar: -// single_input is a single interactive statement; -// file_input is a module or sequence of commands read from an input file; -// eval_input is the input for the eval() functions. -// NB: compound_stmt in single_input is followed by extra NEWLINE! -// - -single_input: _NEWLINE | simple_stmt | compound_stmt _NEWLINE -file_input: (_NEWLINE | stmt)* -eval_input: testlist _NEWLINE* - -decorator: "@" dotted_name [ "(" [arguments] ")" ] _NEWLINE -decorators: decorator+ -decorated: decorators (classdef | funcdef | async_funcdef) - -async_funcdef: "async" funcdef -funcdef: "def" name "(" [parameters] ")" ["->" test] ":" suite - -parameters: paramvalue ("," paramvalue)* ["," SLASH ("," paramvalue)*] ["," [starparams | kwparams]] - | starparams - | kwparams - -SLASH: "/" // Otherwise the it will completely disappear and it will be undisguisable in the result -starparams: (starparam | starguard) poststarparams -starparam: "*" typedparam -starguard: "*" -poststarparams: ("," paramvalue)* ["," kwparams] -kwparams: "**" typedparam ","? - -?paramvalue: typedparam ("=" test)? -?typedparam: name (":" test)? - - -lambdef: "lambda" [lambda_params] ":" test -lambdef_nocond: "lambda" [lambda_params] ":" test_nocond -lambda_params: lambda_paramvalue ("," lambda_paramvalue)* ["," [lambda_starparams | lambda_kwparams]] - | lambda_starparams - | lambda_kwparams -?lambda_paramvalue: name ("=" test)? -lambda_starparams: "*" [name] ("," lambda_paramvalue)* ["," [lambda_kwparams]] -lambda_kwparams: "**" name ","? - - -?stmt: simple_stmt | compound_stmt -?simple_stmt: small_stmt (";" small_stmt)* [";"] _NEWLINE -?small_stmt: (expr_stmt | assign_stmt | del_stmt | pass_stmt | flow_stmt | import_stmt | global_stmt | nonlocal_stmt | assert_stmt) -expr_stmt: testlist_star_expr -assign_stmt: annassign | augassign | assign - -annassign: testlist_star_expr ":" test ["=" test] -assign: testlist_star_expr ("=" (yield_expr|testlist_star_expr))+ -augassign: testlist_star_expr augassign_op (yield_expr|testlist) -!augassign_op: "+=" | "-=" | "*=" | "@=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//=" -?testlist_star_expr: test_or_star_expr - | test_or_star_expr ("," test_or_star_expr)+ ","? -> tuple - | test_or_star_expr "," -> tuple - -// For normal and annotated assignments, additional restrictions enforced by the interpreter -del_stmt: "del" exprlist -pass_stmt: "pass" -?flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt -break_stmt: "break" -continue_stmt: "continue" -return_stmt: "return" [testlist] -yield_stmt: yield_expr -raise_stmt: "raise" [test ["from" test]] -import_stmt: import_name | import_from -import_name: "import" dotted_as_names -// note below: the ("." | "...") is necessary because "..." is tokenized as ELLIPSIS -import_from: "from" (dots? 
dotted_name | dots) "import" ("*" | "(" import_as_names ")" | import_as_names) -!dots: "."+ -import_as_name: name ["as" name] -dotted_as_name: dotted_name ["as" name] -import_as_names: import_as_name ("," import_as_name)* [","] -dotted_as_names: dotted_as_name ("," dotted_as_name)* -dotted_name: name ("." name)* -global_stmt: "global" name ("," name)* -nonlocal_stmt: "nonlocal" name ("," name)* -assert_stmt: "assert" test ["," test] - -?compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | match_stmt - | with_stmt | funcdef | classdef | decorated | async_stmt -async_stmt: "async" (funcdef | with_stmt | for_stmt) -if_stmt: "if" test ":" suite elifs ["else" ":" suite] -elifs: elif_* -elif_: "elif" test ":" suite -while_stmt: "while" test ":" suite ["else" ":" suite] -for_stmt: "for" exprlist "in" testlist ":" suite ["else" ":" suite] -try_stmt: "try" ":" suite except_clauses ["else" ":" suite] [finally] - | "try" ":" suite finally -> try_finally -finally: "finally" ":" suite -except_clauses: except_clause+ -except_clause: "except" [test ["as" name]] ":" suite -// NB compile.c makes sure that the default except clause is last - - -with_stmt: "with" with_items ":" suite -with_items: with_item ("," with_item)* -with_item: test ["as" name] - -match_stmt: "match" test ":" _NEWLINE _INDENT case+ _DEDENT - -case: "case" pattern ["if" test] ":" suite - -?pattern: sequence_item_pattern "," _sequence_pattern -> sequence_pattern - | as_pattern -?as_pattern: or_pattern ("as" NAME)? -?or_pattern: closed_pattern ("|" closed_pattern)* -?closed_pattern: literal_pattern - | NAME -> capture_pattern - | "_" -> any_pattern - | attr_pattern - | "(" as_pattern ")" - | "[" _sequence_pattern "]" -> sequence_pattern - | "(" (sequence_item_pattern "," _sequence_pattern)? ")" -> sequence_pattern - | "{" (mapping_item_pattern ("," mapping_item_pattern)* ","?)?"}" -> mapping_pattern - | "{" (mapping_item_pattern ("," mapping_item_pattern)* ",")? "**" NAME ","? "}" -> mapping_star_pattern - | class_pattern - -literal_pattern: inner_literal_pattern - -?inner_literal_pattern: "None" -> const_none - | "True" -> const_true - | "False" -> const_false - | STRING -> string - | number - -attr_pattern: NAME ("." NAME)+ -> value - -name_or_attr_pattern: NAME ("." NAME)* -> value - -mapping_item_pattern: (literal_pattern|attr_pattern) ":" as_pattern - -_sequence_pattern: (sequence_item_pattern ("," sequence_item_pattern)* ","?)? -?sequence_item_pattern: as_pattern - | "*" NAME -> star_pattern - -class_pattern: name_or_attr_pattern "(" [arguments_pattern ","?] ")" -arguments_pattern: pos_arg_pattern ["," keyws_arg_pattern] - | keyws_arg_pattern -> no_pos_arguments - -pos_arg_pattern: as_pattern ("," as_pattern)* -keyws_arg_pattern: keyw_arg_pattern ("," keyw_arg_pattern)* -keyw_arg_pattern: NAME "=" as_pattern - - - -suite: simple_stmt | _NEWLINE _INDENT stmt+ _DEDENT - -?test: or_test ("if" or_test "else" test)? 
- | lambdef - | assign_expr - -assign_expr: name ":=" test - -?test_nocond: or_test | lambdef_nocond - -?or_test: and_test ("or" and_test)* -?and_test: not_test_ ("and" not_test_)* -?not_test_: "not" not_test_ -> not_test - | comparison -?comparison: expr (comp_op expr)* -star_expr: "*" expr - -?expr: or_expr -?or_expr: xor_expr ("|" xor_expr)* -?xor_expr: and_expr ("^" and_expr)* -?and_expr: shift_expr ("&" shift_expr)* -?shift_expr: arith_expr (_shift_op arith_expr)* -?arith_expr: term (_add_op term)* -?term: factor (_mul_op factor)* -?factor: _unary_op factor | power - -!_unary_op: "+"|"-"|"~" -!_add_op: "+"|"-" -!_shift_op: "<<"|">>" -!_mul_op: "*"|"@"|"/"|"%"|"//" -// <> isn't actually a valid comparison operator in Python. It's here for the -// sake of a __future__ import described in PEP 401 (which really works :-) -!comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not" - -?power: await_expr ("**" factor)? -?await_expr: AWAIT? atom_expr -AWAIT: "await" - -?atom_expr: atom_expr "(" [arguments] ")" -> funccall - | atom_expr "[" subscriptlist "]" -> getitem - | atom_expr "." name -> getattr - | atom - -?atom: "(" yield_expr ")" - | "(" _tuple_inner? ")" -> tuple - | "(" comprehension{test_or_star_expr} ")" -> tuple_comprehension - | "[" _exprlist? "]" -> list - | "[" comprehension{test_or_star_expr} "]" -> list_comprehension - | "{" _dict_exprlist? "}" -> dict - | "{" comprehension{key_value} "}" -> dict_comprehension - | "{" _exprlist "}" -> set - | "{" comprehension{test} "}" -> set_comprehension - | name -> var - | number - | string_concat - | "(" test ")" - | "..." -> ellipsis - | "None" -> const_none - | "True" -> const_true - | "False" -> const_false - - -?string_concat: string+ - -_tuple_inner: test_or_star_expr (("," test_or_star_expr)+ [","] | ",") - -?test_or_star_expr: test - | star_expr - -?subscriptlist: subscript - | subscript (("," subscript)+ [","] | ",") -> subscript_tuple -?subscript: test | ([test] ":" [test] [sliceop]) -> slice -sliceop: ":" [test] -?exprlist: (expr|star_expr) - | (expr|star_expr) (("," (expr|star_expr))+ [","]|",") -?testlist: test | testlist_tuple -testlist_tuple: test (("," test)+ [","] | ",") -_dict_exprlist: (key_value | "**" expr) ("," (key_value | "**" expr))* [","] - -key_value: test ":" test - -_exprlist: test_or_star_expr ("," test_or_star_expr)* [","] - -classdef: "class" name ["(" [arguments] ")"] ":" suite - - - -arguments: argvalue ("," argvalue)* ("," [ starargs | kwargs])? - | starargs - | kwargs - | comprehension{test} - -starargs: stararg ("," stararg)* ("," argvalue)* ["," kwargs] -stararg: "*" test -kwargs: "**" test ("," argvalue)* - -?argvalue: test ("=" test)? - - -comprehension{comp_result}: comp_result comp_fors [comp_if] -comp_fors: comp_for+ -comp_for: [ASYNC] "for" exprlist "in" or_test -ASYNC: "async" -?comp_if: "if" test_nocond - -// not used in grammar, but may appear in "node" passed from Parser to Compiler -encoding_decl: name - -yield_expr: "yield" [testlist] - | "yield" "from" test -> yield_from - -number: DEC_NUMBER | HEX_NUMBER | BIN_NUMBER | OCT_NUMBER | FLOAT_NUMBER | IMAG_NUMBER -string: STRING | LONG_STRING - -// Other terminals - -_NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+ - -%ignore /[\t \f]+/ // WS -%ignore /\\[\t \f]*\r?\n/ // LINE_CONT -%ignore COMMENT -%declare _INDENT _DEDENT - - -// Python terminals - -!name: NAME | "match" | "case" -NAME: /[^\W\d]\w*/ -COMMENT: /#[^\n]*/ - -STRING: /([ubf]?r?|r[ubf])("(?!"").*?(? 
None: - self.paren_level = 0 - self.indent_level = [0] - assert self.tab_len > 0 - - def handle_NL(self, token: Token) -> Iterator[Token]: - if self.paren_level > 0: - return - - yield token - - indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces - indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len - - if indent > self.indent_level[-1]: - self.indent_level.append(indent) - yield Token.new_borrow_pos(self.INDENT_type, indent_str, token) - else: - while indent < self.indent_level[-1]: - self.indent_level.pop() - yield Token.new_borrow_pos(self.DEDENT_type, indent_str, token) - - if indent != self.indent_level[-1]: - raise DedentError('Unexpected dedent to column %s. Expected dedent to %s' % (indent, self.indent_level[-1])) - - def _process(self, stream): - for token in stream: - if token.type == self.NL_type: - yield from self.handle_NL(token) - else: - yield token - - if token.type in self.OPEN_PAREN_types: - self.paren_level += 1 - elif token.type in self.CLOSE_PAREN_types: - self.paren_level -= 1 - assert self.paren_level >= 0 - - while len(self.indent_level) > 1: - self.indent_level.pop() - yield Token(self.DEDENT_type, '') - - assert self.indent_level == [0], self.indent_level - - def process(self, stream): - self.paren_level = 0 - self.indent_level = [0] - return self._process(stream) - - # XXX Hack for ContextualLexer. Maybe there's a more elegant solution? - @property - def always_accept(self): - return (self.NL_type,) - - @property - @abstractmethod - def NL_type(self) -> str: - "The name of the newline token" - raise NotImplementedError() - - @property - @abstractmethod - def OPEN_PAREN_types(self) -> List[str]: - "The names of the tokens that open a parenthesis" - raise NotImplementedError() - - @property - @abstractmethod - def CLOSE_PAREN_types(self) -> List[str]: - """The names of the tokens that close a parenthesis - """ - raise NotImplementedError() - - @property - @abstractmethod - def INDENT_type(self) -> str: - """The name of the token that starts an indentation in the grammar. - - See also: %declare - """ - raise NotImplementedError() - - @property - @abstractmethod - def DEDENT_type(self) -> str: - """The name of the token that end an indentation in the grammar. - - See also: %declare - """ - raise NotImplementedError() - - @property - @abstractmethod - def tab_len(self) -> int: - """How many spaces does a tab equal""" - raise NotImplementedError() - - -class PythonIndenter(Indenter): - """A postlexer that "injects" _INDENT/_DEDENT tokens based on indentation, according to the Python syntax. - - See also: the ``postlex`` option in `Lark`. 
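The indentation postlexer deleted here remains available from the installed lark package. A sketch along the lines of lark's indented-tree example, with the NAME terminal inlined and the sample text spelled out with explicit newlines:

    from lark import Lark
    from lark.indenter import Indenter

    tree_grammar = r"""
        ?start: _NL* tree
        tree: NAME _NL [_INDENT tree+ _DEDENT]

        NAME: /\w+/
        _NL: /(\r?\n[\t ]*)+/

        %ignore /[\t ]+/
        %declare _INDENT _DEDENT
    """

    class TreeIndenter(Indenter):
        NL_type = '_NL'
        OPEN_PAREN_types = []
        CLOSE_PAREN_types = []
        INDENT_type = '_INDENT'
        DEDENT_type = '_DEDENT'
        tab_len = 8

    parser = Lark(tree_grammar, parser='lalr', postlex=TreeIndenter())

    sample = "a\n  b\n  c\n    d\n  e\n"
    print(parser.parse(sample).pretty())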
- """ - - NL_type = '_NEWLINE' - OPEN_PAREN_types = ['LPAR', 'LSQB', 'LBRACE'] - CLOSE_PAREN_types = ['RPAR', 'RSQB', 'RBRACE'] - INDENT_type = '_INDENT' - DEDENT_type = '_DEDENT' - tab_len = 8 - -###} diff --git a/lark/lark.py b/lark/lark.py deleted file mode 100644 index 0bec71b..0000000 --- a/lark/lark.py +++ /dev/null @@ -1,660 +0,0 @@ -from abc import ABC, abstractmethod -import getpass -import sys, os, pickle -import tempfile -import types -import re -from typing import ( - TypeVar, Type, List, Dict, Iterator, Callable, Union, Optional, Sequence, - Tuple, Iterable, IO, Any, TYPE_CHECKING, Collection -) -if TYPE_CHECKING: - from .parsers.lalr_interactive_parser import InteractiveParser - from .tree import ParseTree - from .visitors import Transformer - from typing import Literal - from .parser_frontends import ParsingFrontend - -from .exceptions import ConfigurationError, assert_config, UnexpectedInput -from .utils import Serialize, SerializeMemoizer, FS, logger -from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files, PackageResource, sha256_digest -from .tree import Tree -from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType - -from .lexer import Lexer, BasicLexer, TerminalDef, LexerThread, Token -from .parse_tree_builder import ParseTreeBuilder -from .parser_frontends import _validate_frontend_args, _get_lexer_callbacks, _deserialize_parsing_frontend, _construct_parsing_frontend -from .grammar import Rule - - -try: - import regex - _has_regex = True -except ImportError: - _has_regex = False - - -###{standalone - - -class PostLex(ABC): - @abstractmethod - def process(self, stream: Iterator[Token]) -> Iterator[Token]: - return stream - - always_accept: Iterable[str] = () - -class LarkOptions(Serialize): - """Specifies the options for Lark - - """ - - start: List[str] - debug: bool - strict: bool - transformer: 'Optional[Transformer]' - propagate_positions: Union[bool, str] - maybe_placeholders: bool - cache: Union[bool, str] - regex: bool - g_regex_flags: int - keep_all_tokens: bool - tree_class: Optional[Callable[[str, List], Any]] - parser: _ParserArgType - lexer: _LexerArgType - ambiguity: 'Literal["auto", "resolve", "explicit", "forest"]' - postlex: Optional[PostLex] - priority: 'Optional[Literal["auto", "normal", "invert"]]' - lexer_callbacks: Dict[str, Callable[[Token], Token]] - use_bytes: bool - ordered_sets: bool - edit_terminals: Optional[Callable[[TerminalDef], TerminalDef]] - import_paths: 'List[Union[str, Callable[[Union[None, str, PackageResource], str], Tuple[str, str]]]]' - source_path: Optional[str] - - OPTIONS_DOC = r""" - **=== General Options ===** - - start - The start symbol. Either a string, or a list of strings for multiple possible starts (Default: "start") - debug - Display debug information and extra warnings. Use only when debugging (Default: ``False``) - When used with Earley, it generates a forest graph as "sppf.png", if 'dot' is installed. - strict - Throw an exception on any potential ambiguity, including shift/reduce conflicts, and regex collisions. - transformer - Applies the transformer to every parse tree (equivalent to applying it after the parse, but faster) - propagate_positions - Propagates positional attributes into the 'meta' attribute of all tree branches. 
- Sets attributes: (line, column, end_line, end_column, start_pos, end_pos, - container_line, container_column, container_end_line, container_end_column) - Accepts ``False``, ``True``, or a callable, which will filter which nodes to ignore when propagating. - maybe_placeholders - When ``True``, the ``[]`` operator returns ``None`` when not matched. - When ``False``, ``[]`` behaves like the ``?`` operator, and returns no value at all. - (default= ``True``) - cache - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. LALR only for now. - - - When ``False``, does nothing (default) - - When ``True``, caches to a temporary file in the local directory - - When given a string, caches to the path pointed by the string - regex - When True, uses the ``regex`` module instead of the stdlib ``re``. - g_regex_flags - Flags that are applied to all terminals (both regex and strings) - keep_all_tokens - Prevent the tree builder from automagically removing "punctuation" tokens (Default: ``False``) - tree_class - Lark will produce trees comprised of instances of this class instead of the default ``lark.Tree``. - - **=== Algorithm Options ===** - - parser - Decides which parser engine to use. Accepts "earley" or "lalr". (Default: "earley"). - (there is also a "cyk" option for legacy) - lexer - Decides whether or not to use a lexer stage - - - "auto" (default): Choose for me based on the parser - - "basic": Use a basic lexer - - "contextual": Stronger lexer (only works with parser="lalr") - - "dynamic": Flexible and powerful (only with parser="earley") - - "dynamic_complete": Same as dynamic, but tries *every* variation of tokenizing possible. - ambiguity - Decides how to handle ambiguity in the parse. Only relevant if parser="earley" - - - "resolve": The parser will automatically choose the simplest derivation - (it chooses consistently: greedy for tokens, non-greedy for rules) - - "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest). - - "forest": The parser will return the root of the shared packed parse forest. - - **=== Misc. / Domain Specific Options ===** - - postlex - Lexer post-processing (Default: ``None``) Only works with the basic and contextual lexers. - priority - How priorities should be evaluated - "auto", ``None``, "normal", "invert" (Default: "auto") - lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution. - use_bytes - Accept an input of type ``bytes`` instead of ``str``. - ordered_sets - Should Earley use ordered-sets to achieve stable output (~10% slower than regular sets. Default: True) - edit_terminals - A callback for editing the terminals before parse. - import_paths - A List of either paths or loader functions to specify from where grammars are imported - source_path - Override the source of from where the grammar was loaded. Useful for relative imports and unconventional grammar loading - **=== End of Options ===** - """ - if __doc__: - __doc__ += OPTIONS_DOC - - - # Adding a new option needs to be done in multiple places: - # - In the dictionary below. This is the primary truth of which options `Lark.__init__` accepts - # - In the docstring above. 
It is used both for the docstring of `LarkOptions` and `Lark`, and in readthedocs - # - As an attribute of `LarkOptions` above - # - Potentially in `_LOAD_ALLOWED_OPTIONS` below this class, when the option doesn't change how the grammar is loaded - # - Potentially in `lark.tools.__init__`, if it makes sense, and it can easily be passed as a cmd argument - _defaults: Dict[str, Any] = { - 'debug': False, - 'strict': False, - 'keep_all_tokens': False, - 'tree_class': None, - 'cache': False, - 'postlex': None, - 'parser': 'earley', - 'lexer': 'auto', - 'transformer': None, - 'start': 'start', - 'priority': 'auto', - 'ambiguity': 'auto', - 'regex': False, - 'propagate_positions': False, - 'lexer_callbacks': {}, - 'maybe_placeholders': True, - 'edit_terminals': None, - 'g_regex_flags': 0, - 'use_bytes': False, - 'ordered_sets': True, - 'import_paths': [], - 'source_path': None, - '_plugins': {}, - } - - def __init__(self, options_dict: Dict[str, Any]) -> None: - o = dict(options_dict) - - options = {} - for name, default in self._defaults.items(): - if name in o: - value = o.pop(name) - if isinstance(default, bool) and name not in ('cache', 'use_bytes', 'propagate_positions'): - value = bool(value) - else: - value = default - - options[name] = value - - if isinstance(options['start'], str): - options['start'] = [options['start']] - - self.__dict__['options'] = options - - - assert_config(self.parser, ('earley', 'lalr', 'cyk', None)) - - if self.parser == 'earley' and self.transformer: - raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm. ' - 'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)') - - if o: - raise ConfigurationError("Unknown options: %s" % o.keys()) - - def __getattr__(self, name: str) -> Any: - try: - return self.__dict__['options'][name] - except KeyError as e: - raise AttributeError(e) - - def __setattr__(self, name: str, value: str) -> None: - assert_config(name, self.options.keys(), "%r isn't a valid option. Expected one of: %s") - self.options[name] = value - - def serialize(self, memo = None) -> Dict[str, Any]: - return self.options - - @classmethod - def deserialize(cls, data: Dict[str, Any], memo: Dict[int, Union[TerminalDef, Rule]]) -> "LarkOptions": - return cls(data) - - -# Options that can be passed to the Lark parser, even when it was loaded from cache/standalone. -# These options are only used outside of `load_grammar`. -_LOAD_ALLOWED_OPTIONS = {'postlex', 'transformer', 'lexer_callbacks', 'use_bytes', 'debug', 'g_regex_flags', 'regex', 'propagate_positions', 'tree_class', '_plugins'} - -_VALID_PRIORITY_OPTIONS = ('auto', 'normal', 'invert', None) -_VALID_AMBIGUITY_OPTIONS = ('auto', 'resolve', 'explicit', 'forest') - - -_T = TypeVar('_T', bound="Lark") - -class Lark(Serialize): - """Main interface for the library. - - It's mostly a thin wrapper for the many different parsers, and for the tree constructor. - - Parameters: - grammar: a string or file-object containing the grammar spec (using Lark's ebnf syntax) - options: a dictionary controlling various aspects of Lark. - - Example: - >>> Lark(r'''start: "foo" ''') - Lark(...) 
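With lark now expected to come from PyPI rather than from this subdirectory, basic construction and parsing look just as the docstring above suggests. A small sketch with an invented "hello world" grammar:

    from lark import Lark

    parser = Lark(r'''
        start: WORD "," WORD "!"
        WORD: /[A-Za-z]+/
        %ignore " "
    ''')

    tree = parser.parse("Hello, World!")
    print(tree.pretty())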
- """ - - source_path: str - source_grammar: str - grammar: 'Grammar' - options: LarkOptions - lexer: Lexer - parser: 'ParsingFrontend' - terminals: Collection[TerminalDef] - - def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None: - self.options = LarkOptions(options) - re_module: types.ModuleType - - # Set regex or re module - use_regex = self.options.regex - if use_regex: - if _has_regex: - re_module = regex - else: - raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.') - else: - re_module = re - - # Some, but not all file-like objects have a 'name' attribute - if self.options.source_path is None: - try: - self.source_path = grammar.name # type: ignore[union-attr] - except AttributeError: - self.source_path = '' - else: - self.source_path = self.options.source_path - - # Drain file-like objects to get their contents - try: - read = grammar.read # type: ignore[union-attr] - except AttributeError: - pass - else: - grammar = read() - - cache_fn = None - cache_sha256 = None - if isinstance(grammar, str): - self.source_grammar = grammar - if self.options.use_bytes: - if not grammar.isascii(): - raise ConfigurationError("Grammar must be ascii only, when use_bytes=True") - - if self.options.cache: - if self.options.parser != 'lalr': - raise ConfigurationError("cache only works with parser='lalr' for now") - - unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals', '_plugins') - options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable) - from . import __version__ - s = grammar + options_str + __version__ + str(sys.version_info[:2]) - cache_sha256 = sha256_digest(s) - - if isinstance(self.options.cache, str): - cache_fn = self.options.cache - else: - if self.options.cache is not True: - raise ConfigurationError("cache argument must be bool or str") - - try: - username = getpass.getuser() - except Exception: - # The exception raised may be ImportError or OSError in - # the future. For the cache, we don't care about the - # specific reason - we just want a username. - username = "unknown" - - cache_fn = tempfile.gettempdir() + "/.lark_cache_%s_%s_%s_%s.tmp" % (username, cache_sha256, *sys.version_info[:2]) - - old_options = self.options - try: - with FS.open(cache_fn, 'rb') as f: - logger.debug('Loading grammar from cache: %s', cache_fn) - # Remove options that aren't relevant for loading from cache - for name in (set(options) - _LOAD_ALLOWED_OPTIONS): - del options[name] - file_sha256 = f.readline().rstrip(b'\n') - cached_used_files = pickle.load(f) - if file_sha256 == cache_sha256.encode('utf8') and verify_used_files(cached_used_files): - cached_parser_data = pickle.load(f) - self._load(cached_parser_data, **options) - return - except FileNotFoundError: - # The cache file doesn't exist; parse and compose the grammar as normal - pass - except Exception: # We should probably narrow done which errors we catch here. - logger.exception("Failed to load Lark from cache: %r. We will try to carry on.", cache_fn) - - # In theory, the Lark instance might have been messed up by the call to `_load`. 
- # In practice the only relevant thing that might have been overwritten should be `options` - self.options = old_options - - - # Parse the grammar file and compose the grammars - self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens) - else: - assert isinstance(grammar, Grammar) - self.grammar = grammar - - - if self.options.lexer == 'auto': - if self.options.parser == 'lalr': - self.options.lexer = 'contextual' - elif self.options.parser == 'earley': - if self.options.postlex is not None: - logger.info("postlex can't be used with the dynamic lexer, so we use 'basic' instead. " - "Consider using lalr with contextual instead of earley") - self.options.lexer = 'basic' - else: - self.options.lexer = 'dynamic' - elif self.options.parser == 'cyk': - self.options.lexer = 'basic' - else: - assert False, self.options.parser - lexer = self.options.lexer - if isinstance(lexer, type): - assert issubclass(lexer, Lexer) # XXX Is this really important? Maybe just ensure interface compliance - else: - assert_config(lexer, ('basic', 'contextual', 'dynamic', 'dynamic_complete')) - if self.options.postlex is not None and 'dynamic' in lexer: - raise ConfigurationError("Can't use postlex with a dynamic lexer. Use basic or contextual instead") - - if self.options.ambiguity == 'auto': - if self.options.parser == 'earley': - self.options.ambiguity = 'resolve' - else: - assert_config(self.options.parser, ('earley', 'cyk'), "%r doesn't support disambiguation. Use one of these parsers instead: %s") - - if self.options.priority == 'auto': - self.options.priority = 'normal' - - if self.options.priority not in _VALID_PRIORITY_OPTIONS: - raise ConfigurationError("invalid priority option: %r. Must be one of %r" % (self.options.priority, _VALID_PRIORITY_OPTIONS)) - if self.options.ambiguity not in _VALID_AMBIGUITY_OPTIONS: - raise ConfigurationError("invalid ambiguity option: %r. Must be one of %r" % (self.options.ambiguity, _VALID_AMBIGUITY_OPTIONS)) - - if self.options.parser is None: - terminals_to_keep = '*' - elif self.options.postlex is not None: - terminals_to_keep = set(self.options.postlex.always_accept) - else: - terminals_to_keep = set() - - # Compile the EBNF grammar into BNF - self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start, terminals_to_keep) - - if self.options.edit_terminals: - for t in self.terminals: - self.options.edit_terminals(t) - - self._terminals_dict = {t.name: t for t in self.terminals} - - # If the user asked to invert the priorities, negate them all here. - if self.options.priority == 'invert': - for rule in self.rules: - if rule.options.priority is not None: - rule.options.priority = -rule.options.priority - for term in self.terminals: - term.priority = -term.priority - # Else, if the user asked to disable priorities, strip them from the - # rules and terminals. This allows the Earley parsers to skip an extra forest walk - # for improved performance, if you don't need them (or didn't specify any). - elif self.options.priority is None: - for rule in self.rules: - if rule.options.priority is not None: - rule.options.priority = None - for term in self.terminals: - term.priority = 0 - - # TODO Deprecate lexer_callbacks? 
- self.lexer_conf = LexerConf( - self.terminals, re_module, self.ignore_tokens, self.options.postlex, - self.options.lexer_callbacks, self.options.g_regex_flags, use_bytes=self.options.use_bytes, strict=self.options.strict - ) - - if self.options.parser: - self.parser = self._build_parser() - elif lexer: - self.lexer = self._build_lexer() - - if cache_fn: - logger.debug('Saving grammar to cache: %s', cache_fn) - try: - with FS.open(cache_fn, 'wb') as f: - assert cache_sha256 is not None - f.write(cache_sha256.encode('utf8') + b'\n') - pickle.dump(used_files, f) - self.save(f, _LOAD_ALLOWED_OPTIONS) - except IOError as e: - logger.exception("Failed to save Lark to cache: %r.", cache_fn, e) - - if __doc__: - __doc__ += "\n\n" + LarkOptions.OPTIONS_DOC - - __serialize_fields__ = 'parser', 'rules', 'options' - - def _build_lexer(self, dont_ignore: bool=False) -> BasicLexer: - lexer_conf = self.lexer_conf - if dont_ignore: - from copy import copy - lexer_conf = copy(lexer_conf) - lexer_conf.ignore = () - return BasicLexer(lexer_conf) - - def _prepare_callbacks(self) -> None: - self._callbacks = {} - # we don't need these callbacks if we aren't building a tree - if self.options.ambiguity != 'forest': - self._parse_tree_builder = ParseTreeBuilder( - self.rules, - self.options.tree_class or Tree, - self.options.propagate_positions, - self.options.parser != 'lalr' and self.options.ambiguity == 'explicit', - self.options.maybe_placeholders - ) - self._callbacks = self._parse_tree_builder.create_callback(self.options.transformer) - self._callbacks.update(_get_lexer_callbacks(self.options.transformer, self.terminals)) - - def _build_parser(self) -> "ParsingFrontend": - self._prepare_callbacks() - _validate_frontend_args(self.options.parser, self.options.lexer) - parser_conf = ParserConf(self.rules, self._callbacks, self.options.start) - return _construct_parsing_frontend( - self.options.parser, - self.options.lexer, - self.lexer_conf, - parser_conf, - options=self.options - ) - - def save(self, f, exclude_options: Collection[str] = ()) -> None: - """Saves the instance into the given file object - - Useful for caching and multiprocessing. - """ - if self.options.parser != 'lalr': - raise NotImplementedError("Lark.save() is only implemented for the LALR(1) parser.") - data, m = self.memo_serialize([TerminalDef, Rule]) - if exclude_options: - data["options"] = {n: v for n, v in data["options"].items() if n not in exclude_options} - pickle.dump({'data': data, 'memo': m}, f, protocol=pickle.HIGHEST_PROTOCOL) - - @classmethod - def load(cls: Type[_T], f) -> _T: - """Loads an instance from the given file object - - Useful for caching and multiprocessing. 
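For completeness, a save/load round trip against the installed package, sketched with an in-memory buffer rather than a real cache file (the one-token grammar is illustrative only, and ``save()`` requires the LALR parser):

    import io
    from lark import Lark

    parser = Lark('start: "a"+', parser='lalr')

    buf = io.BytesIO()
    parser.save(buf)           # serialize the analyzed grammar and parse tables

    buf.seek(0)
    restored = Lark.load(buf)  # rebuild a working parser without re-analyzing the grammar
    print(restored.parse("aaa"))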
- """ - inst = cls.__new__(cls) - return inst._load(f) - - def _deserialize_lexer_conf(self, data: Dict[str, Any], memo: Dict[int, Union[TerminalDef, Rule]], options: LarkOptions) -> LexerConf: - lexer_conf = LexerConf.deserialize(data['lexer_conf'], memo) - lexer_conf.callbacks = options.lexer_callbacks or {} - lexer_conf.re_module = regex if options.regex else re - lexer_conf.use_bytes = options.use_bytes - lexer_conf.g_regex_flags = options.g_regex_flags - lexer_conf.skip_validation = True - lexer_conf.postlex = options.postlex - return lexer_conf - - def _load(self: _T, f: Any, **kwargs) -> _T: - if isinstance(f, dict): - d = f - else: - d = pickle.load(f) - memo_json = d['memo'] - data = d['data'] - - assert memo_json - memo = SerializeMemoizer.deserialize(memo_json, {'Rule': Rule, 'TerminalDef': TerminalDef}, {}) - options = dict(data['options']) - if (set(kwargs) - _LOAD_ALLOWED_OPTIONS) & set(LarkOptions._defaults): - raise ConfigurationError("Some options are not allowed when loading a Parser: {}" - .format(set(kwargs) - _LOAD_ALLOWED_OPTIONS)) - options.update(kwargs) - self.options = LarkOptions.deserialize(options, memo) - self.rules = [Rule.deserialize(r, memo) for r in data['rules']] - self.source_path = '' - _validate_frontend_args(self.options.parser, self.options.lexer) - self.lexer_conf = self._deserialize_lexer_conf(data['parser'], memo, self.options) - self.terminals = self.lexer_conf.terminals - self._prepare_callbacks() - self._terminals_dict = {t.name: t for t in self.terminals} - self.parser = _deserialize_parsing_frontend( - data['parser'], - memo, - self.lexer_conf, - self._callbacks, - self.options, # Not all, but multiple attributes are used - ) - return self - - @classmethod - def _load_from_dict(cls, data, memo, **kwargs): - inst = cls.__new__(cls) - return inst._load({'data': data, 'memo': memo}, **kwargs) - - @classmethod - def open(cls: Type[_T], grammar_filename: str, rel_to: Optional[str]=None, **options) -> _T: - """Create an instance of Lark with the grammar given by its filename - - If ``rel_to`` is provided, the function will find the grammar filename in relation to it. - - Example: - - >>> Lark.open("grammar_file.lark", rel_to=__file__, parser="lalr") - Lark(...) - - """ - if rel_to: - basepath = os.path.dirname(rel_to) - grammar_filename = os.path.join(basepath, grammar_filename) - with open(grammar_filename, encoding='utf8') as f: - return cls(f, **options) - - @classmethod - def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_paths: 'Sequence[str]'=[""], **options) -> _T: - """Create an instance of Lark with the grammar loaded from within the package `package`. - This allows grammar loading from zipapps. - - Imports in the grammar will use the `package` and `search_paths` provided, through `FromPackageLoader` - - Example: - - Lark.open_from_package(__name__, "example.lark", ("grammars",), parser=...) - """ - package_loader = FromPackageLoader(package, search_paths) - full_path, text = package_loader(None, grammar_path) - options.setdefault('source_path', full_path) - options.setdefault('import_paths', []) - options['import_paths'].append(package_loader) - return cls(text, **options) - - def __repr__(self): - return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer) - - - def lex(self, text: str, dont_ignore: bool=False) -> Iterator[Token]: - """Only lex (and postlex) the text, without parsing it. 
Only relevant when lexer='basic' - - When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore. - - :raises UnexpectedCharacters: In case the lexer cannot find a suitable match. - """ - lexer: Lexer - if not hasattr(self, 'lexer') or dont_ignore: - lexer = self._build_lexer(dont_ignore) - else: - lexer = self.lexer - lexer_thread = LexerThread.from_text(lexer, text) - stream = lexer_thread.lex(None) - if self.options.postlex: - return self.options.postlex.process(stream) - return stream - - def get_terminal(self, name: str) -> TerminalDef: - """Get information about a terminal""" - return self._terminals_dict[name] - - def parse_interactive(self, text: Optional[str]=None, start: Optional[str]=None) -> 'InteractiveParser': - """Start an interactive parsing session. - - Parameters: - text (str, optional): Text to be parsed. Required for ``resume_parse()``. - start (str, optional): Start symbol - - Returns: - A new InteractiveParser instance. - - See Also: ``Lark.parse()`` - """ - return self.parser.parse_interactive(text, start=start) - - def parse(self, text: str, start: Optional[str]=None, on_error: 'Optional[Callable[[UnexpectedInput], bool]]'=None) -> 'ParseTree': - """Parse the given text, according to the options provided. - - Parameters: - text (str): Text to be parsed. - start (str, optional): Required if Lark was given multiple possible start symbols (using the start option). - on_error (function, optional): if provided, will be called on UnexpectedToken error. Return true to resume parsing. - LALR only. See examples/advanced/error_handling.py for an example of how to use on_error. - - Returns: - If a transformer is supplied to ``__init__``, returns whatever is the - result of the transformation. Otherwise, returns a Tree instance. - - :raises UnexpectedInput: On a parse error, one of these sub-exceptions will rise: - ``UnexpectedCharacters``, ``UnexpectedToken``, or ``UnexpectedEOF``. - For convenience, these sub-exceptions also inherit from ``ParserError`` and ``LexerError``. - - """ - if on_error is not None and self.options.parser != 'lalr': - raise NotImplementedError("The on_error option is only implemented for the LALR(1) parser.") - return self.parser.parse(text, start=start, on_error=on_error) - - -###} diff --git a/lark/lexer.py b/lark/lexer.py deleted file mode 100644 index 9061d60..0000000 --- a/lark/lexer.py +++ /dev/null @@ -1,678 +0,0 @@ -# Lexer Implementation - -from abc import abstractmethod, ABC -import re -from contextlib import suppress -from typing import ( - TypeVar, Type, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any, - ClassVar, TYPE_CHECKING, overload -) -from types import ModuleType -import warnings -try: - import interegular -except ImportError: - pass -if TYPE_CHECKING: - from .common import LexerConf - from .parsers.lalr_parser_state import ParserState - -from .utils import classify, get_regexp_width, Serialize, logger -from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken -from .grammar import TOKEN_DEFAULT_PRIORITY - - -###{standalone -from copy import copy - -try: # For the standalone parser, we need to make sure that has_interegular is False to avoid NameErrors later on - has_interegular = bool(interegular) -except NameError: - has_interegular = False - -class Pattern(Serialize, ABC): - "An abstraction over regular expressions." 
- - value: str - flags: Collection[str] - raw: Optional[str] - type: ClassVar[str] - - def __init__(self, value: str, flags: Collection[str] = (), raw: Optional[str] = None) -> None: - self.value = value - self.flags = frozenset(flags) - self.raw = raw - - def __repr__(self): - return repr(self.to_regexp()) - - # Pattern Hashing assumes all subclasses have a different priority! - def __hash__(self): - return hash((type(self), self.value, self.flags)) - - def __eq__(self, other): - return type(self) == type(other) and self.value == other.value and self.flags == other.flags - - @abstractmethod - def to_regexp(self) -> str: - raise NotImplementedError() - - @property - @abstractmethod - def min_width(self) -> int: - raise NotImplementedError() - - @property - @abstractmethod - def max_width(self) -> int: - raise NotImplementedError() - - def _get_flags(self, value): - for f in self.flags: - value = ('(?%s:%s)' % (f, value)) - return value - - -class PatternStr(Pattern): - __serialize_fields__ = 'value', 'flags', 'raw' - - type: ClassVar[str] = "str" - - def to_regexp(self) -> str: - return self._get_flags(re.escape(self.value)) - - @property - def min_width(self) -> int: - return len(self.value) - - @property - def max_width(self) -> int: - return len(self.value) - - -class PatternRE(Pattern): - __serialize_fields__ = 'value', 'flags', 'raw', '_width' - - type: ClassVar[str] = "re" - - def to_regexp(self) -> str: - return self._get_flags(self.value) - - _width = None - def _get_width(self): - if self._width is None: - self._width = get_regexp_width(self.to_regexp()) - return self._width - - @property - def min_width(self) -> int: - return self._get_width()[0] - - @property - def max_width(self) -> int: - return self._get_width()[1] - - -class TerminalDef(Serialize): - "A definition of a terminal" - __serialize_fields__ = 'name', 'pattern', 'priority' - __serialize_namespace__ = PatternStr, PatternRE - - name: str - pattern: Pattern - priority: int - - def __init__(self, name: str, pattern: Pattern, priority: int = TOKEN_DEFAULT_PRIORITY) -> None: - assert isinstance(pattern, Pattern), pattern - self.name = name - self.pattern = pattern - self.priority = priority - - def __repr__(self): - return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) - - def user_repr(self) -> str: - if self.name.startswith('__'): # We represent a generated terminal - return self.pattern.raw or self.name - else: - return self.name - -_T = TypeVar('_T', bound="Token") - -class Token(str): - """A string with meta-information, that is produced by the lexer. - - When parsing text, the resulting chunks of the input that haven't been discarded, - will end up in the tree as Token instances. The Token class inherits from Python's ``str``, - so normal string comparisons and operations will work as expected. - - Attributes: - type: Name of the token (as specified in grammar) - value: Value of the token (redundant, as ``token.value == token`` will always be true) - start_pos: The index of the token in the text - line: The line of the token in the text (starting with 1) - column: The column of the token in the text (starting with 1) - end_line: The line where the token ends - end_column: The next column after the end of the token. For example, - if the token is a single character with a column value of 4, - end_column will be 5. 
- end_pos: the index where the token ends (basically ``start_pos + len(token)``) - """ - __slots__ = ('type', 'start_pos', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos') - - __match_args__ = ('type', 'value') - - type: str - start_pos: Optional[int] - value: Any - line: Optional[int] - column: Optional[int] - end_line: Optional[int] - end_column: Optional[int] - end_pos: Optional[int] - - - @overload - def __new__( - cls, - type: str, - value: Any, - start_pos: Optional[int] = None, - line: Optional[int] = None, - column: Optional[int] = None, - end_line: Optional[int] = None, - end_column: Optional[int] = None, - end_pos: Optional[int] = None - ) -> 'Token': - ... - - @overload - def __new__( - cls, - type_: str, - value: Any, - start_pos: Optional[int] = None, - line: Optional[int] = None, - column: Optional[int] = None, - end_line: Optional[int] = None, - end_column: Optional[int] = None, - end_pos: Optional[int] = None - ) -> 'Token': ... - - def __new__(cls, *args, **kwargs): - if "type_" in kwargs: - warnings.warn("`type_` is deprecated use `type` instead", DeprecationWarning) - - if "type" in kwargs: - raise TypeError("Error: using both 'type' and the deprecated 'type_' as arguments.") - kwargs["type"] = kwargs.pop("type_") - - return cls._future_new(*args, **kwargs) - - - @classmethod - def _future_new(cls, type, value, start_pos=None, line=None, column=None, end_line=None, end_column=None, end_pos=None): - inst = super(Token, cls).__new__(cls, value) - - inst.type = type - inst.start_pos = start_pos - inst.value = value - inst.line = line - inst.column = column - inst.end_line = end_line - inst.end_column = end_column - inst.end_pos = end_pos - return inst - - @overload - def update(self, type: Optional[str] = None, value: Optional[Any] = None) -> 'Token': - ... - - @overload - def update(self, type_: Optional[str] = None, value: Optional[Any] = None) -> 'Token': - ... 
- - def update(self, *args, **kwargs): - if "type_" in kwargs: - warnings.warn("`type_` is deprecated use `type` instead", DeprecationWarning) - - if "type" in kwargs: - raise TypeError("Error: using both 'type' and the deprecated 'type_' as arguments.") - kwargs["type"] = kwargs.pop("type_") - - return self._future_update(*args, **kwargs) - - def _future_update(self, type: Optional[str] = None, value: Optional[Any] = None) -> 'Token': - return Token.new_borrow_pos( - type if type is not None else self.type, - value if value is not None else self.value, - self - ) - - @classmethod - def new_borrow_pos(cls: Type[_T], type_: str, value: Any, borrow_t: 'Token') -> _T: - return cls(type_, value, borrow_t.start_pos, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column, borrow_t.end_pos) - - def __reduce__(self): - return (self.__class__, (self.type, self.value, self.start_pos, self.line, self.column)) - - def __repr__(self): - return 'Token(%r, %r)' % (self.type, self.value) - - def __deepcopy__(self, memo): - return Token(self.type, self.value, self.start_pos, self.line, self.column) - - def __eq__(self, other): - if isinstance(other, Token) and self.type != other.type: - return False - - return str.__eq__(self, other) - - __hash__ = str.__hash__ - - -class LineCounter: - "A utility class for keeping track of line & column information" - - __slots__ = 'char_pos', 'line', 'column', 'line_start_pos', 'newline_char' - - def __init__(self, newline_char): - self.newline_char = newline_char - self.char_pos = 0 - self.line = 1 - self.column = 1 - self.line_start_pos = 0 - - def __eq__(self, other): - if not isinstance(other, LineCounter): - return NotImplemented - - return self.char_pos == other.char_pos and self.newline_char == other.newline_char - - def feed(self, token: Token, test_newline=True): - """Consume a token and calculate the new line & column. - - As an optional optimization, set test_newline=False if token doesn't contain a newline. 
- """ - if test_newline: - newlines = token.count(self.newline_char) - if newlines: - self.line += newlines - self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 - - self.char_pos += len(token) - self.column = self.char_pos - self.line_start_pos + 1 - - -class UnlessCallback: - def __init__(self, scanner): - self.scanner = scanner - - def __call__(self, t): - res = self.scanner.match(t.value, 0) - if res: - _value, t.type = res - return t - - -class CallChain: - def __init__(self, callback1, callback2, cond): - self.callback1 = callback1 - self.callback2 = callback2 - self.cond = cond - - def __call__(self, t): - t2 = self.callback1(t) - return self.callback2(t) if self.cond(t2) else t2 - - -def _get_match(re_, regexp, s, flags): - m = re_.match(regexp, s, flags) - if m: - return m.group(0) - -def _create_unless(terminals, g_regex_flags, re_, use_bytes): - tokens_by_type = classify(terminals, lambda t: type(t.pattern)) - assert len(tokens_by_type) <= 2, tokens_by_type.keys() - embedded_strs = set() - callback = {} - for retok in tokens_by_type.get(PatternRE, []): - unless = [] - for strtok in tokens_by_type.get(PatternStr, []): - if strtok.priority != retok.priority: - continue - s = strtok.pattern.value - if s == _get_match(re_, retok.pattern.to_regexp(), s, g_regex_flags): - unless.append(strtok) - if strtok.pattern.flags <= retok.pattern.flags: - embedded_strs.add(strtok) - if unless: - callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes)) - - new_terminals = [t for t in terminals if t not in embedded_strs] - return new_terminals, callback - - -class Scanner: - def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False): - self.terminals = terminals - self.g_regex_flags = g_regex_flags - self.re_ = re_ - self.use_bytes = use_bytes - self.match_whole = match_whole - - self.allowed_types = {t.name for t in self.terminals} - - self._mres = self._build_mres(terminals, len(terminals)) - - def _build_mres(self, terminals, max_size): - # Python sets an unreasonable group limit (currently 100) in its re module - # Worse, the only way to know we reached it is by catching an AssertionError! - # This function recursively tries less and less groups until it's successful. - postfix = '$' if self.match_whole else '' - mres = [] - while terminals: - pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size]) - if self.use_bytes: - pattern = pattern.encode('latin-1') - try: - mre = self.re_.compile(pattern, self.g_regex_flags) - except AssertionError: # Yes, this is what Python provides us.. :/ - return self._build_mres(terminals, max_size // 2) - - mres.append(mre) - terminals = terminals[max_size:] - return mres - - def match(self, text, pos): - for mre in self._mres: - m = mre.match(text, pos) - if m: - return m.group(0), m.lastgroup - - -def _regexp_has_newline(r: str): - r"""Expressions that may indicate newlines in a regexp: - - newlines (\n) - - escaped newline (\\n) - - anything but ([^...]) - - any-char (.) when the flag (?s) exists - - spaces (\s) - """ - return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' 
in r) - - -class LexerState: - """Represents the current state of the lexer as it scans the text - (Lexer objects are only instantiated per grammar, not per text) - """ - - __slots__ = 'text', 'line_ctr', 'last_token' - - text: str - line_ctr: LineCounter - last_token: Optional[Token] - - def __init__(self, text: str, line_ctr: Optional[LineCounter]=None, last_token: Optional[Token]=None): - self.text = text - self.line_ctr = line_ctr or LineCounter(b'\n' if isinstance(text, bytes) else '\n') - self.last_token = last_token - - def __eq__(self, other): - if not isinstance(other, LexerState): - return NotImplemented - - return self.text is other.text and self.line_ctr == other.line_ctr and self.last_token == other.last_token - - def __copy__(self): - return type(self)(self.text, copy(self.line_ctr), self.last_token) - - -class LexerThread: - """A thread that ties a lexer instance and a lexer state, to be used by the parser - """ - - def __init__(self, lexer: 'Lexer', lexer_state: LexerState): - self.lexer = lexer - self.state = lexer_state - - @classmethod - def from_text(cls, lexer: 'Lexer', text: str) -> 'LexerThread': - return cls(lexer, LexerState(text)) - - def lex(self, parser_state): - return self.lexer.lex(self.state, parser_state) - - def __copy__(self): - return type(self)(self.lexer, copy(self.state)) - - _Token = Token - - -_Callback = Callable[[Token], Token] - -class Lexer(ABC): - """Lexer interface - - Method Signatures: - lex(self, lexer_state, parser_state) -> Iterator[Token] - """ - @abstractmethod - def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]: - return NotImplemented - - def make_lexer_state(self, text): - "Deprecated" - return LexerState(text) - - -def _check_regex_collisions(terminal_to_regexp: Dict[TerminalDef, str], comparator, strict_mode, max_collisions_to_show=8): - if not comparator: - comparator = interegular.Comparator.from_regexes(terminal_to_regexp) - - # When in strict mode, we only ever try to provide one example, so taking - # a long time for that should be fine - max_time = 2 if strict_mode else 0.2 - - # We don't want to show too many collisions. - if comparator.count_marked_pairs() >= max_collisions_to_show: - return - for group in classify(terminal_to_regexp, lambda t: t.priority).values(): - for a, b in comparator.check(group, skip_marked=True): - assert a.priority == b.priority - # Mark this pair to not repeat warnings when multiple different BasicLexers see the same collision - comparator.mark(a, b) - - # Notify the user - message = f"Collision between Terminals {a.name} and {b.name}. " - try: - example = comparator.get_example_overlap(a, b, max_time).format_multiline() - except ValueError: - # Couldn't find an example within max_time steps. - example = "No example could be found fast enough. However, the collision does still exists" - if strict_mode: - raise LexError(f"{message}\n{example}") - logger.warning("%s The lexer will choose between them arbitrarily.\n%s", message, example) - if comparator.count_marked_pairs() >= max_collisions_to_show: - logger.warning("Found 8 regex collisions, will not check for more.") - return - - -class AbstractBasicLexer(Lexer): - terminals_by_name: Dict[str, TerminalDef] - - @abstractmethod - def __init__(self, conf: 'LexerConf', comparator=None) -> None: - ... - - @abstractmethod - def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token: - ... 
- - def lex(self, state: LexerState, parser_state: Any) -> Iterator[Token]: - with suppress(EOFError): - while True: - yield self.next_token(state, parser_state) - - -class BasicLexer(AbstractBasicLexer): - terminals: Collection[TerminalDef] - ignore_types: FrozenSet[str] - newline_types: FrozenSet[str] - user_callbacks: Dict[str, _Callback] - callback: Dict[str, _Callback] - re: ModuleType - - def __init__(self, conf: 'LexerConf', comparator=None) -> None: - terminals = list(conf.terminals) - assert all(isinstance(t, TerminalDef) for t in terminals), terminals - - self.re = conf.re_module - - if not conf.skip_validation: - # Sanitization - terminal_to_regexp = {} - for t in terminals: - regexp = t.pattern.to_regexp() - try: - self.re.compile(regexp, conf.g_regex_flags) - except self.re.error: - raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern)) - - if t.pattern.min_width == 0: - raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern)) - if t.pattern.type == "re": - terminal_to_regexp[t] = regexp - - if not (set(conf.ignore) <= {t.name for t in terminals}): - raise LexError("Ignore terminals are not defined: %s" % (set(conf.ignore) - {t.name for t in terminals})) - - if has_interegular: - _check_regex_collisions(terminal_to_regexp, comparator, conf.strict) - elif conf.strict: - raise LexError("interegular must be installed for strict mode. Use `pip install 'lark[interegular]'`.") - - # Init - self.newline_types = frozenset(t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())) - self.ignore_types = frozenset(conf.ignore) - - terminals.sort(key=lambda x: (-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name)) - self.terminals = terminals - self.user_callbacks = conf.callbacks - self.g_regex_flags = conf.g_regex_flags - self.use_bytes = conf.use_bytes - self.terminals_by_name = conf.terminals_by_name - - self._scanner = None - - def _build_scanner(self): - terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes) - assert all(self.callback.values()) - - for type_, f in self.user_callbacks.items(): - if type_ in self.callback: - # Already a callback there, probably UnlessCallback - self.callback[type_] = CallChain(self.callback[type_], f, lambda t: t.type == type_) - else: - self.callback[type_] = f - - self._scanner = Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes) - - @property - def scanner(self): - if self._scanner is None: - self._build_scanner() - return self._scanner - - def match(self, text, pos): - return self.scanner.match(text, pos) - - def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token: - line_ctr = lex_state.line_ctr - while line_ctr.char_pos < len(lex_state.text): - res = self.match(lex_state.text, line_ctr.char_pos) - if not res: - allowed = self.scanner.allowed_types - self.ignore_types - if not allowed: - allowed = {""} - raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, - allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token], - state=parser_state, terminals_by_name=self.terminals_by_name) - - value, type_ = res - - ignored = type_ in self.ignore_types - t = None - if not ignored or type_ in self.callback: - t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) - line_ctr.feed(value, type_ in self.newline_types) - if t is not None: - t.end_line = line_ctr.line - t.end_column = line_ctr.column - t.end_pos = 
line_ctr.char_pos - if t.type in self.callback: - t = self.callback[t.type](t) - if not ignored: - if not isinstance(t, Token): - raise LexError("Callbacks must return a token (returned %r)" % t) - lex_state.last_token = t - return t - - # EOF - raise EOFError(self) - - -class ContextualLexer(Lexer): - lexers: Dict[int, AbstractBasicLexer] - root_lexer: AbstractBasicLexer - - BasicLexer: Type[AbstractBasicLexer] = BasicLexer - - def __init__(self, conf: 'LexerConf', states: Dict[int, Collection[str]], always_accept: Collection[str]=()) -> None: - terminals = list(conf.terminals) - terminals_by_name = conf.terminals_by_name - - trad_conf = copy(conf) - trad_conf.terminals = terminals - - if has_interegular and not conf.skip_validation: - comparator = interegular.Comparator.from_regexes({t: t.pattern.to_regexp() for t in terminals}) - else: - comparator = None - lexer_by_tokens: Dict[FrozenSet[str], AbstractBasicLexer] = {} - self.lexers = {} - for state, accepts in states.items(): - key = frozenset(accepts) - try: - lexer = lexer_by_tokens[key] - except KeyError: - accepts = set(accepts) | set(conf.ignore) | set(always_accept) - lexer_conf = copy(trad_conf) - lexer_conf.terminals = [terminals_by_name[n] for n in accepts if n in terminals_by_name] - lexer = self.BasicLexer(lexer_conf, comparator) - lexer_by_tokens[key] = lexer - - self.lexers[state] = lexer - - assert trad_conf.terminals is terminals - trad_conf.skip_validation = True # We don't need to verify all terminals again - self.root_lexer = self.BasicLexer(trad_conf, comparator) - - def lex(self, lexer_state: LexerState, parser_state: 'ParserState') -> Iterator[Token]: - try: - while True: - lexer = self.lexers[parser_state.position] - yield lexer.next_token(lexer_state, parser_state) - except EOFError: - pass - except UnexpectedCharacters as e: - # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context. - # This tests the input against the global context, to provide a nicer error. - try: - last_token = lexer_state.last_token # Save last_token. Calling root_lexer.next_token will change this to the wrong token - token = self.root_lexer.next_token(lexer_state, parser_state) - raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[last_token], terminals_by_name=self.root_lexer.terminals_by_name) - except UnexpectedCharacters: - raise e # Raise the original UnexpectedCharacters. The root lexer raises it with the wrong expected set. - -###} diff --git a/lark/load_grammar.py b/lark/load_grammar.py deleted file mode 100644 index 362a845..0000000 --- a/lark/load_grammar.py +++ /dev/null @@ -1,1428 +0,0 @@ -"""Parses and compiles Lark grammars into an internal representation. 
-""" - -import hashlib -import os.path -import sys -from collections import namedtuple -from copy import copy, deepcopy -import pkgutil -from ast import literal_eval -from contextlib import suppress -from typing import List, Tuple, Union, Callable, Dict, Optional, Sequence, Generator - -from .utils import bfs, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors, OrderedSet -from .lexer import Token, TerminalDef, PatternStr, PatternRE, Pattern - -from .parse_tree_builder import ParseTreeBuilder -from .parser_frontends import ParsingFrontend -from .common import LexerConf, ParserConf -from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol, TOKEN_DEFAULT_PRIORITY -from .utils import classify, dedup_list -from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken, ParseError, UnexpectedInput - -from .tree import Tree, SlottedTree as ST -from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive -inline_args = v_args(inline=True) - -IMPORT_PATHS = ['grammars'] - -EXT = '.lark' - -_RE_FLAGS = 'imslux' - -_EMPTY = Symbol('__empty__') - -_TERMINAL_NAMES = { - '.' : 'DOT', - ',' : 'COMMA', - ':' : 'COLON', - ';' : 'SEMICOLON', - '+' : 'PLUS', - '-' : 'MINUS', - '*' : 'STAR', - '/' : 'SLASH', - '\\' : 'BACKSLASH', - '|' : 'VBAR', - '?' : 'QMARK', - '!' : 'BANG', - '@' : 'AT', - '#' : 'HASH', - '$' : 'DOLLAR', - '%' : 'PERCENT', - '^' : 'CIRCUMFLEX', - '&' : 'AMPERSAND', - '_' : 'UNDERSCORE', - '<' : 'LESSTHAN', - '>' : 'MORETHAN', - '=' : 'EQUAL', - '"' : 'DBLQUOTE', - '\'' : 'QUOTE', - '`' : 'BACKQUOTE', - '~' : 'TILDE', - '(' : 'LPAR', - ')' : 'RPAR', - '{' : 'LBRACE', - '}' : 'RBRACE', - '[' : 'LSQB', - ']' : 'RSQB', - '\n' : 'NEWLINE', - '\r\n' : 'CRLF', - '\t' : 'TAB', - ' ' : 'SPACE', -} - -# Grammar Parser -TERMINALS = { - '_LPAR': r'\(', - '_RPAR': r'\)', - '_LBRA': r'\[', - '_RBRA': r'\]', - '_LBRACE': r'\{', - '_RBRACE': r'\}', - 'OP': '[+*]|[?](?![a-z_])', - '_COLON': ':', - '_COMMA': ',', - '_OR': r'\|', - '_DOT': r'\.(?!\.)', - '_DOTDOT': r'\.\.', - 'TILDE': '~', - 'RULE_MODIFIERS': '(!|![?]?|[?]!?)(?=[_a-z])', - 'RULE': '_?[a-z][_a-z0-9]*', - 'TERMINAL': '_?[A-Z][_A-Z0-9]*', - 'STRING': r'"(\\"|\\\\|[^"\n])*?"i?', - 'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS, - '_NL': r'(\r?\n)+\s*', - '_NL_OR': r'(\r?\n)+\s*\|', - 'WS': r'[ \t]+', - 'COMMENT': r'\s*//[^\n]*|\s*#[^\n]*', - 'BACKSLASH': r'\\[ ]*\n', - '_TO': '->', - '_IGNORE': r'%ignore', - '_OVERRIDE': r'%override', - '_DECLARE': r'%declare', - '_EXTEND': r'%extend', - '_IMPORT': r'%import', - 'NUMBER': r'[+-]?\d+', -} - -RULES = { - 'start': ['_list'], - '_list': ['_item', '_list _item'], - '_item': ['rule', 'term', 'ignore', 'import', 'declare', 'override', 'extend', '_NL'], - - 'rule': ['rule_modifiers RULE template_params priority _COLON expansions _NL'], - 'rule_modifiers': ['RULE_MODIFIERS', - ''], - 'priority': ['_DOT NUMBER', - ''], - 'template_params': ['_LBRACE _template_params _RBRACE', - ''], - '_template_params': ['RULE', - '_template_params _COMMA RULE'], - 'expansions': ['_expansions'], - '_expansions': ['alias', - '_expansions _OR alias', - '_expansions _NL_OR alias'], - - '?alias': ['expansion _TO nonterminal', 'expansion'], - 'expansion': ['_expansion'], - - '_expansion': ['', '_expansion expr'], - - '?expr': ['atom', - 'atom OP', - 'atom TILDE NUMBER', - 'atom TILDE NUMBER _DOTDOT NUMBER', - ], - - '?atom': ['_LPAR expansions _RPAR', - 'maybe', - 'value'], - - 'value': ['terminal', - 'nonterminal', - 
'literal', - 'range', - 'template_usage'], - - 'terminal': ['TERMINAL'], - 'nonterminal': ['RULE'], - - '?name': ['RULE', 'TERMINAL'], - '?symbol': ['terminal', 'nonterminal'], - - 'maybe': ['_LBRA expansions _RBRA'], - 'range': ['STRING _DOTDOT STRING'], - - 'template_usage': ['nonterminal _LBRACE _template_args _RBRACE'], - '_template_args': ['value', - '_template_args _COMMA value'], - - 'term': ['TERMINAL _COLON expansions _NL', - 'TERMINAL _DOT NUMBER _COLON expansions _NL'], - 'override': ['_OVERRIDE rule', - '_OVERRIDE term'], - 'extend': ['_EXTEND rule', - '_EXTEND term'], - 'ignore': ['_IGNORE expansions _NL'], - 'declare': ['_DECLARE _declare_args _NL'], - 'import': ['_IMPORT _import_path _NL', - '_IMPORT _import_path _LPAR name_list _RPAR _NL', - '_IMPORT _import_path _TO name _NL'], - - '_import_path': ['import_lib', 'import_rel'], - 'import_lib': ['_import_args'], - 'import_rel': ['_DOT _import_args'], - '_import_args': ['name', '_import_args _DOT name'], - - 'name_list': ['_name_list'], - '_name_list': ['name', '_name_list _COMMA name'], - - '_declare_args': ['symbol', '_declare_args symbol'], - 'literal': ['REGEXP', 'STRING'], -} - - -# Value 5 keeps the number of states in the lalr parser somewhat minimal -# It isn't optimal, but close to it. See PR #949 -SMALL_FACTOR_THRESHOLD = 5 -# The Threshold whether repeat via ~ are split up into different rules -# 50 is chosen since it keeps the number of states low and therefore lalr analysis time low, -# while not being to overaggressive and unnecessarily creating rules that might create shift/reduce conflicts. -# (See PR #949) -REPEAT_BREAK_THRESHOLD = 50 - - -class FindRuleSize(Transformer): - def __init__(self, keep_all_tokens: bool): - self.keep_all_tokens = keep_all_tokens - - def _will_not_get_removed(self, sym: Symbol) -> bool: - if isinstance(sym, NonTerminal): - return not sym.name.startswith('_') - if isinstance(sym, Terminal): - return self.keep_all_tokens or not sym.filter_out - if sym is _EMPTY: - return False - assert False, sym - - def _args_as_int(self, args: List[Union[int, Symbol]]) -> Generator[int, None, None]: - for a in args: - if isinstance(a, int): - yield a - elif isinstance(a, Symbol): - yield 1 if self._will_not_get_removed(a) else 0 - else: - assert False - - def expansion(self, args) -> int: - return sum(self._args_as_int(args)) - - def expansions(self, args) -> int: - return max(self._args_as_int(args)) - - -@inline_args -class EBNF_to_BNF(Transformer_InPlace): - def __init__(self): - self.new_rules = [] - self.rules_cache = {} - self.prefix = 'anon' - self.i = 0 - self.rule_options = None - - def _name_rule(self, inner: str): - new_name = '__%s_%s_%d' % (self.prefix, inner, self.i) - self.i += 1 - return new_name - - def _add_rule(self, key, name, expansions): - t = NonTerminal(name) - self.new_rules.append((name, expansions, self.rule_options)) - self.rules_cache[key] = t - return t - - def _add_recurse_rule(self, type_: str, expr: Tree): - try: - return self.rules_cache[expr] - except KeyError: - new_name = self._name_rule(type_) - t = NonTerminal(new_name) - tree = ST('expansions', [ - ST('expansion', [expr]), - ST('expansion', [t, expr]) - ]) - return self._add_rule(expr, new_name, tree) - - def _add_repeat_rule(self, a, b, target, atom): - """Generate a rule that repeats target ``a`` times, and repeats atom ``b`` times. 
- - When called recursively (into target), it repeats atom for x(n) times, where: - x(0) = 1 - x(n) = a(n) * x(n-1) + b - - Example rule when a=3, b=4: - - new_rule: target target target atom atom atom atom - - """ - key = (a, b, target, atom) - try: - return self.rules_cache[key] - except KeyError: - new_name = self._name_rule('repeat_a%d_b%d' % (a, b)) - tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)]) - return self._add_rule(key, new_name, tree) - - def _add_repeat_opt_rule(self, a, b, target, target_opt, atom): - """Creates a rule that matches atom 0 to (a*n+b)-1 times. - - When target matches n times atom, and target_opt 0 to n-1 times target_opt, - - First we generate target * i followed by target_opt, for i from 0 to a-1 - These match 0 to n*a - 1 times atom - - Then we generate target * a followed by atom * i, for i from 0 to b-1 - These match n*a to n*a + b-1 times atom - - The created rule will not have any shift/reduce conflicts so that it can be used with lalr - - Example rule when a=3, b=4: - - new_rule: target_opt - | target target_opt - | target target target_opt - - | target target target - | target target target atom - | target target target atom atom - | target target target atom atom atom - - """ - key = (a, b, target, atom, "opt") - try: - return self.rules_cache[key] - except KeyError: - new_name = self._name_rule('repeat_a%d_b%d_opt' % (a, b)) - tree = ST('expansions', [ - ST('expansion', [target]*i + [target_opt]) for i in range(a) - ] + [ - ST('expansion', [target]*a + [atom]*i) for i in range(b) - ]) - return self._add_rule(key, new_name, tree) - - def _generate_repeats(self, rule: Tree, mn: int, mx: int): - """Generates a rule tree that repeats ``rule`` exactly between ``mn`` to ``mx`` times. - """ - # For a small number of repeats, we can take the naive approach - if mx < REPEAT_BREAK_THRESHOLD: - return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) - - # For large repeat values, we break the repetition into sub-rules. - # We treat ``rule~mn..mx`` as ``rule~mn rule~0..(diff=mx-mn)``. - # We then use small_factors to split up mn and diff up into values [(a, b), ...] - # This values are used with the help of _add_repeat_rule and _add_repeat_rule_opt - # to generate a complete rule/expression that matches the corresponding number of repeats - mn_target = rule - for a, b in small_factors(mn, SMALL_FACTOR_THRESHOLD): - mn_target = self._add_repeat_rule(a, b, mn_target, rule) - if mx == mn: - return mn_target - - diff = mx - mn + 1 # We add one because _add_repeat_opt_rule generates rules that match one less - diff_factors = small_factors(diff, SMALL_FACTOR_THRESHOLD) - diff_target = rule # Match rule 1 times - diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. up to 1 -1 times) - for a, b in diff_factors[:-1]: - diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) - diff_target = self._add_repeat_rule(a, b, diff_target, rule) - - a, b = diff_factors[-1] - diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) - - return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])]) - - def expr(self, rule: Tree, op: Token, *args): - if op.value == '?': - empty = ST('expansion', []) - return ST('expansions', [rule, empty]) - elif op.value == '+': - # a : b c+ d - # --> - # a : b _c d - # _c : _c c | c; - return self._add_recurse_rule('plus', rule) - elif op.value == '*': - # a : b c* d - # --> - # a : b _c? 
d - # _c : _c c | c; - new_name = self._add_recurse_rule('star', rule) - return ST('expansions', [new_name, ST('expansion', [])]) - elif op.value == '~': - if len(args) == 1: - mn = mx = int(args[0]) - else: - mn, mx = map(int, args) - if mx < mn or mn < 0: - raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) - - return self._generate_repeats(rule, mn, mx) - - assert False, op - - def maybe(self, rule: Tree): - keep_all_tokens = self.rule_options and self.rule_options.keep_all_tokens - rule_size = FindRuleSize(keep_all_tokens).transform(rule) - empty = ST('expansion', [_EMPTY] * rule_size) - return ST('expansions', [rule, empty]) - - -class SimplifyRule_Visitor(Visitor): - - @staticmethod - def _flatten(tree: Tree): - while tree.expand_kids_by_data(tree.data): - pass - - def expansion(self, tree: Tree): - # rules_list unpacking - # a : b (c|d) e - # --> - # a : b c e | b d e - # - # In AST terms: - # expansion(b, expansions(c, d), e) - # --> - # expansions( expansion(b, c, e), expansion(b, d, e) ) - - self._flatten(tree) - - for i, child in enumerate(tree.children): - if isinstance(child, Tree) and child.data == 'expansions': - tree.data = 'expansions' - tree.children = [self.visit(ST('expansion', [option if i == j else other - for j, other in enumerate(tree.children)])) - for option in dedup_list(child.children)] - self._flatten(tree) - break - - def alias(self, tree): - rule, alias_name = tree.children - if rule.data == 'expansions': - aliases = [] - for child in tree.children[0].children: - aliases.append(ST('alias', [child, alias_name])) - tree.data = 'expansions' - tree.children = aliases - - def expansions(self, tree: Tree): - self._flatten(tree) - # Ensure all children are unique - if len(set(tree.children)) != len(tree.children): - tree.children = dedup_list(tree.children) # dedup is expensive, so try to minimize its use - - -class RuleTreeToText(Transformer): - def expansions(self, x): - return x - - def expansion(self, symbols): - return symbols, None - - def alias(self, x): - (expansion, _alias), alias = x - assert _alias is None, (alias, expansion, '-', _alias) # Double alias not allowed - return expansion, alias.name - - -class PrepareAnonTerminals(Transformer_InPlace): - """Create a unique list of anonymous terminals. 
Attempt to give meaningful names to them when we add them""" - - def __init__(self, terminals): - self.terminals = terminals - self.term_set = {td.name for td in self.terminals} - self.term_reverse = {td.pattern: td for td in terminals} - self.i = 0 - self.rule_options = None - - @inline_args - def pattern(self, p): - value = p.value - if p in self.term_reverse and p.flags != self.term_reverse[p].pattern.flags: - raise GrammarError(u'Conflicting flags for the same terminal: %s' % p) - - term_name = None - - if isinstance(p, PatternStr): - try: - # If already defined, use the user-defined terminal name - term_name = self.term_reverse[p].name - except KeyError: - # Try to assign an indicative anon-terminal name - try: - term_name = _TERMINAL_NAMES[value] - except KeyError: - if value and is_id_continue(value) and is_id_start(value[0]) and value.upper() not in self.term_set: - term_name = value.upper() - - if term_name in self.term_set: - term_name = None - - elif isinstance(p, PatternRE): - if p in self.term_reverse: # Kind of a weird placement.name - term_name = self.term_reverse[p].name - else: - assert False, p - - if term_name is None: - term_name = '__ANON_%d' % self.i - self.i += 1 - - if term_name not in self.term_set: - assert p not in self.term_reverse - self.term_set.add(term_name) - termdef = TerminalDef(term_name, p) - self.term_reverse[p] = termdef - self.terminals.append(termdef) - - filter_out = False if self.rule_options and self.rule_options.keep_all_tokens else isinstance(p, PatternStr) - - return Terminal(term_name, filter_out=filter_out) - - -class _ReplaceSymbols(Transformer_InPlace): - """Helper for ApplyTemplates""" - - def __init__(self): - self.names = {} - - def value(self, c): - if len(c) == 1 and isinstance(c[0], Symbol) and c[0].name in self.names: - return self.names[c[0].name] - return self.__default__('value', c, None) - - def template_usage(self, c): - name = c[0].name - if name in self.names: - return self.__default__('template_usage', [self.names[name]] + c[1:], None) - return self.__default__('template_usage', c, None) - - -class ApplyTemplates(Transformer_InPlace): - """Apply the templates, creating new rules that represent the used templates""" - - def __init__(self, rule_defs): - self.rule_defs = rule_defs - self.replacer = _ReplaceSymbols() - self.created_templates = set() - - def template_usage(self, c): - name = c[0].name - args = c[1:] - result_name = "%s{%s}" % (name, ",".join(a.name for a in args)) - if result_name not in self.created_templates: - self.created_templates.add(result_name) - (_n, params, tree, options) ,= (t for t in self.rule_defs if t[0] == name) - assert len(params) == len(args), args - result_tree = deepcopy(tree) - self.replacer.names = dict(zip(params, args)) - self.replacer.transform(result_tree) - self.rule_defs.append((result_name, [], result_tree, deepcopy(options))) - return NonTerminal(result_name) - - -def _rfind(s, choices): - return max(s.rfind(c) for c in choices) - - -def eval_escaping(s): - w = '' - i = iter(s) - for n in i: - w += n - if n == '\\': - try: - n2 = next(i) - except StopIteration: - raise GrammarError("Literal ended unexpectedly (bad escaping): `%r`" % s) - if n2 == '\\': - w += '\\\\' - elif n2 not in 'Uuxnftr': - w += '\\' - w += n2 - w = w.replace('\\"', '"').replace("'", "\\'") - - to_eval = "u'''%s'''" % w - try: - s = literal_eval(to_eval) - except SyntaxError as e: - raise GrammarError(s, e) - - return s - - -def _literal_to_pattern(literal): - assert isinstance(literal, Token) - v = 
literal.value - flag_start = _rfind(v, '/"')+1 - assert flag_start > 0 - flags = v[flag_start:] - assert all(f in _RE_FLAGS for f in flags), flags - - if literal.type == 'STRING' and '\n' in v: - raise GrammarError('You cannot put newlines in string literals') - - if literal.type == 'REGEXP' and '\n' in v and 'x' not in flags: - raise GrammarError('You can only use newlines in regular expressions ' - 'with the `x` (verbose) flag') - - v = v[:flag_start] - assert v[0] == v[-1] and v[0] in '"/' - x = v[1:-1] - - s = eval_escaping(x) - - if s == "": - raise GrammarError("Empty terminals are not allowed (%s)" % literal) - - if literal.type == 'STRING': - s = s.replace('\\\\', '\\') - return PatternStr(s, flags, raw=literal.value) - elif literal.type == 'REGEXP': - return PatternRE(s, flags, raw=literal.value) - else: - assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]' - - -@inline_args -class PrepareLiterals(Transformer_InPlace): - def literal(self, literal): - return ST('pattern', [_literal_to_pattern(literal)]) - - def range(self, start, end): - assert start.type == end.type == 'STRING' - start = start.value[1:-1] - end = end.value[1:-1] - assert len(eval_escaping(start)) == len(eval_escaping(end)) == 1 - regexp = '[%s-%s]' % (start, end) - return ST('pattern', [PatternRE(regexp)]) - - -def _make_joined_pattern(regexp, flags_set) -> PatternRE: - return PatternRE(regexp, ()) - -class TerminalTreeToPattern(Transformer_NonRecursive): - def pattern(self, ps): - p ,= ps - return p - - def expansion(self, items: List[Pattern]) -> Pattern: - if not items: - return PatternStr('') - - if len(items) == 1: - return items[0] - - pattern = ''.join(i.to_regexp() for i in items) - return _make_joined_pattern(pattern, {i.flags for i in items}) - - def expansions(self, exps: List[Pattern]) -> Pattern: - if len(exps) == 1: - return exps[0] - - # Do a bit of sorting to make sure that the longest option is returned - # (Python's re module otherwise prefers just 'l' when given (l|ll) and both could match) - exps.sort(key=lambda x: (-x.max_width, -x.min_width, -len(x.value))) - - pattern = '(?:%s)' % ('|'.join(i.to_regexp() for i in exps)) - return _make_joined_pattern(pattern, {i.flags for i in exps}) - - def expr(self, args) -> Pattern: - inner: Pattern - inner, op = args[:2] - if op == '~': - if len(args) == 3: - op = "{%d}" % int(args[2]) - else: - mn, mx = map(int, args[2:]) - if mx < mn: - raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (inner, mn, mx)) - op = "{%d,%d}" % (mn, mx) - else: - assert len(args) == 2 - return PatternRE('(?:%s)%s' % (inner.to_regexp(), op), inner.flags) - - def maybe(self, expr): - return self.expr(expr + ['?']) - - def alias(self, t): - raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)") - - def value(self, v): - return v[0] - - -class ValidateSymbols(Transformer_InPlace): - def value(self, v): - v ,= v - assert isinstance(v, (Tree, Symbol)) - return v - - -def nr_deepcopy_tree(t): - """Deepcopy tree `t` without recursion""" - return Transformer_NonRecursive(False).transform(t) - - -class Grammar: - - term_defs: List[Tuple[str, Tuple[Tree, int]]] - rule_defs: List[Tuple[str, Tuple[str, ...], Tree, RuleOptions]] - ignore: List[str] - - def __init__(self, rule_defs: List[Tuple[str, Tuple[str, ...], Tree, RuleOptions]], term_defs: List[Tuple[str, Tuple[Tree, int]]], ignore: List[str]) -> None: - self.term_defs = term_defs - self.rule_defs = rule_defs - self.ignore = ignore - - def compile(self, start, 
terminals_to_keep) -> Tuple[List[TerminalDef], List[Rule], List[str]]: - # We change the trees in-place (to support huge grammars) - # So deepcopy allows calling compile more than once. - term_defs = [(n, (nr_deepcopy_tree(t), p)) for n, (t, p) in self.term_defs] - rule_defs = [(n, p, nr_deepcopy_tree(t), o) for n, p, t, o in self.rule_defs] - - # =================== - # Compile Terminals - # =================== - - # Convert terminal-trees to strings/regexps - - for name, (term_tree, priority) in term_defs: - if term_tree is None: # Terminal added through %declare - continue - expansions = list(term_tree.find_data('expansion')) - if len(expansions) == 1 and not expansions[0].children: - raise GrammarError("Terminals cannot be empty (%s)" % name) - - transformer = PrepareLiterals() * TerminalTreeToPattern() - terminals = [TerminalDef(name, transformer.transform(term_tree), priority) - for name, (term_tree, priority) in term_defs if term_tree] - - # ================= - # Compile Rules - # ================= - - # 1. Pre-process terminals - anon_tokens_transf = PrepareAnonTerminals(terminals) - transformer = PrepareLiterals() * ValidateSymbols() * anon_tokens_transf # Adds to terminals - - # 2. Inline Templates - - transformer *= ApplyTemplates(rule_defs) - - # 3. Convert EBNF to BNF (and apply step 1 & 2) - ebnf_to_bnf = EBNF_to_BNF() - rules = [] - i = 0 - while i < len(rule_defs): # We have to do it like this because rule_defs might grow due to templates - name, params, rule_tree, options = rule_defs[i] - i += 1 - if len(params) != 0: # Dont transform templates - continue - rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None - ebnf_to_bnf.rule_options = rule_options - ebnf_to_bnf.prefix = name - anon_tokens_transf.rule_options = rule_options - tree = transformer.transform(rule_tree) - res: Tree = ebnf_to_bnf.transform(tree) - rules.append((name, res, options)) - rules += ebnf_to_bnf.new_rules - - assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision" - - # 4. 
Compile tree to Rule objects - rule_tree_to_text = RuleTreeToText() - - simplify_rule = SimplifyRule_Visitor() - compiled_rules: List[Rule] = [] - for rule_content in rules: - name, tree, options = rule_content - simplify_rule.visit(tree) - expansions = rule_tree_to_text.transform(tree) - - for i, (expansion, alias) in enumerate(expansions): - if alias and name.startswith('_'): - raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)"% (name, alias)) - - empty_indices = tuple(x==_EMPTY for x in expansion) - if any(empty_indices): - exp_options = copy(options) or RuleOptions() - exp_options.empty_indices = empty_indices - expansion = [x for x in expansion if x!=_EMPTY] - else: - exp_options = options - - for sym in expansion: - assert isinstance(sym, Symbol) - if sym.is_term and exp_options and exp_options.keep_all_tokens: - assert isinstance(sym, Terminal) - sym.filter_out = False - rule = Rule(NonTerminal(name), expansion, i, alias, exp_options) - compiled_rules.append(rule) - - # Remove duplicates of empty rules, throw error for non-empty duplicates - if len(set(compiled_rules)) != len(compiled_rules): - duplicates = classify(compiled_rules, lambda x: x) - for dups in duplicates.values(): - if len(dups) > 1: - if dups[0].expansion: - raise GrammarError("Rules defined twice: %s\n\n(Might happen due to colliding expansion of optionals: [] or ?)" - % ''.join('\n * %s' % i for i in dups)) - - # Empty rule; assert all other attributes are equal - assert len({(r.alias, r.order, r.options) for r in dups}) == len(dups) - - # Remove duplicates - compiled_rules = list(OrderedSet(compiled_rules)) - - # Filter out unused rules - while True: - c = len(compiled_rules) - used_rules = {s for r in compiled_rules - for s in r.expansion - if isinstance(s, NonTerminal) - and s != r.origin} - used_rules |= {NonTerminal(s) for s in start} - compiled_rules, unused = classify_bool(compiled_rules, lambda r: r.origin in used_rules) - for r in unused: - logger.debug("Unused rule: %s", r) - if len(compiled_rules) == c: - break - - # Filter out unused terminals - if terminals_to_keep != '*': - used_terms = {t.name for r in compiled_rules - for t in r.expansion - if isinstance(t, Terminal)} - terminals, unused = classify_bool(terminals, lambda t: t.name in used_terms or t.name in self.ignore or t.name in terminals_to_keep) - if unused: - logger.debug("Unused terminals: %s", [t.name for t in unused]) - - return terminals, compiled_rules, self.ignore - - -PackageResource = namedtuple('PackageResource', 'pkg_name path') - - -class FromPackageLoader: - """ - Provides a simple way of creating custom import loaders that load from packages via ``pkgutil.get_data`` instead of using `open`. - This allows them to be compatible even from within zip files. - - Relative imports are handled, so you can just freely use them. - - pkg_name: The name of the package. You can probably provide `__name__` most of the time - search_paths: All the path that will be search on absolute imports. 
- """ - - pkg_name: str - search_paths: Sequence[str] - - def __init__(self, pkg_name: str, search_paths: Sequence[str]=("", )) -> None: - self.pkg_name = pkg_name - self.search_paths = search_paths - - def __repr__(self): - return "%s(%r, %r)" % (type(self).__name__, self.pkg_name, self.search_paths) - - def __call__(self, base_path: Union[None, str, PackageResource], grammar_path: str) -> Tuple[PackageResource, str]: - if base_path is None: - to_try = self.search_paths - else: - # Check whether or not the importing grammar was loaded by this module. - if not isinstance(base_path, PackageResource) or base_path.pkg_name != self.pkg_name: - # Technically false, but FileNotFound doesn't exist in python2.7, and this message should never reach the end user anyway - raise IOError() - to_try = [base_path.path] - - err = None - for path in to_try: - full_path = os.path.join(path, grammar_path) - try: - text: Optional[bytes] = pkgutil.get_data(self.pkg_name, full_path) - except IOError as e: - err = e - continue - else: - return PackageResource(self.pkg_name, full_path), (text.decode() if text else '') - - raise IOError('Cannot find grammar in given paths') from err - - -stdlib_loader = FromPackageLoader('lark', IMPORT_PATHS) - - - -def resolve_term_references(term_dict): - # TODO Solve with transitive closure (maybe) - - while True: - changed = False - for name, token_tree in term_dict.items(): - if token_tree is None: # Terminal added through %declare - continue - for exp in token_tree.find_data('value'): - item ,= exp.children - if isinstance(item, NonTerminal): - raise GrammarError("Rules aren't allowed inside terminals (%s in %s)" % (item, name)) - elif isinstance(item, Terminal): - try: - term_value = term_dict[item.name] - except KeyError: - raise GrammarError("Terminal used but not defined: %s" % item.name) - assert term_value is not None - exp.children[0] = term_value - changed = True - else: - assert isinstance(item, Tree) - if not changed: - break - - for name, term in term_dict.items(): - if term: # Not just declared - for child in term.children: - ids = [id(x) for x in child.iter_subtrees()] - if id(term) in ids: - raise GrammarError("Recursion in terminal '%s' (recursion is only allowed in rules, not terminals)" % name) - - - -def symbol_from_strcase(s): - assert isinstance(s, str) - return Terminal(s, filter_out=s.startswith('_')) if s.isupper() else NonTerminal(s) - -@inline_args -class PrepareGrammar(Transformer_InPlace): - def terminal(self, name): - return Terminal(str(name), filter_out=name.startswith('_')) - - def nonterminal(self, name): - return NonTerminal(name.value) - - -def _find_used_symbols(tree): - assert tree.data == 'expansions' - return {t.name for x in tree.find_data('expansion') - for t in x.scan_values(lambda t: isinstance(t, Symbol))} - - -def _get_parser(): - try: - return _get_parser.cache - except AttributeError: - terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()] - - rules = [(name.lstrip('?'), x, RuleOptions(expand1=name.startswith('?'))) - for name, x in RULES.items()] - rules = [Rule(NonTerminal(r), [symbol_from_strcase(s) for s in x.split()], i, None, o) - for r, xs, o in rules for i, x in enumerate(xs)] - - callback = ParseTreeBuilder(rules, ST).create_callback() - import re - lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT', 'BACKSLASH']) - parser_conf = ParserConf(rules, callback, ['start']) - lexer_conf.lexer_type = 'basic' - parser_conf.parser_type = 'lalr' - _get_parser.cache = ParsingFrontend(lexer_conf, 
parser_conf, None) - return _get_parser.cache - -GRAMMAR_ERRORS = [ - ('Incorrect type of value', ['a: 1\n']), - ('Unclosed parenthesis', ['a: (\n']), - ('Unmatched closing parenthesis', ['a: )\n', 'a: [)\n', 'a: (]\n']), - ('Expecting rule or terminal definition (missing colon)', ['a\n', 'A\n', 'a->\n', 'A->\n', 'a A\n']), - ('Illegal name for rules or terminals', ['Aa:\n']), - ('Alias expects lowercase name', ['a: -> "a"\n']), - ('Unexpected colon', ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n']), - ('Misplaced operator', ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n']), - ('Expecting option ("|") or a new rule or terminal definition', ['a:a\n()\n']), - ('Terminal names cannot contain dots', ['A.B\n']), - ('Expecting rule or terminal definition', ['"a"\n']), - ('%import expects a name', ['%import "a"\n']), - ('%ignore expects a value', ['%ignore %import\n']), - ] - -def _translate_parser_exception(parse, e): - error = e.match_examples(parse, GRAMMAR_ERRORS, use_accepts=True) - if error: - return error - elif 'STRING' in e.expected: - return "Expecting a value" - -def _parse_grammar(text, name, start='start'): - try: - tree = _get_parser().parse(text + '\n', start) - except UnexpectedCharacters as e: - context = e.get_context(text) - raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" % - (e.line, e.column, name, context)) - except UnexpectedToken as e: - context = e.get_context(text) - error = _translate_parser_exception(_get_parser().parse, e) - if error: - raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context)) - raise - - return PrepareGrammar().transform(tree) - - -def _error_repr(error): - if isinstance(error, UnexpectedToken): - error2 = _translate_parser_exception(_get_parser().parse, error) - if error2: - return error2 - expected = ', '.join(error.accepts or error.expected) - return "Unexpected token %r. Expected one of: {%s}" % (str(error.token), expected) - else: - return str(error) - -def _search_interactive_parser(interactive_parser, predicate): - def expand(node): - path, p = node - for choice in p.choices(): - t = Token(choice, '') - try: - new_p = p.feed_token(t) - except ParseError: # Illegal - pass - else: - yield path + (choice,), new_p - - for path, p in bfs_all_unique([((), interactive_parser)], expand): - if predicate(p): - return path, p - -def find_grammar_errors(text: str, start: str='start') -> List[Tuple[UnexpectedInput, str]]: - errors = [] - def on_error(e): - errors.append((e, _error_repr(e))) - - # recover to a new line - token_path, _ = _search_interactive_parser(e.interactive_parser.as_immutable(), lambda p: '_NL' in p.choices()) - for token_type in token_path: - e.interactive_parser.feed_token(Token(token_type, '')) - e.interactive_parser.feed_token(Token('_NL', '\n')) - return True - - _tree = _get_parser().parse(text + '\n', start, on_error=on_error) - - errors_by_line = classify(errors, lambda e: e[0].line) - errors = [el[0] for el in errors_by_line.values()] # already sorted - - for e in errors: - e[0].interactive_parser = None - return errors - - -def _get_mangle(prefix, aliases, base_mangle=None): - def mangle(s): - if s in aliases: - s = aliases[s] - else: - if s[0] == '_': - s = '_%s__%s' % (prefix, s[1:]) - else: - s = '%s__%s' % (prefix, s) - if base_mangle is not None: - s = base_mangle(s) - return s - return mangle - -def _mangle_definition_tree(exp, mangle): - if mangle is None: - return exp - exp = deepcopy(exp) # TODO: is this needed? 
- for t in exp.iter_subtrees(): - for i, c in enumerate(t.children): - if isinstance(c, Symbol): - t.children[i] = c.renamed(mangle) - - return exp - -def _make_rule_tuple(modifiers_tree, name, params, priority_tree, expansions): - if modifiers_tree.children: - m ,= modifiers_tree.children - expand1 = '?' in m - if expand1 and name.startswith('_'): - raise GrammarError("Inlined rules (_rule) cannot use the ?rule modifier.") - keep_all_tokens = '!' in m - else: - keep_all_tokens = False - expand1 = False - - if priority_tree.children: - p ,= priority_tree.children - priority = int(p) - else: - priority = None - - if params is not None: - params = [t.value for t in params.children] # For the grammar parser - - return name, params, expansions, RuleOptions(keep_all_tokens, expand1, priority=priority, - template_source=(name if params else None)) - - -class Definition: - def __init__(self, is_term, tree, params=(), options=None): - self.is_term = is_term - self.tree = tree - self.params = tuple(params) - self.options = options - -class GrammarBuilder: - - global_keep_all_tokens: bool - import_paths: List[Union[str, Callable]] - used_files: Dict[str, str] - - _definitions: Dict[str, Definition] - _ignore_names: List[str] - - def __init__(self, global_keep_all_tokens: bool=False, import_paths: Optional[List[Union[str, Callable]]]=None, used_files: Optional[Dict[str, str]]=None) -> None: - self.global_keep_all_tokens = global_keep_all_tokens - self.import_paths = import_paths or [] - self.used_files = used_files or {} - - self._definitions: Dict[str, Definition] = {} - self._ignore_names: List[str] = [] - - def _grammar_error(self, is_term, msg, *names): - args = {} - for i, name in enumerate(names, start=1): - postfix = '' if i == 1 else str(i) - args['name' + postfix] = name - args['type' + postfix] = lowercase_type = ("rule", "terminal")[is_term] - args['Type' + postfix] = lowercase_type.title() - raise GrammarError(msg.format(**args)) - - def _check_options(self, is_term, options): - if is_term: - if options is None: - options = 1 - elif not isinstance(options, int): - raise GrammarError("Terminal require a single int as 'options' (e.g. 
priority), got %s" % (type(options),)) - else: - if options is None: - options = RuleOptions() - elif not isinstance(options, RuleOptions): - raise GrammarError("Rules require a RuleOptions instance as 'options'") - if self.global_keep_all_tokens: - options.keep_all_tokens = True - return options - - - def _define(self, name, is_term, exp, params=(), options=None, *, override=False): - if name in self._definitions: - if not override: - self._grammar_error(is_term, "{Type} '{name}' defined more than once", name) - elif override: - self._grammar_error(is_term, "Cannot override a nonexisting {type} {name}", name) - - if name.startswith('__'): - self._grammar_error(is_term, 'Names starting with double-underscore are reserved (Error at {name})', name) - - self._definitions[name] = Definition(is_term, exp, params, self._check_options(is_term, options)) - - def _extend(self, name, is_term, exp, params=(), options=None): - if name not in self._definitions: - self._grammar_error(is_term, "Can't extend {type} {name} as it wasn't defined before", name) - - d = self._definitions[name] - - if is_term != d.is_term: - self._grammar_error(is_term, "Cannot extend {type} {name} - one is a terminal, while the other is not.", name) - if tuple(params) != d.params: - self._grammar_error(is_term, "Cannot extend {type} with different parameters: {name}", name) - - if d.tree is None: - self._grammar_error(is_term, "Can't extend {type} {name} - it is abstract.", name) - - # TODO: think about what to do with 'options' - base = d.tree - - assert isinstance(base, Tree) and base.data == 'expansions' - base.children.insert(0, exp) - - def _ignore(self, exp_or_name): - if isinstance(exp_or_name, str): - self._ignore_names.append(exp_or_name) - else: - assert isinstance(exp_or_name, Tree) - t = exp_or_name - if t.data == 'expansions' and len(t.children) == 1: - t2 ,= t.children - if t2.data=='expansion' and len(t2.children) == 1: - item ,= t2.children - if item.data == 'value': - item ,= item.children - if isinstance(item, Terminal): - # Keep terminal name, no need to create a new definition - self._ignore_names.append(item.name) - return - - name = '__IGNORE_%d'% len(self._ignore_names) - self._ignore_names.append(name) - self._definitions[name] = Definition(True, t, options=TOKEN_DEFAULT_PRIORITY) - - def _unpack_import(self, stmt, grammar_name): - if len(stmt.children) > 1: - path_node, arg1 = stmt.children - else: - path_node, = stmt.children - arg1 = None - - if isinstance(arg1, Tree): # Multi import - dotted_path = tuple(path_node.children) - names = arg1.children - aliases = dict(zip(names, names)) # Can't have aliased multi import, so all aliases will be the same as names - else: # Single import - dotted_path = tuple(path_node.children[:-1]) - if not dotted_path: - name ,= path_node.children - raise GrammarError("Nothing was imported from grammar `%s`" % name) - name = path_node.children[-1] # Get name from dotted path - aliases = {name.value: (arg1 or name).value} # Aliases if exist - - if path_node.data == 'import_lib': # Import from library - base_path = None - else: # Relative import - if grammar_name == '': # Import relative to script file path if grammar is coded in script - try: - base_file = os.path.abspath(sys.modules['__main__'].__file__) - except AttributeError: - base_file = None - else: - base_file = grammar_name # Import relative to grammar file path if external grammar file - if base_file: - if isinstance(base_file, PackageResource): - base_path = PackageResource(base_file.pkg_name, 
os.path.split(base_file.path)[0]) - else: - base_path = os.path.split(base_file)[0] - else: - base_path = os.path.abspath(os.path.curdir) - - return dotted_path, base_path, aliases - - def _unpack_definition(self, tree, mangle): - - if tree.data == 'rule': - name, params, exp, opts = _make_rule_tuple(*tree.children) - is_term = False - else: - name = tree.children[0].value - params = () # TODO terminal templates - opts = int(tree.children[1]) if len(tree.children) == 3 else TOKEN_DEFAULT_PRIORITY # priority - exp = tree.children[-1] - is_term = True - - if mangle is not None: - params = tuple(mangle(p) for p in params) - name = mangle(name) - - exp = _mangle_definition_tree(exp, mangle) - return name, is_term, exp, params, opts - - - def load_grammar(self, grammar_text: str, grammar_name: str="", mangle: Optional[Callable[[str], str]]=None) -> None: - tree = _parse_grammar(grammar_text, grammar_name) - - imports: Dict[Tuple[str, ...], Tuple[Optional[str], Dict[str, str]]] = {} - - for stmt in tree.children: - if stmt.data == 'import': - dotted_path, base_path, aliases = self._unpack_import(stmt, grammar_name) - try: - import_base_path, import_aliases = imports[dotted_path] - assert base_path == import_base_path, 'Inconsistent base_path for %s.' % '.'.join(dotted_path) - import_aliases.update(aliases) - except KeyError: - imports[dotted_path] = base_path, aliases - - for dotted_path, (base_path, aliases) in imports.items(): - self.do_import(dotted_path, base_path, aliases, mangle) - - for stmt in tree.children: - if stmt.data in ('term', 'rule'): - self._define(*self._unpack_definition(stmt, mangle)) - elif stmt.data == 'override': - r ,= stmt.children - self._define(*self._unpack_definition(r, mangle), override=True) - elif stmt.data == 'extend': - r ,= stmt.children - self._extend(*self._unpack_definition(r, mangle)) - elif stmt.data == 'ignore': - # if mangle is not None, we shouldn't apply ignore, since we aren't in a toplevel grammar - if mangle is None: - self._ignore(*stmt.children) - elif stmt.data == 'declare': - for symbol in stmt.children: - assert isinstance(symbol, Symbol), symbol - is_term = isinstance(symbol, Terminal) - if mangle is None: - name = symbol.name - else: - name = mangle(symbol.name) - self._define(name, is_term, None) - elif stmt.data == 'import': - pass - else: - assert False, stmt - - - term_defs = { name: d.tree - for name, d in self._definitions.items() - if d.is_term - } - resolve_term_references(term_defs) - - - def _remove_unused(self, used): - def rule_dependencies(symbol): - try: - d = self._definitions[symbol] - except KeyError: - return [] - if d.is_term: - return [] - return _find_used_symbols(d.tree) - set(d.params) - - _used = set(bfs(used, rule_dependencies)) - self._definitions = {k: v for k, v in self._definitions.items() if k in _used} - - - def do_import(self, dotted_path: Tuple[str, ...], base_path: Optional[str], aliases: Dict[str, str], base_mangle: Optional[Callable[[str], str]]=None) -> None: - assert dotted_path - mangle = _get_mangle('__'.join(dotted_path), aliases, base_mangle) - grammar_path = os.path.join(*dotted_path) + EXT - to_try = self.import_paths + ([base_path] if base_path is not None else []) + [stdlib_loader] - for source in to_try: - try: - if callable(source): - joined_path, text = source(base_path, grammar_path) - else: - joined_path = os.path.join(source, grammar_path) - with open(joined_path, encoding='utf8') as f: - text = f.read() - except IOError: - continue - else: - h = sha256_digest(text) - if 
self.used_files.get(joined_path, h) != h: - raise RuntimeError("Grammar file was changed during importing") - self.used_files[joined_path] = h - - gb = GrammarBuilder(self.global_keep_all_tokens, self.import_paths, self.used_files) - gb.load_grammar(text, joined_path, mangle) - gb._remove_unused(map(mangle, aliases)) - for name in gb._definitions: - if name in self._definitions: - raise GrammarError("Cannot import '%s' from '%s': Symbol already defined." % (name, grammar_path)) - - self._definitions.update(**gb._definitions) - break - else: - # Search failed. Make Python throw a nice error. - open(grammar_path, encoding='utf8') - assert False, "Couldn't import grammar %s, but a corresponding file was found at a place where lark doesn't search for it" % (dotted_path,) - - - def validate(self) -> None: - for name, d in self._definitions.items(): - params = d.params - exp = d.tree - - for i, p in enumerate(params): - if p in self._definitions: - raise GrammarError("Template Parameter conflicts with rule %s (in template %s)" % (p, name)) - if p in params[:i]: - raise GrammarError("Duplicate Template Parameter %s (in template %s)" % (p, name)) - - if exp is None: # Remaining checks don't apply to abstract rules/terminals (created with %declare) - continue - - for temp in exp.find_data('template_usage'): - sym = temp.children[0].name - args = temp.children[1:] - if sym not in params: - if sym not in self._definitions: - self._grammar_error(d.is_term, "Template '%s' used but not defined (in {type} {name})" % sym, name) - if len(args) != len(self._definitions[sym].params): - expected, actual = len(self._definitions[sym].params), len(args) - self._grammar_error(d.is_term, "Wrong number of template arguments used for {name} " - "(expected %s, got %s) (in {type2} {name2})" % (expected, actual), sym, name) - - for sym in _find_used_symbols(exp): - if sym not in self._definitions and sym not in params: - self._grammar_error(d.is_term, "{Type} '{name}' used but not defined (in {type2} {name2})", sym, name) - - if not set(self._definitions).issuperset(self._ignore_names): - raise GrammarError("Terminals %s were marked to ignore but were not defined!" % (set(self._ignore_names) - set(self._definitions))) - - def build(self) -> Grammar: - self.validate() - rule_defs = [] - term_defs = [] - for name, d in self._definitions.items(): - (params, exp, options) = d.params, d.tree, d.options - if d.is_term: - assert len(params) == 0 - term_defs.append((name, (exp, options))) - else: - rule_defs.append((name, params, exp, options)) - # resolve_term_references(term_defs) - return Grammar(rule_defs, term_defs, self._ignore_names) - - -def verify_used_files(file_hashes): - for path, old in file_hashes.items(): - text = None - if isinstance(path, str) and os.path.exists(path): - with open(path, encoding='utf8') as f: - text = f.read() - elif isinstance(path, PackageResource): - with suppress(IOError): - text = pkgutil.get_data(*path).decode('utf-8') - if text is None: # We don't know how to load the path. ignore it. 
- continue - - current = sha256_digest(text) - if old != current: - logger.info("File %r changed, rebuilding Parser" % path) - return False - return True - -def list_grammar_imports(grammar, import_paths=[]): - "Returns a list of paths to the lark grammars imported by the given grammar (recursively)" - builder = GrammarBuilder(False, import_paths) - builder.load_grammar(grammar, '') - return list(builder.used_files.keys()) - -def load_grammar(grammar, source, import_paths, global_keep_all_tokens): - builder = GrammarBuilder(global_keep_all_tokens, import_paths) - builder.load_grammar(grammar, source) - return builder.build(), builder.used_files - - -def sha256_digest(s: str) -> str: - """Get the sha256 digest of a string - - Supports the `usedforsecurity` argument for Python 3.9+ to allow running on - a FIPS-enabled system. - """ - if sys.version_info >= (3, 9): - return hashlib.sha256(s.encode('utf8'), usedforsecurity=False).hexdigest() - else: - return hashlib.sha256(s.encode('utf8')).hexdigest() diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py deleted file mode 100644 index e3a4171..0000000 --- a/lark/parse_tree_builder.py +++ /dev/null @@ -1,391 +0,0 @@ -"""Provides functions for the automatic building and shaping of the parse-tree.""" - -from typing import List - -from .exceptions import GrammarError, ConfigurationError -from .lexer import Token -from .tree import Tree -from .visitors import Transformer_InPlace -from .visitors import _vargs_meta, _vargs_meta_inline - -###{standalone -from functools import partial, wraps -from itertools import product - - -class ExpandSingleChild: - def __init__(self, node_builder): - self.node_builder = node_builder - - def __call__(self, children): - if len(children) == 1: - return children[0] - else: - return self.node_builder(children) - - - -class PropagatePositions: - def __init__(self, node_builder, node_filter=None): - self.node_builder = node_builder - self.node_filter = node_filter - - def __call__(self, children): - res = self.node_builder(children) - - if isinstance(res, Tree): - # Calculate positions while the tree is streaming, according to the rule: - # - nodes start at the start of their first child's container, - # and end at the end of their last child's container. - # Containers are nodes that take up space in text, but have been inlined in the tree. - - res_meta = res.meta - - first_meta = self._pp_get_meta(children) - if first_meta is not None: - if not hasattr(res_meta, 'line'): - # meta was already set, probably because the rule has been inlined (e.g. 
`?rule`) - res_meta.line = getattr(first_meta, 'container_line', first_meta.line) - res_meta.column = getattr(first_meta, 'container_column', first_meta.column) - res_meta.start_pos = getattr(first_meta, 'container_start_pos', first_meta.start_pos) - res_meta.empty = False - - res_meta.container_line = getattr(first_meta, 'container_line', first_meta.line) - res_meta.container_column = getattr(first_meta, 'container_column', first_meta.column) - res_meta.container_start_pos = getattr(first_meta, 'container_start_pos', first_meta.start_pos) - - last_meta = self._pp_get_meta(reversed(children)) - if last_meta is not None: - if not hasattr(res_meta, 'end_line'): - res_meta.end_line = getattr(last_meta, 'container_end_line', last_meta.end_line) - res_meta.end_column = getattr(last_meta, 'container_end_column', last_meta.end_column) - res_meta.end_pos = getattr(last_meta, 'container_end_pos', last_meta.end_pos) - res_meta.empty = False - - res_meta.container_end_line = getattr(last_meta, 'container_end_line', last_meta.end_line) - res_meta.container_end_column = getattr(last_meta, 'container_end_column', last_meta.end_column) - res_meta.container_end_pos = getattr(last_meta, 'container_end_pos', last_meta.end_pos) - - return res - - def _pp_get_meta(self, children): - for c in children: - if self.node_filter is not None and not self.node_filter(c): - continue - if isinstance(c, Tree): - if not c.meta.empty: - return c.meta - elif isinstance(c, Token): - return c - elif hasattr(c, '__lark_meta__'): - return c.__lark_meta__() - -def make_propagate_positions(option): - if callable(option): - return partial(PropagatePositions, node_filter=option) - elif option is True: - return PropagatePositions - elif option is False: - return None - - raise ConfigurationError('Invalid option for propagate_positions: %r' % option) - - -class ChildFilter: - def __init__(self, to_include, append_none, node_builder): - self.node_builder = node_builder - self.to_include = to_include - self.append_none = append_none - - def __call__(self, children): - filtered = [] - - for i, to_expand, add_none in self.to_include: - if add_none: - filtered += [None] * add_none - if to_expand: - filtered += children[i].children - else: - filtered.append(children[i]) - - if self.append_none: - filtered += [None] * self.append_none - - return self.node_builder(filtered) - - -class ChildFilterLALR(ChildFilter): - """Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)""" - - def __call__(self, children): - filtered = [] - for i, to_expand, add_none in self.to_include: - if add_none: - filtered += [None] * add_none - if to_expand: - if filtered: - filtered += children[i].children - else: # Optimize for left-recursion - filtered = children[i].children - else: - filtered.append(children[i]) - - if self.append_none: - filtered += [None] * self.append_none - - return self.node_builder(filtered) - - -class ChildFilterLALR_NoPlaceholders(ChildFilter): - "Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)" - def __init__(self, to_include, node_builder): - self.node_builder = node_builder - self.to_include = to_include - - def __call__(self, children): - filtered = [] - for i, to_expand in self.to_include: - if to_expand: - if filtered: - filtered += children[i].children - else: # Optimize for left-recursion - filtered = children[i].children - else: - filtered.append(children[i]) - return self.node_builder(filtered) - - -def _should_expand(sym): - return not 
sym.is_term and sym.name.startswith('_') - - -def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous, _empty_indices: List[bool]): - # Prepare empty_indices as: How many Nones to insert at each index? - if _empty_indices: - assert _empty_indices.count(False) == len(expansion) - s = ''.join(str(int(b)) for b in _empty_indices) - empty_indices = [len(ones) for ones in s.split('0')] - assert len(empty_indices) == len(expansion)+1, (empty_indices, len(expansion)) - else: - empty_indices = [0] * (len(expansion)+1) - - to_include = [] - nones_to_add = 0 - for i, sym in enumerate(expansion): - nones_to_add += empty_indices[i] - if keep_all_tokens or not (sym.is_term and sym.filter_out): - to_include.append((i, _should_expand(sym), nones_to_add)) - nones_to_add = 0 - - nones_to_add += empty_indices[len(expansion)] - - if _empty_indices or len(to_include) < len(expansion) or any(to_expand for i, to_expand,_ in to_include): - if _empty_indices or ambiguous: - return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include, nones_to_add) - else: - # LALR without placeholders - return partial(ChildFilterLALR_NoPlaceholders, [(i, x) for i,x,_ in to_include]) - - -class AmbiguousExpander: - """Deal with the case where we're expanding children ('_rule') into a parent but the children - are ambiguous. i.e. (parent->_ambig->_expand_this_rule). In this case, make the parent itself - ambiguous with as many copies as there are ambiguous children, and then copy the ambiguous children - into the right parents in the right places, essentially shifting the ambiguity up the tree.""" - def __init__(self, to_expand, tree_class, node_builder): - self.node_builder = node_builder - self.tree_class = tree_class - self.to_expand = to_expand - - def __call__(self, children): - def _is_ambig_tree(t): - return hasattr(t, 'data') and t.data == '_ambig' - - # -- When we're repeatedly expanding ambiguities we can end up with nested ambiguities. - # All children of an _ambig node should be a derivation of that ambig node, hence - # it is safe to assume that if we see an _ambig node nested within an ambig node - # it is safe to simply expand it into the parent _ambig node as an alternative derivation. - ambiguous = [] - for i, child in enumerate(children): - if _is_ambig_tree(child): - if i in self.to_expand: - ambiguous.append(i) - - child.expand_kids_by_data('_ambig') - - if not ambiguous: - return self.node_builder(children) - - expand = [child.children if i in ambiguous else (child,) for i, child in enumerate(children)] - return self.tree_class('_ambig', [self.node_builder(list(f)) for f in product(*expand)]) - - -def maybe_create_ambiguous_expander(tree_class, expansion, keep_all_tokens): - to_expand = [i for i, sym in enumerate(expansion) - if keep_all_tokens or ((not (sym.is_term and sym.filter_out)) and _should_expand(sym))] - if to_expand: - return partial(AmbiguousExpander, to_expand, tree_class) - - -class AmbiguousIntermediateExpander: - """ - Propagate ambiguous intermediate nodes and their derivations up to the - current rule. - - In general, converts - - rule - _iambig - _inter - someChildren1 - ... - _inter - someChildren2 - ... - someChildren3 - ... - - to - - _ambig - rule - someChildren1 - ... - someChildren3 - ... - rule - someChildren2 - ... - someChildren3 - ... - rule - childrenFromNestedIambigs - ... - someChildren3 - ... - ... - - propagating up any nested '_iambig' nodes along the way. 
- """ - - def __init__(self, tree_class, node_builder): - self.node_builder = node_builder - self.tree_class = tree_class - - def __call__(self, children): - def _is_iambig_tree(child): - return hasattr(child, 'data') and child.data == '_iambig' - - def _collapse_iambig(children): - """ - Recursively flatten the derivations of the parent of an '_iambig' - node. Returns a list of '_inter' nodes guaranteed not - to contain any nested '_iambig' nodes, or None if children does - not contain an '_iambig' node. - """ - - # Due to the structure of the SPPF, - # an '_iambig' node can only appear as the first child - if children and _is_iambig_tree(children[0]): - iambig_node = children[0] - result = [] - for grandchild in iambig_node.children: - collapsed = _collapse_iambig(grandchild.children) - if collapsed: - for child in collapsed: - child.children += children[1:] - result += collapsed - else: - new_tree = self.tree_class('_inter', grandchild.children + children[1:]) - result.append(new_tree) - return result - - collapsed = _collapse_iambig(children) - if collapsed: - processed_nodes = [self.node_builder(c.children) for c in collapsed] - return self.tree_class('_ambig', processed_nodes) - - return self.node_builder(children) - - - -def inplace_transformer(func): - @wraps(func) - def f(children): - # function name in a Transformer is a rule name. - tree = Tree(func.__name__, children) - return func(tree) - return f - - -def apply_visit_wrapper(func, name, wrapper): - if wrapper is _vargs_meta or wrapper is _vargs_meta_inline: - raise NotImplementedError("Meta args not supported for internal transformer") - - @wraps(func) - def f(children): - return wrapper(func, name, children, None) - return f - - -class ParseTreeBuilder: - def __init__(self, rules, tree_class, propagate_positions=False, ambiguous=False, maybe_placeholders=False): - self.tree_class = tree_class - self.propagate_positions = propagate_positions - self.ambiguous = ambiguous - self.maybe_placeholders = maybe_placeholders - - self.rule_builders = list(self._init_builders(rules)) - - def _init_builders(self, rules): - propagate_positions = make_propagate_positions(self.propagate_positions) - - for rule in rules: - options = rule.options - keep_all_tokens = options.keep_all_tokens - expand_single_child = options.expand1 - - wrapper_chain = list(filter(None, [ - (expand_single_child and not rule.alias) and ExpandSingleChild, - maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders else None), - propagate_positions, - self.ambiguous and maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens), - self.ambiguous and partial(AmbiguousIntermediateExpander, self.tree_class) - ])) - - yield rule, wrapper_chain - - def create_callback(self, transformer=None): - callbacks = {} - - default_handler = getattr(transformer, '__default__', None) - if default_handler: - def default_callback(data, children): - return default_handler(data, children, None) - else: - default_callback = self.tree_class - - for rule, wrapper_chain in self.rule_builders: - - user_callback_name = rule.alias or rule.options.template_source or rule.origin.name - try: - f = getattr(transformer, user_callback_name) - wrapper = getattr(f, 'visit_wrapper', None) - if wrapper is not None: - f = apply_visit_wrapper(f, user_callback_name, wrapper) - elif isinstance(transformer, Transformer_InPlace): - f = inplace_transformer(f) - except AttributeError: - f = partial(default_callback, 
user_callback_name) - - for w in wrapper_chain: - f = w(f) - - if rule in callbacks: - raise GrammarError("Rule '%s' already exists" % (rule,)) - - callbacks[rule] = f - - return callbacks - -###} diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py deleted file mode 100644 index 186058a..0000000 --- a/lark/parser_frontends.py +++ /dev/null @@ -1,257 +0,0 @@ -from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING - -from .exceptions import ConfigurationError, GrammarError, assert_config -from .utils import get_regexp_width, Serialize -from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer -from .parsers import earley, xearley, cyk -from .parsers.lalr_parser import LALR_Parser -from .tree import Tree -from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType - -if TYPE_CHECKING: - from .parsers.lalr_analysis import ParseTableBase - - -###{standalone - -def _wrap_lexer(lexer_class): - future_interface = getattr(lexer_class, '__future_interface__', False) - if future_interface: - return lexer_class - else: - class CustomLexerWrapper(Lexer): - def __init__(self, lexer_conf): - self.lexer = lexer_class(lexer_conf) - def lex(self, lexer_state, parser_state): - return self.lexer.lex(lexer_state.text) - return CustomLexerWrapper - - -def _deserialize_parsing_frontend(data, memo, lexer_conf, callbacks, options): - parser_conf = ParserConf.deserialize(data['parser_conf'], memo) - cls = (options and options._plugins.get('LALR_Parser')) or LALR_Parser - parser = cls.deserialize(data['parser'], memo, callbacks, options.debug) - parser_conf.callbacks = callbacks - return ParsingFrontend(lexer_conf, parser_conf, options, parser=parser) - - -_parser_creators: 'Dict[str, Callable[[LexerConf, Any, Any], Any]]' = {} - - -class ParsingFrontend(Serialize): - __serialize_fields__ = 'lexer_conf', 'parser_conf', 'parser' - - lexer_conf: LexerConf - parser_conf: ParserConf - options: Any - - def __init__(self, lexer_conf: LexerConf, parser_conf: ParserConf, options, parser=None): - self.parser_conf = parser_conf - self.lexer_conf = lexer_conf - self.options = options - - # Set-up parser - if parser: # From cache - self.parser = parser - else: - create_parser = _parser_creators.get(parser_conf.parser_type) - assert create_parser is not None, "{} is not supported in standalone mode".format( - parser_conf.parser_type - ) - self.parser = create_parser(lexer_conf, parser_conf, options) - - # Set-up lexer - lexer_type = lexer_conf.lexer_type - self.skip_lexer = False - if lexer_type in ('dynamic', 'dynamic_complete'): - assert lexer_conf.postlex is None - self.skip_lexer = True - return - - if isinstance(lexer_type, type): - assert issubclass(lexer_type, Lexer) - self.lexer = _wrap_lexer(lexer_type)(lexer_conf) - elif isinstance(lexer_type, str): - create_lexer = { - 'basic': create_basic_lexer, - 'contextual': create_contextual_lexer, - }[lexer_type] - self.lexer = create_lexer(lexer_conf, self.parser, lexer_conf.postlex, options) - else: - raise TypeError("Bad value for lexer_type: {lexer_type}") - - if lexer_conf.postlex: - self.lexer = PostLexConnector(self.lexer, lexer_conf.postlex) - - def _verify_start(self, start=None): - if start is None: - start_decls = self.parser_conf.start - if len(start_decls) > 1: - raise ConfigurationError("Lark initialized with more than 1 possible start rule. 
Must specify which start rule to parse", start_decls) - start ,= start_decls - elif start not in self.parser_conf.start: - raise ConfigurationError("Unknown start rule %s. Must be one of %r" % (start, self.parser_conf.start)) - return start - - def _make_lexer_thread(self, text: str) -> Union[str, LexerThread]: - cls = (self.options and self.options._plugins.get('LexerThread')) or LexerThread - return text if self.skip_lexer else cls.from_text(self.lexer, text) - - def parse(self, text: str, start=None, on_error=None): - chosen_start = self._verify_start(start) - kw = {} if on_error is None else {'on_error': on_error} - stream = self._make_lexer_thread(text) - return self.parser.parse(stream, chosen_start, **kw) - - def parse_interactive(self, text: Optional[str]=None, start=None): - # TODO BREAK - Change text from Optional[str] to text: str = ''. - # Would break behavior of exhaust_lexer(), which currently raises TypeError, and after the change would just return [] - chosen_start = self._verify_start(start) - if self.parser_conf.parser_type != 'lalr': - raise ConfigurationError("parse_interactive() currently only works with parser='lalr' ") - stream = self._make_lexer_thread(text) # type: ignore[arg-type] - return self.parser.parse_interactive(stream, chosen_start) - - -def _validate_frontend_args(parser, lexer) -> None: - assert_config(parser, ('lalr', 'earley', 'cyk')) - if not isinstance(lexer, type): # not custom lexer? - expected = { - 'lalr': ('basic', 'contextual'), - 'earley': ('basic', 'dynamic', 'dynamic_complete'), - 'cyk': ('basic', ), - }[parser] - assert_config(lexer, expected, 'Parser %r does not support lexer %%r, expected one of %%s' % parser) - - -def _get_lexer_callbacks(transformer, terminals): - result = {} - for terminal in terminals: - callback = getattr(transformer, terminal.name, None) - if callback is not None: - result[terminal.name] = callback - return result - -class PostLexConnector: - def __init__(self, lexer, postlexer): - self.lexer = lexer - self.postlexer = postlexer - - def lex(self, lexer_state, parser_state): - i = self.lexer.lex(lexer_state, parser_state) - return self.postlexer.process(i) - - - -def create_basic_lexer(lexer_conf, parser, postlex, options) -> BasicLexer: - cls = (options and options._plugins.get('BasicLexer')) or BasicLexer - return cls(lexer_conf) - -def create_contextual_lexer(lexer_conf: LexerConf, parser, postlex, options) -> ContextualLexer: - cls = (options and options._plugins.get('ContextualLexer')) or ContextualLexer - parse_table: ParseTableBase[int] = parser._parse_table - states: Dict[int, Collection[str]] = {idx:list(t.keys()) for idx, t in parse_table.states.items()} - always_accept: Collection[str] = postlex.always_accept if postlex else () - return cls(lexer_conf, states, always_accept=always_accept) - -def create_lalr_parser(lexer_conf: LexerConf, parser_conf: ParserConf, options=None) -> LALR_Parser: - debug = options.debug if options else False - strict = options.strict if options else False - cls = (options and options._plugins.get('LALR_Parser')) or LALR_Parser - return cls(parser_conf, debug=debug, strict=strict) - -_parser_creators['lalr'] = create_lalr_parser - -###} - -class EarleyRegexpMatcher: - def __init__(self, lexer_conf): - self.regexps = {} - for t in lexer_conf.terminals: - regexp = t.pattern.to_regexp() - try: - width = get_regexp_width(regexp)[0] - except ValueError: - raise GrammarError("Bad regexp in token %s: %s" % (t.name, regexp)) - else: - if width == 0: - raise GrammarError("Dynamic Earley 
doesn't allow zero-width regexps", t) - if lexer_conf.use_bytes: - regexp = regexp.encode('utf-8') - - self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags) - - def match(self, term, text, index=0): - return self.regexps[term.name].match(text, index) - - -def create_earley_parser__dynamic(lexer_conf: LexerConf, parser_conf: ParserConf, **kw): - if lexer_conf.callbacks: - raise GrammarError("Earley's dynamic lexer doesn't support lexer_callbacks.") - - earley_matcher = EarleyRegexpMatcher(lexer_conf) - return xearley.Parser(lexer_conf, parser_conf, earley_matcher.match, **kw) - -def _match_earley_basic(term, token): - return term.name == token.type - -def create_earley_parser__basic(lexer_conf: LexerConf, parser_conf: ParserConf, **kw): - return earley.Parser(lexer_conf, parser_conf, _match_earley_basic, **kw) - -def create_earley_parser(lexer_conf: LexerConf, parser_conf: ParserConf, options) -> earley.Parser: - resolve_ambiguity = options.ambiguity == 'resolve' - debug = options.debug if options else False - tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None - - extra = {} - if lexer_conf.lexer_type == 'dynamic': - f = create_earley_parser__dynamic - elif lexer_conf.lexer_type == 'dynamic_complete': - extra['complete_lex'] = True - f = create_earley_parser__dynamic - else: - f = create_earley_parser__basic - - return f(lexer_conf, parser_conf, resolve_ambiguity=resolve_ambiguity, - debug=debug, tree_class=tree_class, ordered_sets=options.ordered_sets, **extra) - - - -class CYK_FrontEnd: - def __init__(self, lexer_conf, parser_conf, options=None): - self.parser = cyk.Parser(parser_conf.rules) - - self.callbacks = parser_conf.callbacks - - def parse(self, lexer_thread, start): - tokens = list(lexer_thread.lex(None)) - tree = self.parser.parse(tokens, start) - return self._transform(tree) - - def _transform(self, tree): - subtrees = list(tree.iter_subtrees()) - for subtree in subtrees: - subtree.children = [self._apply_callback(c) if isinstance(c, Tree) else c for c in subtree.children] - - return self._apply_callback(tree) - - def _apply_callback(self, tree): - return self.callbacks[tree.rule](tree.children) - - -_parser_creators['earley'] = create_earley_parser -_parser_creators['cyk'] = CYK_FrontEnd - - -def _construct_parsing_frontend( - parser_type: _ParserArgType, - lexer_type: _LexerArgType, - lexer_conf, - parser_conf, - options -): - assert isinstance(lexer_conf, LexerConf) - assert isinstance(parser_conf, ParserConf) - parser_conf.parser_type = parser_type - lexer_conf.lexer_type = lexer_type - return ParsingFrontend(lexer_conf, parser_conf, options) diff --git a/lark/parsers/__init__.py b/lark/parsers/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/lark/parsers/cyk.py b/lark/parsers/cyk.py deleted file mode 100644 index b5334f9..0000000 --- a/lark/parsers/cyk.py +++ /dev/null @@ -1,340 +0,0 @@ -"""This module implements a CYK parser.""" - -# Author: https://github.com/ehudt (2018) -# -# Adapted by Erez - - -from collections import defaultdict -import itertools - -from ..exceptions import ParseError -from ..lexer import Token -from ..tree import Tree -from ..grammar import Terminal as T, NonTerminal as NT, Symbol - -def match(t, s): - assert isinstance(t, T) - return t.name == s.type - - -class Rule: - """Context-free grammar rule.""" - - def __init__(self, lhs, rhs, weight, alias): - super(Rule, self).__init__() - assert isinstance(lhs, NT), lhs - assert all(isinstance(x, NT) or isinstance(x, 
T) for x in rhs), rhs - self.lhs = lhs - self.rhs = rhs - self.weight = weight - self.alias = alias - - def __str__(self): - return '%s -> %s' % (str(self.lhs), ' '.join(str(x) for x in self.rhs)) - - def __repr__(self): - return str(self) - - def __hash__(self): - return hash((self.lhs, tuple(self.rhs))) - - def __eq__(self, other): - return self.lhs == other.lhs and self.rhs == other.rhs - - def __ne__(self, other): - return not (self == other) - - -class Grammar: - """Context-free grammar.""" - - def __init__(self, rules): - self.rules = frozenset(rules) - - def __eq__(self, other): - return self.rules == other.rules - - def __str__(self): - return '\n' + '\n'.join(sorted(repr(x) for x in self.rules)) + '\n' - - def __repr__(self): - return str(self) - - -# Parse tree data structures -class RuleNode: - """A node in the parse tree, which also contains the full rhs rule.""" - - def __init__(self, rule, children, weight=0): - self.rule = rule - self.children = children - self.weight = weight - - def __repr__(self): - return 'RuleNode(%s, [%s])' % (repr(self.rule.lhs), ', '.join(str(x) for x in self.children)) - - - -class Parser: - """Parser wrapper.""" - - def __init__(self, rules): - super(Parser, self).__init__() - self.orig_rules = {rule: rule for rule in rules} - rules = [self._to_rule(rule) for rule in rules] - self.grammar = to_cnf(Grammar(rules)) - - def _to_rule(self, lark_rule): - """Converts a lark rule, (lhs, rhs, callback, options), to a Rule.""" - assert isinstance(lark_rule.origin, NT) - assert all(isinstance(x, Symbol) for x in lark_rule.expansion) - return Rule( - lark_rule.origin, lark_rule.expansion, - weight=lark_rule.options.priority if lark_rule.options.priority else 0, - alias=lark_rule) - - def parse(self, tokenized, start): # pylint: disable=invalid-name - """Parses input, which is a list of tokens.""" - assert start - start = NT(start) - - table, trees = _parse(tokenized, self.grammar) - # Check if the parse succeeded. - if all(r.lhs != start for r in table[(0, len(tokenized) - 1)]): - raise ParseError('Parsing failed.') - parse = trees[(0, len(tokenized) - 1)][start] - return self._to_tree(revert_cnf(parse)) - - def _to_tree(self, rule_node): - """Converts a RuleNode parse tree to a lark Tree.""" - orig_rule = self.orig_rules[rule_node.rule.alias] - children = [] - for child in rule_node.children: - if isinstance(child, RuleNode): - children.append(self._to_tree(child)) - else: - assert isinstance(child.name, Token) - children.append(child.name) - t = Tree(orig_rule.origin, children) - t.rule=orig_rule - return t - - -def print_parse(node, indent=0): - if isinstance(node, RuleNode): - print(' ' * (indent * 2) + str(node.rule.lhs)) - for child in node.children: - print_parse(child, indent + 1) - else: - print(' ' * (indent * 2) + str(node.s)) - - -def _parse(s, g): - """Parses sentence 's' using CNF grammar 'g'.""" - # The CYK table. Indexed with a 2-tuple: (start pos, end pos) - table = defaultdict(set) - # Top-level structure is similar to the CYK table. Each cell is a dict from - # rule name to the best (lightest) tree for that rule. 
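# A minimal, self-contained sketch (illustration only, not the lark implementation shown
# here): the CYK recurrence that _parse implements below, reduced to a bare-bones recognizer
# over a toy CNF grammar. Cells are keyed by inclusive (start, end) spans and filled
# bottom-up over all split points, mirroring the loops that follow. All grammar and symbol
# names are made up for the example.
from collections import defaultdict
from itertools import product

TERM_RULES = {"a": {"A"}, "b": {"B"}}          # A -> 'a',  B -> 'b'
BINARY_RULES = {("A", "B"): {"S", "B"}}        # S -> A B,  B -> A B

def cyk_recognize(tokens, start="S"):
    table = defaultdict(set)
    for i, tok in enumerate(tokens):                    # base case: one-token spans
        table[(i, i)] |= TERM_RULES.get(tok, set())
    for length in range(2, len(tokens) + 1):            # longer spans, bottom-up
        for i in range(len(tokens) - length + 1):
            j = i + length - 1
            for p in range(i + 1, j + 1):               # split [i, j] into [i, p-1] + [p, j]
                for x, y in product(table[(i, p - 1)], table[(p, j)]):
                    table[(i, j)] |= BINARY_RULES.get((x, y), set())
    return start in table[(0, len(tokens) - 1)]

print(cyk_recognize(list("aab")))   # True:  S -> A B,  B -> A B,  A -> 'a',  B -> 'b'
print(cyk_recognize(list("ba")))    # False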
- trees = defaultdict(dict) - # Populate base case with existing terminal production rules - for i, w in enumerate(s): - for terminal, rules in g.terminal_rules.items(): - if match(terminal, w): - for rule in rules: - table[(i, i)].add(rule) - if (rule.lhs not in trees[(i, i)] or - rule.weight < trees[(i, i)][rule.lhs].weight): - trees[(i, i)][rule.lhs] = RuleNode(rule, [T(w)], weight=rule.weight) - - # Iterate over lengths of sub-sentences - for l in range(2, len(s) + 1): - # Iterate over sub-sentences with the given length - for i in range(len(s) - l + 1): - # Choose partition of the sub-sentence in [1, l) - for p in range(i + 1, i + l): - span1 = (i, p - 1) - span2 = (p, i + l - 1) - for r1, r2 in itertools.product(table[span1], table[span2]): - for rule in g.nonterminal_rules.get((r1.lhs, r2.lhs), []): - table[(i, i + l - 1)].add(rule) - r1_tree = trees[span1][r1.lhs] - r2_tree = trees[span2][r2.lhs] - rule_total_weight = rule.weight + r1_tree.weight + r2_tree.weight - if (rule.lhs not in trees[(i, i + l - 1)] - or rule_total_weight < trees[(i, i + l - 1)][rule.lhs].weight): - trees[(i, i + l - 1)][rule.lhs] = RuleNode(rule, [r1_tree, r2_tree], weight=rule_total_weight) - return table, trees - - -# This section implements context-free grammar converter to Chomsky normal form. -# It also implements a conversion of parse trees from its CNF to the original -# grammar. -# Overview: -# Applies the following operations in this order: -# * TERM: Eliminates non-solitary terminals from all rules -# * BIN: Eliminates rules with more than 2 symbols on their right-hand-side. -# * UNIT: Eliminates non-terminal unit rules -# -# The following grammar characteristics aren't featured: -# * Start symbol appears on RHS -# * Empty rules (epsilon rules) - - -class CnfWrapper: - """CNF wrapper for grammar. - - Validates that the input grammar is CNF and provides helper data structures. - """ - - def __init__(self, grammar): - super(CnfWrapper, self).__init__() - self.grammar = grammar - self.rules = grammar.rules - self.terminal_rules = defaultdict(list) - self.nonterminal_rules = defaultdict(list) - for r in self.rules: - # Validate that the grammar is CNF and populate auxiliary data structures. 
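# A minimal, self-contained sketch (illustration only, not the lark implementation shown
# here): the BIN step described in the overview above, on plain (lhs, rhs) tuples. A rule
# with more than two right-hand-side symbols is replaced by a chain of binary rules through
# fresh helper nonterminals. The helper naming scheme is made up for the example; the real
# _split below uses its own '__SP_...' scheme.
def bin_split(lhs, rhs):
    rules, cur = [], lhs
    for i in range(len(rhs) - 2):
        helper = f"__BIN_{lhs}_{i}"          # hypothetical fresh nonterminal
        rules.append((cur, (rhs[i], helper)))
        cur = helper
    rules.append((cur, tuple(rhs[-2:])))     # rules with two or fewer symbols pass through
    return rules

# s -> a b c d   becomes   s -> a __BIN_s_0,  __BIN_s_0 -> b __BIN_s_1,  __BIN_s_1 -> c d
print(bin_split("s", ("a", "b", "c", "d")))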
- assert isinstance(r.lhs, NT), r - if len(r.rhs) not in [1, 2]: - raise ParseError("CYK doesn't support empty rules") - if len(r.rhs) == 1 and isinstance(r.rhs[0], T): - self.terminal_rules[r.rhs[0]].append(r) - elif len(r.rhs) == 2 and all(isinstance(x, NT) for x in r.rhs): - self.nonterminal_rules[tuple(r.rhs)].append(r) - else: - assert False, r - - def __eq__(self, other): - return self.grammar == other.grammar - - def __repr__(self): - return repr(self.grammar) - - -class UnitSkipRule(Rule): - """A rule that records NTs that were skipped during transformation.""" - - def __init__(self, lhs, rhs, skipped_rules, weight, alias): - super(UnitSkipRule, self).__init__(lhs, rhs, weight, alias) - self.skipped_rules = skipped_rules - - def __eq__(self, other): - return isinstance(other, type(self)) and self.skipped_rules == other.skipped_rules - - __hash__ = Rule.__hash__ - - -def build_unit_skiprule(unit_rule, target_rule): - skipped_rules = [] - if isinstance(unit_rule, UnitSkipRule): - skipped_rules += unit_rule.skipped_rules - skipped_rules.append(target_rule) - if isinstance(target_rule, UnitSkipRule): - skipped_rules += target_rule.skipped_rules - return UnitSkipRule(unit_rule.lhs, target_rule.rhs, skipped_rules, - weight=unit_rule.weight + target_rule.weight, alias=unit_rule.alias) - - -def get_any_nt_unit_rule(g): - """Returns a non-terminal unit rule from 'g', or None if there is none.""" - for rule in g.rules: - if len(rule.rhs) == 1 and isinstance(rule.rhs[0], NT): - return rule - return None - - -def _remove_unit_rule(g, rule): - """Removes 'rule' from 'g' without changing the language produced by 'g'.""" - new_rules = [x for x in g.rules if x != rule] - refs = [x for x in g.rules if x.lhs == rule.rhs[0]] - new_rules += [build_unit_skiprule(rule, ref) for ref in refs] - return Grammar(new_rules) - - -def _split(rule): - """Splits a rule whose len(rhs) > 2 into shorter rules.""" - rule_str = str(rule.lhs) + '__' + '_'.join(str(x) for x in rule.rhs) - rule_name = '__SP_%s' % (rule_str) + '_%d' - yield Rule(rule.lhs, [rule.rhs[0], NT(rule_name % 1)], weight=rule.weight, alias=rule.alias) - for i in range(1, len(rule.rhs) - 2): - yield Rule(NT(rule_name % i), [rule.rhs[i], NT(rule_name % (i + 1))], weight=0, alias='Split') - yield Rule(NT(rule_name % (len(rule.rhs) - 2)), rule.rhs[-2:], weight=0, alias='Split') - - -def _term(g): - """Applies the TERM rule on 'g' (see top comment).""" - all_t = {x for rule in g.rules for x in rule.rhs if isinstance(x, T)} - t_rules = {t: Rule(NT('__T_%s' % str(t)), [t], weight=0, alias='Term') for t in all_t} - new_rules = [] - for rule in g.rules: - if len(rule.rhs) > 1 and any(isinstance(x, T) for x in rule.rhs): - new_rhs = [t_rules[x].lhs if isinstance(x, T) else x for x in rule.rhs] - new_rules.append(Rule(rule.lhs, new_rhs, weight=rule.weight, alias=rule.alias)) - new_rules.extend(v for k, v in t_rules.items() if k in rule.rhs) - else: - new_rules.append(rule) - return Grammar(new_rules) - - -def _bin(g): - """Applies the BIN rule to 'g' (see top comment).""" - new_rules = [] - for rule in g.rules: - if len(rule.rhs) > 2: - new_rules += _split(rule) - else: - new_rules.append(rule) - return Grammar(new_rules) - - -def _unit(g): - """Applies the UNIT rule to 'g' (see top comment).""" - nt_unit_rule = get_any_nt_unit_rule(g) - while nt_unit_rule: - g = _remove_unit_rule(g, nt_unit_rule) - nt_unit_rule = get_any_nt_unit_rule(g) - return g - - -def to_cnf(g): - """Creates a CNF grammar from a general context-free grammar 'g'.""" - g = 
_unit(_bin(_term(g))) - return CnfWrapper(g) - - -def unroll_unit_skiprule(lhs, orig_rhs, skipped_rules, children, weight, alias): - if not skipped_rules: - return RuleNode(Rule(lhs, orig_rhs, weight=weight, alias=alias), children, weight=weight) - else: - weight = weight - skipped_rules[0].weight - return RuleNode( - Rule(lhs, [skipped_rules[0].lhs], weight=weight, alias=alias), [ - unroll_unit_skiprule(skipped_rules[0].lhs, orig_rhs, - skipped_rules[1:], children, - skipped_rules[0].weight, skipped_rules[0].alias) - ], weight=weight) - - -def revert_cnf(node): - """Reverts a parse tree (RuleNode) to its original non-CNF form (Node).""" - if isinstance(node, T): - return node - # Reverts TERM rule. - if node.rule.lhs.name.startswith('__T_'): - return node.children[0] - else: - children = [] - for child in map(revert_cnf, node.children): - # Reverts BIN rule. - if isinstance(child, RuleNode) and child.rule.lhs.name.startswith('__SP_'): - children += child.children - else: - children.append(child) - # Reverts UNIT rule. - if isinstance(node.rule, UnitSkipRule): - return unroll_unit_skiprule(node.rule.lhs, node.rule.rhs, - node.rule.skipped_rules, children, - node.rule.weight, node.rule.alias) - else: - return RuleNode(node.rule, children) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py deleted file mode 100644 index 8ac7ecb..0000000 --- a/lark/parsers/earley.py +++ /dev/null @@ -1,312 +0,0 @@ -"""This module implements an Earley parser. - -The core Earley algorithm used here is based on Elizabeth Scott's implementation, here: - https://www.sciencedirect.com/science/article/pii/S1571066108001497 - -That is probably the best reference for understanding the algorithm here. - -The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format -is explained here: https://lark-parser.readthedocs.io/en/latest/_static/sppf/sppf.html -""" - -from typing import TYPE_CHECKING, Callable, Optional, List, Any -from collections import deque - -from ..lexer import Token -from ..tree import Tree -from ..exceptions import UnexpectedEOF, UnexpectedToken -from ..utils import logger, OrderedSet, dedup_list -from .grammar_analysis import GrammarAnalyzer -from ..grammar import NonTerminal -from .earley_common import Item -from .earley_forest import ForestSumVisitor, SymbolNode, StableSymbolNode, TokenNode, ForestToParseTree - -if TYPE_CHECKING: - from ..common import LexerConf, ParserConf - -class Parser: - lexer_conf: 'LexerConf' - parser_conf: 'ParserConf' - debug: bool - - def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matcher: Callable, - resolve_ambiguity: bool=True, debug: bool=False, - tree_class: Optional[Callable[[str, List], Any]]=Tree, ordered_sets: bool=True): - analysis = GrammarAnalyzer(parser_conf) - self.lexer_conf = lexer_conf - self.parser_conf = parser_conf - self.resolve_ambiguity = resolve_ambiguity - self.debug = debug - self.Tree = tree_class - self.Set = OrderedSet if ordered_sets else set - self.SymbolNode = StableSymbolNode if ordered_sets else SymbolNode - - self.FIRST = analysis.FIRST - self.NULLABLE = analysis.NULLABLE - self.callbacks = parser_conf.callbacks - # TODO add typing info - self.predictions = {} # type: ignore[var-annotated] - - ## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than - # the slow 'isupper' in is_terminal. 
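# A minimal, self-contained sketch (illustration only, not the lark implementation shown
# here): the predict / scan / complete cycle that predict_and_complete and scan below carry
# out on top of an SPPF and the Joop Leo right-recursion optimization, reduced to a plain
# Earley recognizer for a toy grammar. The toy grammar has no empty rules, so the held
# completions handled below are not needed. All names are made up for the example.
TOY_GRAMMAR = {
    "Sum":  [["Sum", "+", "Prod"], ["Prod"]],
    "Prod": [["Prod", "*", "NUM"], ["NUM"]],
}
TOY_TERMINALS = {"+", "*", "NUM"}

def recognize(tokens, start="Sum"):
    # An item is (lhs, rhs, dot, origin); columns[i] holds items after reading i tokens.
    columns = [set() for _ in range(len(tokens) + 1)]
    columns[0] = {(start, tuple(rhs), 0, 0) for rhs in TOY_GRAMMAR[start]}
    for i, column in enumerate(columns):
        changed = True
        while changed:                          # iterate to a fixed point within the column
            changed = False
            for lhs, rhs, dot, origin in list(column):
                if dot == len(rhs):             # complete: advance items waiting on `lhs`
                    for l2, r2, d2, o2 in list(columns[origin]):
                        if d2 < len(r2) and r2[d2] == lhs:
                            changed |= (l2, r2, d2 + 1, o2) not in column
                            column.add((l2, r2, d2 + 1, o2))
                elif rhs[dot] not in TOY_TERMINALS:   # predict: expand the expected nonterminal
                    for alt in TOY_GRAMMAR[rhs[dot]]:
                        changed |= (rhs[dot], tuple(alt), 0, i) not in column
                        column.add((rhs[dot], tuple(alt), 0, i))
        if i < len(tokens):                     # scan: shift items expecting the next token
            for lhs, rhs, dot, origin in column:
                if dot < len(rhs) and rhs[dot] == tokens[i]:
                    columns[i + 1].add((lhs, rhs, dot + 1, origin))
    return any(lhs == start and dot == len(rhs) and origin == 0
               for lhs, rhs, dot, origin in columns[-1])

print(recognize(["NUM", "+", "NUM", "*", "NUM"]))   # True
print(recognize(["NUM", "+", "*"]))                 # False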
- self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term } - self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term } - - self.forest_sum_visitor = None - for rule in parser_conf.rules: - if rule.origin not in self.predictions: - self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)] - - ## Detect if any rules/terminals have priorities set. If the user specified priority = None, then - # the priorities will be stripped from all rules/terminals before they reach us, allowing us to - # skip the extra tree walk. We'll also skip this if the user just didn't specify priorities - # on any rules/terminals. - if self.forest_sum_visitor is None and rule.options.priority is not None: - self.forest_sum_visitor = ForestSumVisitor - - # Check terminals for priorities - # Ignore terminal priorities if the basic lexer is used - if self.lexer_conf.lexer_type != 'basic' and self.forest_sum_visitor is None: - for term in self.lexer_conf.terminals: - if term.priority: - self.forest_sum_visitor = ForestSumVisitor - break - - self.term_matcher = term_matcher - - - def predict_and_complete(self, i, to_scan, columns, transitives, node_cache): - """The core Earley Predictor and Completer. - - At each stage of the input, we handling any completed items (things - that matched on the last cycle) and use those to predict what should - come next in the input stream. The completions and any predicted - non-terminals are recursively processed until we reach a set of, - which can be added to the scan list for the next scanner cycle.""" - # Held Completions (H in E.Scotts paper). - held_completions = {} - - column = columns[i] - # R (items) = Ei (column.items) - items = deque(column) - while items: - item = items.pop() # remove an element, A say, from R - - ### The Earley completer - if item.is_complete: ### (item.s == string) - if item.node is None: - label = (item.s, item.start, i) - item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label)) - item.node.add_family(item.s, item.rule, item.start, None, None) - - # create_leo_transitives(item.rule.origin, item.start) - - ###R Joop Leo right recursion Completer - if item.rule.origin in transitives[item.start]: - transitive = transitives[item.start][item.s] - if transitive.previous in transitives[transitive.column]: - root_transitive = transitives[transitive.column][transitive.previous] - else: - root_transitive = transitive - - new_item = Item(transitive.rule, transitive.ptr, transitive.start) - label = (root_transitive.s, root_transitive.start, i) - new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label)) - new_item.node.add_path(root_transitive, item.node) - if new_item.expect in self.TERMINALS: - # Add (B :: aC.B, h, y) to Q - to_scan.add(new_item) - elif new_item not in column: - # Add (B :: aC.B, h, y) to Ei and R - column.add(new_item) - items.append(new_item) - ###R Regular Earley completer - else: - # Empty has 0 length. If we complete an empty symbol in a particular - # parse step, we need to be able to use that same empty symbol to complete - # any predictions that result, that themselves require empty. Avoids - # infinite recursion on empty symbols. - # held_completions is 'H' in E.Scott's paper. 
- is_empty_item = item.start == i - if is_empty_item: - held_completions[item.rule.origin] = item.node - - originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s] - for originator in originators: - new_item = originator.advance() - label = (new_item.s, originator.start, i) - new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label)) - new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node) - if new_item.expect in self.TERMINALS: - # Add (B :: aC.B, h, y) to Q - to_scan.add(new_item) - elif new_item not in column: - # Add (B :: aC.B, h, y) to Ei and R - column.add(new_item) - items.append(new_item) - - ### The Earley predictor - elif item.expect in self.NON_TERMINALS: ### (item.s == lr0) - new_items = [] - for rule in self.predictions[item.expect]: - new_item = Item(rule, 0, i) - new_items.append(new_item) - - # Process any held completions (H). - if item.expect in held_completions: - new_item = item.advance() - label = (new_item.s, item.start, i) - new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label)) - new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect]) - new_items.append(new_item) - - for new_item in new_items: - if new_item.expect in self.TERMINALS: - to_scan.add(new_item) - elif new_item not in column: - column.add(new_item) - items.append(new_item) - - def _parse(self, lexer, columns, to_scan, start_symbol=None): - - def is_quasi_complete(item): - if item.is_complete: - return True - - quasi = item.advance() - while not quasi.is_complete: - if quasi.expect not in self.NULLABLE: - return False - if quasi.rule.origin == start_symbol and quasi.expect == start_symbol: - return False - quasi = quasi.advance() - return True - - # def create_leo_transitives(origin, start): - # ... # removed at commit 4c1cfb2faf24e8f8bff7112627a00b94d261b420 - - def scan(i, token, to_scan): - """The core Earley Scanner. - - This is a custom implementation of the scanner that uses the - Lark lexer to match tokens. The scan list is built by the - Earley predictor, based on the previously completed tokens. 
- This ensures that at each phase of the parse we have a custom - lexer context, allowing for more complex ambiguities.""" - next_to_scan = self.Set() - next_set = self.Set() - columns.append(next_set) - transitives.append({}) - node_cache = {} - - for item in self.Set(to_scan): - if match(item.expect, token): - new_item = item.advance() - label = (new_item.s, new_item.start, i + 1) - # 'terminals' may not contain token.type when using %declare - # Additionally, token is not always a Token - # For example, it can be a Tree when using TreeMatcher - term = terminals.get(token.type) if isinstance(token, Token) else None - # Set the priority of the token node to 0 so that the - # terminal priorities do not affect the Tree chosen by - # ForestSumVisitor after the basic lexer has already - # "used up" the terminal priorities - token_node = TokenNode(token, term, priority=0) - new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label)) - new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token_node) - - if new_item.expect in self.TERMINALS: - # add (B ::= Aai+1.B, h, y) to Q' - next_to_scan.add(new_item) - else: - # add (B ::= Aa+1.B, h, y) to Ei+1 - next_set.add(new_item) - - if not next_set and not next_to_scan: - expect = {i.expect.name for i in to_scan} - raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.s for i in to_scan)) - - return next_to_scan, node_cache - - - # Define parser functions - match = self.term_matcher - - terminals = self.lexer_conf.terminals_by_name - - # Cache for nodes & tokens created in a particular parse step. - transitives = [{}] - - ## The main Earley loop. - # Run the Prediction/Completion cycle for any Items in the current Earley set. - # Completions will be added to the SPPF tree, and predictions will be recursively - # processed down to terminals/empty nodes to be added to the scanner for the next - # step. - expects = {i.expect for i in to_scan} - i = 0 - node_cache = {} - for token in lexer.lex(expects): - self.predict_and_complete(i, to_scan, columns, transitives, node_cache) - - to_scan, node_cache = scan(i, token, to_scan) - i += 1 - - expects.clear() - expects |= {i.expect for i in to_scan} - - self.predict_and_complete(i, to_scan, columns, transitives, node_cache) - - ## Column is now the final column in the parse. - assert i == len(columns)-1 - return to_scan - - def parse(self, lexer, start): - assert start, start - start_symbol = NonTerminal(start) - - columns = [self.Set()] - to_scan = self.Set() # The scan buffer. 'Q' in E.Scott's paper. - - ## Predict for the start_symbol. - # Add predicted items to the first Earley set (for the predictor) if they - # result in a non-terminal, or the scanner if they result in a terminal. - for rule in self.predictions[start_symbol]: - item = Item(rule, 0, 0) - if item.expect in self.TERMINALS: - to_scan.add(item) - else: - columns[0].add(item) - - to_scan = self._parse(lexer, columns, to_scan, start_symbol) - - # If the parse was successful, the start - # symbol should have been completed in the last step of the Earley cycle, and will be in - # this column. Find the item for the start_symbol, which is the root of the SPPF tree. 
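# A minimal sketch of what the SPPF machinery above means for users of the installed `lark`
# package (illustration only, assuming `pip install lark`): when a grammar is genuinely
# ambiguous, the Earley parser either resolves the ambiguity (the default) or, with
# ambiguity='explicit', keeps the alternative derivations visible, grouped under an
# '_ambig' node by the forest-to-tree conversion.
from lark import Lark

parser = Lark("""
    start: e
    e: e "+" e
     | NUMBER
    %import common.NUMBER
""", parser="earley", ambiguity="explicit")

tree = parser.parse("1+2+3")      # (1+2)+3 vs. 1+(2+3): two derivations of the same input
print(tree.pretty())              # the alternatives appear under an '_ambig' node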
- solutions = dedup_list(n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0) - if not solutions: - expected_terminals = [t.expect.name for t in to_scan] - raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan)) - if len(solutions) > 1: - raise RuntimeError('Earley should not generate multiple start symbol items! Please report this bug.') - solution ,= solutions - - if self.debug: - from .earley_forest import ForestToPyDotVisitor - try: - debug_walker = ForestToPyDotVisitor() - except ImportError: - logger.warning("Cannot find dependency 'pydot', will not generate sppf debug image") - else: - debug_walker.visit(solution, "sppf.png") - - - if self.Tree is not None: - # Perform our SPPF -> AST conversion - # Disable the ForestToParseTree cache when ambiguity='resolve' - # to prevent a tree construction bug. See issue #1283 - use_cache = not self.resolve_ambiguity - transformer = ForestToParseTree(self.Tree, self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor(), self.resolve_ambiguity, use_cache) - return transformer.transform(solution) - - # return the root of the SPPF - return solution diff --git a/lark/parsers/earley_common.py b/lark/parsers/earley_common.py deleted file mode 100644 index 0ea2d4f..0000000 --- a/lark/parsers/earley_common.py +++ /dev/null @@ -1,42 +0,0 @@ -"""This module implements useful building blocks for the Earley parser -""" - - -class Item: - "An Earley Item, the atom of the algorithm." - - __slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'previous', 'node', '_hash') - def __init__(self, rule, ptr, start): - self.is_complete = len(rule.expansion) == ptr - self.rule = rule # rule - self.ptr = ptr # ptr - self.start = start # j - self.node = None # w - if self.is_complete: - self.s = rule.origin - self.expect = None - self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None - else: - self.s = (rule, ptr) - self.expect = rule.expansion[ptr] - self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None - self._hash = hash((self.s, self.start, self.rule)) - - def advance(self): - return Item(self.rule, self.ptr + 1, self.start) - - def __eq__(self, other): - return self is other or (self.s == other.s and self.start == other.start and self.rule == other.rule) - - def __hash__(self): - return self._hash - - def __repr__(self): - before = ( expansion.name for expansion in self.rule.expansion[:self.ptr] ) - after = ( expansion.name for expansion in self.rule.expansion[self.ptr:] ) - symbol = "{} ::= {}* {}".format(self.rule.origin.name, ' '.join(before), ' '.join(after)) - return '%s (%d)' % (symbol, self.start) - - -# class TransitiveItem(Item): -# ... # removed at commit 4c1cfb2faf24e8f8bff7112627a00b94d261b420 diff --git a/lark/parsers/earley_forest.py b/lark/parsers/earley_forest.py deleted file mode 100644 index c60f3a6..0000000 --- a/lark/parsers/earley_forest.py +++ /dev/null @@ -1,802 +0,0 @@ -""""This module implements an SPPF implementation - -This is used as the primary output mechanism for the Earley parser -in order to store complex ambiguities. 
- -Full reference and more details is here: -https://web.archive.org/web/20190616123959/http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ -""" - -from typing import Type, AbstractSet -from random import randint -from collections import deque -from operator import attrgetter -from importlib import import_module -from functools import partial - -from ..parse_tree_builder import AmbiguousIntermediateExpander -from ..visitors import Discard -from ..utils import logger, OrderedSet -from ..tree import Tree - -class ForestNode: - pass - -class SymbolNode(ForestNode): - """ - A Symbol Node represents a symbol (or Intermediate LR0). - - Symbol nodes are keyed by the symbol (s). For intermediate nodes - s will be an LR0, stored as a tuple of (rule, ptr). For completed symbol - nodes, s will be a string representing the non-terminal origin (i.e. - the left hand side of the rule). - - The children of a Symbol or Intermediate Node will always be Packed Nodes; - with each Packed Node child representing a single derivation of a production. - - Hence a Symbol Node with a single child is unambiguous. - - Parameters: - s: A Symbol, or a tuple of (rule, ptr) for an intermediate node. - start: For dynamic lexers, the index of the start of the substring matched by this symbol (inclusive). - end: For dynamic lexers, the index of the end of the substring matched by this symbol (exclusive). - - Properties: - is_intermediate: True if this node is an intermediate node. - priority: The priority of the node's symbol. - """ - Set: Type[AbstractSet] = set # Overridden by StableSymbolNode - __slots__ = ('s', 'start', 'end', '_children', 'paths', 'paths_loaded', 'priority', 'is_intermediate') - def __init__(self, s, start, end): - self.s = s - self.start = start - self.end = end - self._children = self.Set() - self.paths = self.Set() - self.paths_loaded = False - - ### We use inf here as it can be safely negated without resorting to conditionals, - # unlike None or float('NaN'), and sorts appropriately. 
- self.priority = float('-inf') - self.is_intermediate = isinstance(s, tuple) - - def add_family(self, lr0, rule, start, left, right): - self._children.add(PackedNode(self, lr0, rule, start, left, right)) - - def add_path(self, transitive, node): - self.paths.add((transitive, node)) - - def load_paths(self): - for transitive, node in self.paths: - if transitive.next_titem is not None: - vn = type(self)(transitive.next_titem.s, transitive.next_titem.start, self.end) - vn.add_path(transitive.next_titem, node) - self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, vn) - else: - self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, node) - self.paths_loaded = True - - @property - def is_ambiguous(self): - """Returns True if this node is ambiguous.""" - return len(self.children) > 1 - - @property - def children(self): - """Returns a list of this node's children sorted from greatest to - least priority.""" - if not self.paths_loaded: - self.load_paths() - return sorted(self._children, key=attrgetter('sort_key')) - - def __iter__(self): - return iter(self._children) - - def __repr__(self): - if self.is_intermediate: - rule = self.s[0] - ptr = self.s[1] - before = ( expansion.name for expansion in rule.expansion[:ptr] ) - after = ( expansion.name for expansion in rule.expansion[ptr:] ) - symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after)) - else: - symbol = self.s.name - return "({}, {}, {}, {})".format(symbol, self.start, self.end, self.priority) - -class StableSymbolNode(SymbolNode): - "A version of SymbolNode that uses OrderedSet for output stability" - Set = OrderedSet - -class PackedNode(ForestNode): - """ - A Packed Node represents a single derivation in a symbol node. - - Parameters: - rule: The rule associated with this node. - parent: The parent of this node. - left: The left child of this node. ``None`` if one does not exist. - right: The right child of this node. ``None`` if one does not exist. - priority: The priority of this node. - """ - __slots__ = ('parent', 's', 'rule', 'start', 'left', 'right', 'priority', '_hash') - def __init__(self, parent, s, rule, start, left, right): - self.parent = parent - self.s = s - self.start = start - self.rule = rule - self.left = left - self.right = right - self.priority = float('-inf') - self._hash = hash((self.left, self.right)) - - @property - def is_empty(self): - return self.left is None and self.right is None - - @property - def sort_key(self): - """ - Used to sort PackedNode children of SymbolNodes. - A SymbolNode has multiple PackedNodes if it matched - ambiguously. Hence, we use the sort order to identify - the order in which ambiguous children should be considered. 
- """ - return self.is_empty, -self.priority, self.rule.order - - @property - def children(self): - """Returns a list of this node's children.""" - return [x for x in [self.left, self.right] if x is not None] - - def __iter__(self): - yield self.left - yield self.right - - def __eq__(self, other): - if not isinstance(other, PackedNode): - return False - return self is other or (self.left == other.left and self.right == other.right) - - def __hash__(self): - return self._hash - - def __repr__(self): - if isinstance(self.s, tuple): - rule = self.s[0] - ptr = self.s[1] - before = ( expansion.name for expansion in rule.expansion[:ptr] ) - after = ( expansion.name for expansion in rule.expansion[ptr:] ) - symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after)) - else: - symbol = self.s.name - return "({}, {}, {}, {})".format(symbol, self.start, self.priority, self.rule.order) - -class TokenNode(ForestNode): - """ - A Token Node represents a matched terminal and is always a leaf node. - - Parameters: - token: The Token associated with this node. - term: The TerminalDef matched by the token. - priority: The priority of this node. - """ - __slots__ = ('token', 'term', 'priority', '_hash') - def __init__(self, token, term, priority=None): - self.token = token - self.term = term - if priority is not None: - self.priority = priority - else: - self.priority = term.priority if term is not None else 0 - self._hash = hash(token) - - def __eq__(self, other): - if not isinstance(other, TokenNode): - return False - return self is other or (self.token == other.token) - - def __hash__(self): - return self._hash - - def __repr__(self): - return repr(self.token) - -class ForestVisitor: - """ - An abstract base class for building forest visitors. - - This class performs a controllable depth-first walk of an SPPF. - The visitor will not enter cycles and will backtrack if one is encountered. - Subclasses are notified of cycles through the ``on_cycle`` method. - - Behavior for visit events is defined by overriding the - ``visit*node*`` functions. - - The walk is controlled by the return values of the ``visit*node_in`` - methods. Returning a node(s) will schedule them to be visited. The visitor - will begin to backtrack if no nodes are returned. - - Parameters: - single_visit: If ``True``, non-Token nodes will only be visited once. - """ - - def __init__(self, single_visit=False): - self.single_visit = single_visit - - def visit_token_node(self, node): - """Called when a ``Token`` is visited. ``Token`` nodes are always leaves.""" - pass - - def visit_symbol_node_in(self, node): - """Called when a symbol node is visited. Nodes that are returned - will be scheduled to be visited. If ``visit_intermediate_node_in`` - is not implemented, this function will be called for intermediate - nodes as well.""" - pass - - def visit_symbol_node_out(self, node): - """Called after all nodes returned from a corresponding ``visit_symbol_node_in`` - call have been visited. If ``visit_intermediate_node_out`` - is not implemented, this function will be called for intermediate - nodes as well.""" - pass - - def visit_packed_node_in(self, node): - """Called when a packed node is visited. Nodes that are returned - will be scheduled to be visited. """ - pass - - def visit_packed_node_out(self, node): - """Called after all nodes returned from a corresponding ``visit_packed_node_in`` - call have been visited.""" - pass - - def on_cycle(self, node, path): - """Called when a cycle is encountered. 
- - Parameters: - node: The node that causes a cycle. - path: The list of nodes being visited: nodes that have been - entered but not exited. The first element is the root in a forest - visit, and the last element is the node visited most recently. - ``path`` should be treated as read-only. - """ - pass - - def get_cycle_in_path(self, node, path): - """A utility function for use in ``on_cycle`` to obtain a slice of - ``path`` that only contains the nodes that make up the cycle.""" - index = len(path) - 1 - while id(path[index]) != id(node): - index -= 1 - return path[index:] - - def visit(self, root): - # Visiting is a list of IDs of all symbol/intermediate nodes currently in - # the stack. It serves two purposes: to detect when we 'recurse' in and out - # of a symbol/intermediate so that we can process both up and down. Also, - # since the SPPF can have cycles it allows us to detect if we're trying - # to recurse into a node that's already on the stack (infinite recursion). - visiting = set() - - # set of all nodes that have been visited - visited = set() - - # a list of nodes that are currently being visited - # used for the `on_cycle` callback - path = [] - - # We do not use recursion here to walk the Forest due to the limited - # stack size in python. Therefore input_stack is essentially our stack. - input_stack = deque([root]) - - # It is much faster to cache these as locals since they are called - # many times in large parses. - vpno = getattr(self, 'visit_packed_node_out') - vpni = getattr(self, 'visit_packed_node_in') - vsno = getattr(self, 'visit_symbol_node_out') - vsni = getattr(self, 'visit_symbol_node_in') - vino = getattr(self, 'visit_intermediate_node_out', vsno) - vini = getattr(self, 'visit_intermediate_node_in', vsni) - vtn = getattr(self, 'visit_token_node') - oc = getattr(self, 'on_cycle') - - while input_stack: - current = next(reversed(input_stack)) - try: - next_node = next(current) - except StopIteration: - input_stack.pop() - continue - except TypeError: - ### If the current object is not an iterator, pass through to Token/SymbolNode - pass - else: - if next_node is None: - continue - - if id(next_node) in visiting: - oc(next_node, path) - continue - - input_stack.append(next_node) - continue - - if isinstance(current, TokenNode): - vtn(current.token) - input_stack.pop() - continue - - current_id = id(current) - if current_id in visiting: - if isinstance(current, PackedNode): - vpno(current) - elif current.is_intermediate: - vino(current) - else: - vsno(current) - input_stack.pop() - path.pop() - visiting.remove(current_id) - visited.add(current_id) - elif self.single_visit and current_id in visited: - input_stack.pop() - else: - visiting.add(current_id) - path.append(current) - if isinstance(current, PackedNode): - next_node = vpni(current) - elif current.is_intermediate: - next_node = vini(current) - else: - next_node = vsni(current) - if next_node is None: - continue - - if not isinstance(next_node, ForestNode): - next_node = iter(next_node) - elif id(next_node) in visiting: - oc(next_node, path) - continue - - input_stack.append(next_node) - -class ForestTransformer(ForestVisitor): - """The base class for a bottom-up forest transformation. Most users will - want to use ``TreeForestTransformer`` instead as it has a friendlier - interface and covers most use cases. - - Transformations are applied via inheritance and overriding of the - ``transform*node`` methods. - - ``transform_token_node`` receives a ``Token`` as an argument. 
- All other methods receive the node that is being transformed and - a list of the results of the transformations of that node's children. - The return value of these methods are the resulting transformations. - - If ``Discard`` is raised in a node's transformation, no data from that node - will be passed to its parent's transformation. - """ - - def __init__(self): - super(ForestTransformer, self).__init__() - # results of transformations - self.data = dict() - # used to track parent nodes - self.node_stack = deque() - - def transform(self, root): - """Perform a transformation on an SPPF.""" - self.node_stack.append('result') - self.data['result'] = [] - self.visit(root) - assert len(self.data['result']) <= 1 - if self.data['result']: - return self.data['result'][0] - - def transform_symbol_node(self, node, data): - """Transform a symbol node.""" - return node - - def transform_intermediate_node(self, node, data): - """Transform an intermediate node.""" - return node - - def transform_packed_node(self, node, data): - """Transform a packed node.""" - return node - - def transform_token_node(self, node): - """Transform a ``Token``.""" - return node - - def visit_symbol_node_in(self, node): - self.node_stack.append(id(node)) - self.data[id(node)] = [] - return node.children - - def visit_packed_node_in(self, node): - self.node_stack.append(id(node)) - self.data[id(node)] = [] - return node.children - - def visit_token_node(self, node): - transformed = self.transform_token_node(node) - if transformed is not Discard: - self.data[self.node_stack[-1]].append(transformed) - - def _visit_node_out_helper(self, node, method): - self.node_stack.pop() - transformed = method(node, self.data[id(node)]) - if transformed is not Discard: - self.data[self.node_stack[-1]].append(transformed) - del self.data[id(node)] - - def visit_symbol_node_out(self, node): - self._visit_node_out_helper(node, self.transform_symbol_node) - - def visit_intermediate_node_out(self, node): - self._visit_node_out_helper(node, self.transform_intermediate_node) - - def visit_packed_node_out(self, node): - self._visit_node_out_helper(node, self.transform_packed_node) - - -class ForestSumVisitor(ForestVisitor): - """ - A visitor for prioritizing ambiguous parts of the Forest. - - This visitor is used when support for explicit priorities on - rules is requested (whether normal, or invert). It walks the - forest (or subsets thereof) and cascades properties upwards - from the leaves. - - It would be ideal to do this during parsing, however this would - require processing each Earley item multiple times. That's - a big performance drawback; so running a forest walk is the - lesser of two evils: there can be significantly more Earley - items created during parsing than there are SPPF nodes in the - final tree. 
- """ - def __init__(self): - super(ForestSumVisitor, self).__init__(single_visit=True) - - def visit_packed_node_in(self, node): - yield node.left - yield node.right - - def visit_symbol_node_in(self, node): - return iter(node.children) - - def visit_packed_node_out(self, node): - priority = node.rule.options.priority if not node.parent.is_intermediate and node.rule.options.priority else 0 - priority += getattr(node.right, 'priority', 0) - priority += getattr(node.left, 'priority', 0) - node.priority = priority - - def visit_symbol_node_out(self, node): - node.priority = max(child.priority for child in node.children) - -class PackedData(): - """Used in transformationss of packed nodes to distinguish the data - that comes from the left child and the right child. - """ - - class _NoData(): - pass - - NO_DATA = _NoData() - - def __init__(self, node, data): - self.left = self.NO_DATA - self.right = self.NO_DATA - if data: - if node.left is not None: - self.left = data[0] - if len(data) > 1: - self.right = data[1] - else: - self.right = data[0] - -class ForestToParseTree(ForestTransformer): - """Used by the earley parser when ambiguity equals 'resolve' or - 'explicit'. Transforms an SPPF into an (ambiguous) parse tree. - - Parameters: - tree_class: The tree class to use for construction - callbacks: A dictionary of rules to functions that output a tree - prioritizer: A ``ForestVisitor`` that manipulates the priorities of ForestNodes - resolve_ambiguity: If True, ambiguities will be resolved based on - priorities. Otherwise, `_ambig` nodes will be in the resulting tree. - use_cache: If True, the results of packed node transformations will be cached. - """ - - def __init__(self, tree_class=Tree, callbacks=dict(), prioritizer=ForestSumVisitor(), resolve_ambiguity=True, use_cache=True): - super(ForestToParseTree, self).__init__() - self.tree_class = tree_class - self.callbacks = callbacks - self.prioritizer = prioritizer - self.resolve_ambiguity = resolve_ambiguity - self._use_cache = use_cache - self._cache = {} - self._on_cycle_retreat = False - self._cycle_node = None - self._successful_visits = set() - - def visit(self, root): - if self.prioritizer: - self.prioritizer.visit(root) - super(ForestToParseTree, self).visit(root) - self._cache = {} - - def on_cycle(self, node, path): - logger.debug("Cycle encountered in the SPPF at node: %s. " - "As infinite ambiguities cannot be represented in a tree, " - "this family of derivations will be discarded.", node) - self._cycle_node = node - self._on_cycle_retreat = True - - def _check_cycle(self, node): - if self._on_cycle_retreat: - if id(node) == id(self._cycle_node) or id(node) in self._successful_visits: - self._cycle_node = None - self._on_cycle_retreat = False - else: - return Discard - - def _collapse_ambig(self, children): - new_children = [] - for child in children: - if hasattr(child, 'data') and child.data == '_ambig': - new_children += child.children - else: - new_children.append(child) - return new_children - - def _call_rule_func(self, node, data): - # called when transforming children of symbol nodes - # data is a list of trees or tokens that correspond to the - # symbol's rule expansion - return self.callbacks[node.rule](data) - - def _call_ambig_func(self, node, data): - # called when transforming a symbol node - # data is a list of trees where each tree's data is - # equal to the name of the symbol or one of its aliases. 
- if len(data) > 1: - return self.tree_class('_ambig', data) - elif data: - return data[0] - return Discard - - def transform_symbol_node(self, node, data): - if id(node) not in self._successful_visits: - return Discard - r = self._check_cycle(node) - if r is Discard: - return r - self._successful_visits.remove(id(node)) - data = self._collapse_ambig(data) - return self._call_ambig_func(node, data) - - def transform_intermediate_node(self, node, data): - if id(node) not in self._successful_visits: - return Discard - r = self._check_cycle(node) - if r is Discard: - return r - self._successful_visits.remove(id(node)) - if len(data) > 1: - children = [self.tree_class('_inter', c) for c in data] - return self.tree_class('_iambig', children) - return data[0] - - def transform_packed_node(self, node, data): - r = self._check_cycle(node) - if r is Discard: - return r - if self.resolve_ambiguity and id(node.parent) in self._successful_visits: - return Discard - if self._use_cache and id(node) in self._cache: - return self._cache[id(node)] - children = [] - assert len(data) <= 2 - data = PackedData(node, data) - if data.left is not PackedData.NO_DATA: - if node.left.is_intermediate and isinstance(data.left, list): - children += data.left - else: - children.append(data.left) - if data.right is not PackedData.NO_DATA: - children.append(data.right) - transformed = children if node.parent.is_intermediate else self._call_rule_func(node, children) - if self._use_cache: - self._cache[id(node)] = transformed - return transformed - - def visit_symbol_node_in(self, node): - super(ForestToParseTree, self).visit_symbol_node_in(node) - if self._on_cycle_retreat: - return - return node.children - - def visit_packed_node_in(self, node): - self._on_cycle_retreat = False - to_visit = super(ForestToParseTree, self).visit_packed_node_in(node) - if not self.resolve_ambiguity or id(node.parent) not in self._successful_visits: - if not self._use_cache or id(node) not in self._cache: - return to_visit - - def visit_packed_node_out(self, node): - super(ForestToParseTree, self).visit_packed_node_out(node) - if not self._on_cycle_retreat: - self._successful_visits.add(id(node.parent)) - -def handles_ambiguity(func): - """Decorator for methods of subclasses of ``TreeForestTransformer``. - Denotes that the method should receive a list of transformed derivations.""" - func.handles_ambiguity = True - return func - -class TreeForestTransformer(ForestToParseTree): - """A ``ForestTransformer`` with a tree ``Transformer``-like interface. - By default, it will construct a tree. - - Methods provided via inheritance are called based on the rule/symbol - names of nodes in the forest. - - Methods that act on rules will receive a list of the results of the - transformations of the rule's children. By default, trees and tokens. - - Methods that act on tokens will receive a token. - - Alternatively, methods that act on rules may be annotated with - ``handles_ambiguity``. In this case, the function will receive a list - of all the transformations of all the derivations of the rule. - By default, a list of trees where each tree.data is equal to the - rule name or one of its aliases. - - Non-tree transformations are made possible by override of - ``__default__``, ``__default_token__``, and ``__default_ambig__``. - - Note: - Tree shaping features such as inlined rules and token filtering are - not built into the transformation. Positions are also not propagated. 
- - Parameters: - tree_class: The tree class to use for construction - prioritizer: A ``ForestVisitor`` that manipulates the priorities of nodes in the SPPF. - resolve_ambiguity: If True, ambiguities will be resolved based on priorities. - use_cache (bool): If True, caches the results of some transformations, - potentially improving performance when ``resolve_ambiguity==False``. - Only use if you know what you are doing: i.e. All transformation - functions are pure and referentially transparent. - """ - - def __init__(self, tree_class=Tree, prioritizer=ForestSumVisitor(), resolve_ambiguity=True, use_cache=False): - super(TreeForestTransformer, self).__init__(tree_class, dict(), prioritizer, resolve_ambiguity, use_cache) - - def __default__(self, name, data): - """Default operation on tree (for override). - - Returns a tree with name with data as children. - """ - return self.tree_class(name, data) - - def __default_ambig__(self, name, data): - """Default operation on ambiguous rule (for override). - - Wraps data in an '_ambig_' node if it contains more than - one element. - """ - if len(data) > 1: - return self.tree_class('_ambig', data) - elif data: - return data[0] - return Discard - - def __default_token__(self, node): - """Default operation on ``Token`` (for override). - - Returns ``node``. - """ - return node - - def transform_token_node(self, node): - return getattr(self, node.type, self.__default_token__)(node) - - def _call_rule_func(self, node, data): - name = node.rule.alias or node.rule.options.template_source or node.rule.origin.name - user_func = getattr(self, name, self.__default__) - if user_func == self.__default__ or hasattr(user_func, 'handles_ambiguity'): - user_func = partial(self.__default__, name) - if not self.resolve_ambiguity: - wrapper = partial(AmbiguousIntermediateExpander, self.tree_class) - user_func = wrapper(user_func) - return user_func(data) - - def _call_ambig_func(self, node, data): - name = node.s.name - user_func = getattr(self, name, self.__default_ambig__) - if user_func == self.__default_ambig__ or not hasattr(user_func, 'handles_ambiguity'): - user_func = partial(self.__default_ambig__, name) - return user_func(data) - -class ForestToPyDotVisitor(ForestVisitor): - """ - A Forest visitor which writes the SPPF to a PNG. - - The SPPF can get really large, really quickly because - of the amount of meta-data it stores, so this is probably - only useful for trivial trees and learning how the SPPF - is structured. 
- """ - def __init__(self, rankdir="TB"): - super(ForestToPyDotVisitor, self).__init__(single_visit=True) - self.pydot = import_module('pydot') - self.graph = self.pydot.Dot(graph_type='digraph', rankdir=rankdir) - - def visit(self, root, filename): - super(ForestToPyDotVisitor, self).visit(root) - try: - self.graph.write_png(filename) - except FileNotFoundError as e: - logger.error("Could not write png: ", e) - - def visit_token_node(self, node): - graph_node_id = str(id(node)) - graph_node_label = "\"{}\"".format(node.value.replace('"', '\\"')) - graph_node_color = 0x808080 - graph_node_style = "\"filled,rounded\"" - graph_node_shape = "diamond" - graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label) - self.graph.add_node(graph_node) - - def visit_packed_node_in(self, node): - graph_node_id = str(id(node)) - graph_node_label = repr(node) - graph_node_color = 0x808080 - graph_node_style = "filled" - graph_node_shape = "diamond" - graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label) - self.graph.add_node(graph_node) - yield node.left - yield node.right - - def visit_packed_node_out(self, node): - graph_node_id = str(id(node)) - graph_node = self.graph.get_node(graph_node_id)[0] - for child in [node.left, node.right]: - if child is not None: - child_graph_node_id = str(id(child.token if isinstance(child, TokenNode) else child)) - child_graph_node = self.graph.get_node(child_graph_node_id)[0] - self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node)) - else: - #### Try and be above the Python object ID range; probably impl. specific, but maybe this is okay. - child_graph_node_id = str(randint(100000000000000000000000000000,123456789012345678901234567890)) - child_graph_node_style = "invis" - child_graph_node = self.pydot.Node(child_graph_node_id, style=child_graph_node_style, label="None") - child_edge_style = "invis" - self.graph.add_node(child_graph_node) - self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node, style=child_edge_style)) - - def visit_symbol_node_in(self, node): - graph_node_id = str(id(node)) - graph_node_label = repr(node) - graph_node_color = 0x808080 - graph_node_style = "\"filled\"" - if node.is_intermediate: - graph_node_shape = "ellipse" - else: - graph_node_shape = "rectangle" - graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label) - self.graph.add_node(graph_node) - return iter(node.children) - - def visit_symbol_node_out(self, node): - graph_node_id = str(id(node)) - graph_node = self.graph.get_node(graph_node_id)[0] - for child in node.children: - child_graph_node_id = str(id(child)) - child_graph_node = self.graph.get_node(child_graph_node_id)[0] - self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node)) diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py deleted file mode 100644 index 28d3cb6..0000000 --- a/lark/parsers/grammar_analysis.py +++ /dev/null @@ -1,203 +0,0 @@ -"Provides for superficial grammar analysis." 
- -from collections import Counter, defaultdict -from typing import List, Dict, Iterator, FrozenSet, Set - -from ..utils import bfs, fzset, classify, OrderedSet -from ..exceptions import GrammarError -from ..grammar import Rule, Terminal, NonTerminal, Symbol -from ..common import ParserConf - - -class RulePtr: - __slots__ = ('rule', 'index') - rule: Rule - index: int - - def __init__(self, rule: Rule, index: int): - assert isinstance(rule, Rule) - assert index <= len(rule.expansion) - self.rule = rule - self.index = index - - def __repr__(self): - before = [x.name for x in self.rule.expansion[:self.index]] - after = [x.name for x in self.rule.expansion[self.index:]] - return '<%s : %s * %s>' % (self.rule.origin.name, ' '.join(before), ' '.join(after)) - - @property - def next(self) -> Symbol: - return self.rule.expansion[self.index] - - def advance(self, sym: Symbol) -> 'RulePtr': - assert self.next == sym - return RulePtr(self.rule, self.index+1) - - @property - def is_satisfied(self) -> bool: - return self.index == len(self.rule.expansion) - - def __eq__(self, other) -> bool: - if not isinstance(other, RulePtr): - return NotImplemented - return self.rule == other.rule and self.index == other.index - - def __hash__(self) -> int: - return hash((self.rule, self.index)) - - -State = FrozenSet[RulePtr] - -# state generation ensures no duplicate LR0ItemSets -class LR0ItemSet: - __slots__ = ('kernel', 'closure', 'transitions', 'lookaheads') - - kernel: State - closure: State - transitions: Dict[Symbol, 'LR0ItemSet'] - lookaheads: Dict[Symbol, Set[Rule]] - - def __init__(self, kernel, closure): - self.kernel = fzset(kernel) - self.closure = fzset(closure) - self.transitions = {} - self.lookaheads = defaultdict(set) - - def __repr__(self): - return '{%s | %s}' % (', '.join([repr(r) for r in self.kernel]), ', '.join([repr(r) for r in self.closure])) - - -def update_set(set1, set2): - if not set2 or set1 > set2: - return False - - copy = set(set1) - set1 |= set2 - return set1 != copy - -def calculate_sets(rules): - """Calculate FOLLOW sets. - - Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets""" - symbols = {sym for rule in rules for sym in rule.expansion} | {rule.origin for rule in rules} - - # foreach grammar rule X ::= Y(1) ... 
Y(k) - # if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then - # NULLABLE = NULLABLE union {X} - # for i = 1 to k - # if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then - # FIRST(X) = FIRST(X) union FIRST(Y(i)) - # for j = i+1 to k - # if i=k or {Y(i+1),...Y(k)} subset of NULLABLE then - # FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X) - # if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then - # FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j)) - # until none of NULLABLE,FIRST,FOLLOW changed in last iteration - - NULLABLE = set() - FIRST = {} - FOLLOW = {} - for sym in symbols: - FIRST[sym]={sym} if sym.is_term else set() - FOLLOW[sym]=set() - - # Calculate NULLABLE and FIRST - changed = True - while changed: - changed = False - - for rule in rules: - if set(rule.expansion) <= NULLABLE: - if update_set(NULLABLE, {rule.origin}): - changed = True - - for i, sym in enumerate(rule.expansion): - if set(rule.expansion[:i]) <= NULLABLE: - if update_set(FIRST[rule.origin], FIRST[sym]): - changed = True - else: - break - - # Calculate FOLLOW - changed = True - while changed: - changed = False - - for rule in rules: - for i, sym in enumerate(rule.expansion): - if i==len(rule.expansion)-1 or set(rule.expansion[i+1:]) <= NULLABLE: - if update_set(FOLLOW[sym], FOLLOW[rule.origin]): - changed = True - - for j in range(i+1, len(rule.expansion)): - if set(rule.expansion[i+1:j]) <= NULLABLE: - if update_set(FOLLOW[sym], FIRST[rule.expansion[j]]): - changed = True - - return FIRST, FOLLOW, NULLABLE - - -class GrammarAnalyzer: - def __init__(self, parser_conf: ParserConf, debug: bool=False, strict: bool=False): - self.debug = debug - self.strict = strict - - root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')]) - for start in parser_conf.start} - - rules = parser_conf.rules + list(root_rules.values()) - self.rules_by_origin: Dict[NonTerminal, List[Rule]] = classify(rules, lambda r: r.origin) - - if len(rules) != len(set(rules)): - duplicates = [item for item, count in Counter(rules).items() if count > 1] - raise GrammarError("Rules defined twice: %s" % ', '.join(str(i) for i in duplicates)) - - for r in rules: - for sym in r.expansion: - if not (sym.is_term or sym in self.rules_by_origin): - raise GrammarError("Using an undefined rule: %s" % sym) - - self.start_states = {start: self.expand_rule(root_rule.origin) - for start, root_rule in root_rules.items()} - - self.end_states = {start: fzset({RulePtr(root_rule, len(root_rule.expansion))}) - for start, root_rule in root_rules.items()} - - lr0_root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start)]) - for start in parser_conf.start} - - lr0_rules = parser_conf.rules + list(lr0_root_rules.values()) - assert(len(lr0_rules) == len(set(lr0_rules))) - - self.lr0_rules_by_origin = classify(lr0_rules, lambda r: r.origin) - - # cache RulePtr(r, 0) in r (no duplicate RulePtr objects) - self.lr0_start_states = {start: LR0ItemSet([RulePtr(root_rule, 0)], self.expand_rule(root_rule.origin, self.lr0_rules_by_origin)) - for start, root_rule in lr0_root_rules.items()} - - self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules) - - def expand_rule(self, source_rule: NonTerminal, rules_by_origin=None) -> OrderedSet[RulePtr]: - "Returns all init_ptrs accessible by rule (recursive)" - - if rules_by_origin is None: - rules_by_origin = self.rules_by_origin - - init_ptrs = OrderedSet[RulePtr]() - def _expand_rule(rule: NonTerminal) -> Iterator[NonTerminal]: - assert not rule.is_term, rule - - for r in 
rules_by_origin[rule]: - init_ptr = RulePtr(r, 0) - init_ptrs.add(init_ptr) - - if r.expansion: # if not empty rule - new_r = init_ptr.next - if not new_r.is_term: - assert isinstance(new_r, NonTerminal) - yield new_r - - for _ in bfs([source_rule], _expand_rule): - pass - - return init_ptrs diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py deleted file mode 100644 index b7b3fdf..0000000 --- a/lark/parsers/lalr_analysis.py +++ /dev/null @@ -1,332 +0,0 @@ -"""This module builds a LALR(1) transition-table for lalr_parser.py - -For now, shift/reduce conflicts are automatically resolved as shifts. -""" - -# Author: Erez Shinan (2017) -# Email : erezshin@gmail.com - -from typing import Dict, Set, Iterator, Tuple, List, TypeVar, Generic -from collections import defaultdict - -from ..utils import classify, classify_bool, bfs, fzset, Enumerator, logger -from ..exceptions import GrammarError - -from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet, RulePtr, State -from ..grammar import Rule, Symbol -from ..common import ParserConf - -###{standalone - -class Action: - def __init__(self, name): - self.name = name - def __str__(self): - return self.name - def __repr__(self): - return str(self) - -Shift = Action('Shift') -Reduce = Action('Reduce') - -StateT = TypeVar("StateT") - -class ParseTableBase(Generic[StateT]): - states: Dict[StateT, Dict[str, Tuple]] - start_states: Dict[str, StateT] - end_states: Dict[str, StateT] - - def __init__(self, states, start_states, end_states): - self.states = states - self.start_states = start_states - self.end_states = end_states - - def serialize(self, memo): - tokens = Enumerator() - - states = { - state: {tokens.get(token): ((1, arg.serialize(memo)) if action is Reduce else (0, arg)) - for token, (action, arg) in actions.items()} - for state, actions in self.states.items() - } - - return { - 'tokens': tokens.reversed(), - 'states': states, - 'start_states': self.start_states, - 'end_states': self.end_states, - } - - @classmethod - def deserialize(cls, data, memo): - tokens = data['tokens'] - states = { - state: {tokens[token]: ((Reduce, Rule.deserialize(arg, memo)) if action==1 else (Shift, arg)) - for token, (action, arg) in actions.items()} - for state, actions in data['states'].items() - } - return cls(states, data['start_states'], data['end_states']) - -class ParseTable(ParseTableBase['State']): - """Parse-table whose key is State, i.e. set[RulePtr] - - Slower than IntParseTable, but useful for debugging - """ - pass - - -class IntParseTable(ParseTableBase[int]): - """Parse-table whose key is int. 
Best for performance.""" - - @classmethod - def from_ParseTable(cls, parse_table: ParseTable): - enum = list(parse_table.states) - state_to_idx: Dict['State', int] = {s:i for i,s in enumerate(enum)} - int_states = {} - - for s, la in parse_table.states.items(): - la = {k:(v[0], state_to_idx[v[1]]) if v[0] is Shift else v - for k,v in la.items()} - int_states[ state_to_idx[s] ] = la - - - start_states = {start:state_to_idx[s] for start, s in parse_table.start_states.items()} - end_states = {start:state_to_idx[s] for start, s in parse_table.end_states.items()} - return cls(int_states, start_states, end_states) - -###} - - -# digraph and traverse, see The Theory and Practice of Compiler Writing - -# computes F(x) = G(x) union (union { G(y) | x R y }) -# X: nodes -# R: relation (function mapping node -> list of nodes that satisfy the relation) -# G: set valued function -def digraph(X, R, G): - F = {} - S = [] - N = dict.fromkeys(X, 0) - for x in X: - # this is always true for the first iteration, but N[x] may be updated in traverse below - if N[x] == 0: - traverse(x, S, N, X, R, G, F) - return F - -# x: single node -# S: stack -# N: weights -# X: nodes -# R: relation (see above) -# G: set valued function -# F: set valued function we are computing (map of input -> output) -def traverse(x, S, N, X, R, G, F): - S.append(x) - d = len(S) - N[x] = d - F[x] = G[x] - for y in R[x]: - if N[y] == 0: - traverse(y, S, N, X, R, G, F) - n_x = N[x] - assert(n_x > 0) - n_y = N[y] - assert(n_y != 0) - if (n_y > 0) and (n_y < n_x): - N[x] = n_y - F[x].update(F[y]) - if N[x] == d: - f_x = F[x] - while True: - z = S.pop() - N[z] = -1 - F[z] = f_x - if z == x: - break - - -class LALR_Analyzer(GrammarAnalyzer): - lr0_itemsets: Set[LR0ItemSet] - nonterminal_transitions: List[Tuple[LR0ItemSet, Symbol]] - lookback: Dict[Tuple[LR0ItemSet, Symbol], Set[Tuple[LR0ItemSet, Rule]]] - includes: Dict[Tuple[LR0ItemSet, Symbol], Set[Tuple[LR0ItemSet, Symbol]]] - reads: Dict[Tuple[LR0ItemSet, Symbol], Set[Tuple[LR0ItemSet, Symbol]]] - directly_reads: Dict[Tuple[LR0ItemSet, Symbol], Set[Symbol]] - - - def __init__(self, parser_conf: ParserConf, debug: bool=False, strict: bool=False): - GrammarAnalyzer.__init__(self, parser_conf, debug, strict) - self.nonterminal_transitions = [] - self.directly_reads = defaultdict(set) - self.reads = defaultdict(set) - self.includes = defaultdict(set) - self.lookback = defaultdict(set) - - - def compute_lr0_states(self) -> None: - self.lr0_itemsets = set() - # map of kernels to LR0ItemSets - cache: Dict['State', LR0ItemSet] = {} - - def step(state: LR0ItemSet) -> Iterator[LR0ItemSet]: - _, unsat = classify_bool(state.closure, lambda rp: rp.is_satisfied) - - d = classify(unsat, lambda rp: rp.next) - for sym, rps in d.items(): - kernel = fzset({rp.advance(sym) for rp in rps}) - new_state = cache.get(kernel, None) - if new_state is None: - closure = set(kernel) - for rp in kernel: - if not rp.is_satisfied and not rp.next.is_term: - closure |= self.expand_rule(rp.next, self.lr0_rules_by_origin) - new_state = LR0ItemSet(kernel, closure) - cache[kernel] = new_state - - state.transitions[sym] = new_state - yield new_state - - self.lr0_itemsets.add(state) - - for _ in bfs(self.lr0_start_states.values(), step): - pass - - def compute_reads_relations(self): - # handle start state - for root in self.lr0_start_states.values(): - assert(len(root.kernel) == 1) - for rp in root.kernel: - assert(rp.index == 0) - self.directly_reads[(root, rp.next)] = set([ Terminal('$END') ]) - - for state in self.lr0_itemsets: 
- seen = set() - for rp in state.closure: - if rp.is_satisfied: - continue - s = rp.next - # if s is a not a nonterminal - if s not in self.lr0_rules_by_origin: - continue - if s in seen: - continue - seen.add(s) - nt = (state, s) - self.nonterminal_transitions.append(nt) - dr = self.directly_reads[nt] - r = self.reads[nt] - next_state = state.transitions[s] - for rp2 in next_state.closure: - if rp2.is_satisfied: - continue - s2 = rp2.next - # if s2 is a terminal - if s2 not in self.lr0_rules_by_origin: - dr.add(s2) - if s2 in self.NULLABLE: - r.add((next_state, s2)) - - def compute_includes_lookback(self): - for nt in self.nonterminal_transitions: - state, nonterminal = nt - includes = [] - lookback = self.lookback[nt] - for rp in state.closure: - if rp.rule.origin != nonterminal: - continue - # traverse the states for rp(.rule) - state2 = state - for i in range(rp.index, len(rp.rule.expansion)): - s = rp.rule.expansion[i] - nt2 = (state2, s) - state2 = state2.transitions[s] - if nt2 not in self.reads: - continue - for j in range(i + 1, len(rp.rule.expansion)): - if rp.rule.expansion[j] not in self.NULLABLE: - break - else: - includes.append(nt2) - # state2 is at the final state for rp.rule - if rp.index == 0: - for rp2 in state2.closure: - if (rp2.rule == rp.rule) and rp2.is_satisfied: - lookback.add((state2, rp2.rule)) - for nt2 in includes: - self.includes[nt2].add(nt) - - def compute_lookaheads(self): - read_sets = digraph(self.nonterminal_transitions, self.reads, self.directly_reads) - follow_sets = digraph(self.nonterminal_transitions, self.includes, read_sets) - - for nt, lookbacks in self.lookback.items(): - for state, rule in lookbacks: - for s in follow_sets[nt]: - state.lookaheads[s].add(rule) - - def compute_lalr1_states(self) -> None: - m: Dict[LR0ItemSet, Dict[str, Tuple]] = {} - reduce_reduce = [] - for itemset in self.lr0_itemsets: - actions: Dict[Symbol, Tuple] = {la: (Shift, next_state.closure) - for la, next_state in itemset.transitions.items()} - for la, rules in itemset.lookaheads.items(): - if len(rules) > 1: - # Try to resolve conflict based on priority - p = [(r.options.priority or 0, r) for r in rules] - p.sort(key=lambda r: r[0], reverse=True) - best, second_best = p[:2] - if best[0] > second_best[0]: - rules = {best[1]} - else: - reduce_reduce.append((itemset, la, rules)) - continue - - rule ,= rules - if la in actions: - if self.strict: - raise GrammarError(f"Shift/Reduce conflict for terminal {la.name}. 
[strict-mode]\n ") - elif self.debug: - logger.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name) - logger.warning(' * %s', rule) - else: - logger.debug('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name) - logger.debug(' * %s', rule) - else: - actions[la] = (Reduce, rule) - m[itemset] = { k.name: v for k, v in actions.items() } - - if reduce_reduce: - msgs = [] - for itemset, la, rules in reduce_reduce: - msg = 'Reduce/Reduce collision in %s between the following rules: %s' % (la, ''.join([ '\n\t- ' + str(r) for r in rules ])) - if self.debug: - msg += '\n collision occurred in state: {%s\n }' % ''.join(['\n\t' + str(x) for x in itemset.closure]) - msgs.append(msg) - raise GrammarError('\n\n'.join(msgs)) - - states = { k.closure: v for k, v in m.items() } - - # compute end states - end_states: Dict[str, 'State'] = {} - for state in states: - for rp in state: - for start in self.lr0_start_states: - if rp.rule.origin.name == ('$root_' + start) and rp.is_satisfied: - assert start not in end_states - end_states[start] = state - - start_states = { start: state.closure for start, state in self.lr0_start_states.items() } - _parse_table = ParseTable(states, start_states, end_states) - - if self.debug: - self.parse_table = _parse_table - else: - self.parse_table = IntParseTable.from_ParseTable(_parse_table) - - def compute_lalr(self): - self.compute_lr0_states() - self.compute_reads_relations() - self.compute_includes_lookback() - self.compute_lookaheads() - self.compute_lalr1_states() diff --git a/lark/parsers/lalr_interactive_parser.py b/lark/parsers/lalr_interactive_parser.py deleted file mode 100644 index fbf0d1f..0000000 --- a/lark/parsers/lalr_interactive_parser.py +++ /dev/null @@ -1,158 +0,0 @@ -# This module provides a LALR interactive parser, which is used for debugging and error handling - -from typing import Iterator, List -from copy import copy -import warnings - -from lark.exceptions import UnexpectedToken -from lark.lexer import Token, LexerThread -from .lalr_parser_state import ParserState - -###{standalone - -class InteractiveParser: - """InteractiveParser gives you advanced control over parsing and error handling when parsing with LALR. - - For a simpler interface, see the ``on_error`` argument to ``Lark.parse()``. - """ - def __init__(self, parser, parser_state: ParserState, lexer_thread: LexerThread): - self.parser = parser - self.parser_state = parser_state - self.lexer_thread = lexer_thread - self.result = None - - @property - def lexer_state(self) -> LexerThread: - warnings.warn("lexer_state will be removed in subsequent releases. Use lexer_thread instead.", DeprecationWarning) - return self.lexer_thread - - def feed_token(self, token: Token): - """Feed the parser with a token, and advance it to the next state, as if it received it from the lexer. - - Note that ``token`` has to be an instance of ``Token``. - """ - return self.parser_state.feed_token(token, token.type == '$END') - - def iter_parse(self) -> Iterator[Token]: - """Step through the different stages of the parse, by reading tokens from the lexer - and feeding them to the parser, one per iteration. - - Returns an iterator of the tokens it encounters. - - When the parse is over, the resulting tree can be found in ``InteractiveParser.result``. 
- """ - for token in self.lexer_thread.lex(self.parser_state): - yield token - self.result = self.feed_token(token) - - def exhaust_lexer(self) -> List[Token]: - """Try to feed the rest of the lexer state into the interactive parser. - - Note that this modifies the instance in place and does not feed an '$END' Token - """ - return list(self.iter_parse()) - - - def feed_eof(self, last_token=None): - """Feed a '$END' Token. Borrows from 'last_token' if given.""" - eof = Token.new_borrow_pos('$END', '', last_token) if last_token is not None else self.lexer_thread._Token('$END', '', 0, 1, 1) - return self.feed_token(eof) - - - def __copy__(self): - """Create a new interactive parser with a separate state. - - Calls to feed_token() won't affect the old instance, and vice-versa. - """ - return self.copy() - - def copy(self, deepcopy_values=True): - return type(self)( - self.parser, - self.parser_state.copy(deepcopy_values=deepcopy_values), - copy(self.lexer_thread), - ) - - def __eq__(self, other): - if not isinstance(other, InteractiveParser): - return False - - return self.parser_state == other.parser_state and self.lexer_thread == other.lexer_thread - - def as_immutable(self): - """Convert to an ``ImmutableInteractiveParser``.""" - p = copy(self) - return ImmutableInteractiveParser(p.parser, p.parser_state, p.lexer_thread) - - def pretty(self): - """Print the output of ``choices()`` in a way that's easier to read.""" - out = ["Parser choices:"] - for k, v in self.choices().items(): - out.append('\t- %s -> %r' % (k, v)) - out.append('stack size: %s' % len(self.parser_state.state_stack)) - return '\n'.join(out) - - def choices(self): - """Returns a dictionary of token types, matched to their action in the parser. - - Only returns token types that are accepted by the current state. - - Updated by ``feed_token()``. - """ - return self.parser_state.parse_conf.parse_table.states[self.parser_state.position] - - def accepts(self): - """Returns the set of possible tokens that will advance the parser into a new valid state.""" - accepts = set() - conf_no_callbacks = copy(self.parser_state.parse_conf) - # We don't want to call callbacks here since those might have arbitrary side effects - # and are unnecessarily slow. - conf_no_callbacks.callbacks = {} - for t in self.choices(): - if t.isupper(): # is terminal? - new_cursor = self.copy(deepcopy_values=False) - new_cursor.parser_state.parse_conf = conf_no_callbacks - try: - new_cursor.feed_token(self.lexer_thread._Token(t, '')) - except UnexpectedToken: - pass - else: - accepts.add(t) - return accepts - - def resume_parse(self): - """Resume automated parsing from the current state. - """ - return self.parser.parse_from_state(self.parser_state, last_token=self.lexer_thread.state.last_token) - - - -class ImmutableInteractiveParser(InteractiveParser): - """Same as ``InteractiveParser``, but operations create a new instance instead - of changing it in-place. - """ - - result = None - - def __hash__(self): - return hash((self.parser_state, self.lexer_thread)) - - def feed_token(self, token): - c = copy(self) - c.result = InteractiveParser.feed_token(c, token) - return c - - def exhaust_lexer(self): - """Try to feed the rest of the lexer state into the parser. 
- - Note that this returns a new ImmutableInteractiveParser and does not feed an '$END' Token""" - cursor = self.as_mutable() - cursor.exhaust_lexer() - return cursor.as_immutable() - - def as_mutable(self): - """Convert to an ``InteractiveParser``.""" - p = copy(self) - return InteractiveParser(p.parser, p.parser_state, p.lexer_thread) - -###} diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py deleted file mode 100644 index 6ae2a04..0000000 --- a/lark/parsers/lalr_parser.py +++ /dev/null @@ -1,122 +0,0 @@ -"""This module implements a LALR(1) Parser -""" -# Author: Erez Shinan (2017) -# Email : erezshin@gmail.com -from typing import Dict, Any, Optional -from ..lexer import Token, LexerThread -from ..utils import Serialize -from ..common import ParserConf, ParserCallbacks - -from .lalr_analysis import LALR_Analyzer, IntParseTable, ParseTableBase -from .lalr_interactive_parser import InteractiveParser -from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken -from .lalr_parser_state import ParserState, ParseConf - -###{standalone - -class LALR_Parser(Serialize): - def __init__(self, parser_conf: ParserConf, debug: bool=False, strict: bool=False): - analysis = LALR_Analyzer(parser_conf, debug=debug, strict=strict) - analysis.compute_lalr() - callbacks = parser_conf.callbacks - - self._parse_table = analysis.parse_table - self.parser_conf = parser_conf - self.parser = _Parser(analysis.parse_table, callbacks, debug) - - @classmethod - def deserialize(cls, data, memo, callbacks, debug=False): - inst = cls.__new__(cls) - inst._parse_table = IntParseTable.deserialize(data, memo) - inst.parser = _Parser(inst._parse_table, callbacks, debug) - return inst - - def serialize(self, memo: Any = None) -> Dict[str, Any]: - return self._parse_table.serialize(memo) - - def parse_interactive(self, lexer: LexerThread, start: str): - return self.parser.parse(lexer, start, start_interactive=True) - - def parse(self, lexer, start, on_error=None): - try: - return self.parser.parse(lexer, start) - except UnexpectedInput as e: - if on_error is None: - raise - - while True: - if isinstance(e, UnexpectedCharacters): - s = e.interactive_parser.lexer_thread.state - p = s.line_ctr.char_pos - - if not on_error(e): - raise e - - if isinstance(e, UnexpectedCharacters): - # If user didn't change the character position, then we should - if p == s.line_ctr.char_pos: - s.line_ctr.feed(s.text[p:p+1]) - - try: - return e.interactive_parser.resume_parse() - except UnexpectedToken as e2: - if (isinstance(e, UnexpectedToken) - and e.token.type == e2.token.type == '$END' - and e.interactive_parser == e2.interactive_parser): - # Prevent infinite loop - raise e2 - e = e2 - except UnexpectedCharacters as e2: - e = e2 - - -class _Parser: - parse_table: ParseTableBase - callbacks: ParserCallbacks - debug: bool - - def __init__(self, parse_table: ParseTableBase, callbacks: ParserCallbacks, debug: bool=False): - self.parse_table = parse_table - self.callbacks = callbacks - self.debug = debug - - def parse(self, lexer: LexerThread, start: str, value_stack=None, state_stack=None, start_interactive=False): - parse_conf = ParseConf(self.parse_table, self.callbacks, start) - parser_state = ParserState(parse_conf, lexer, state_stack, value_stack) - if start_interactive: - return InteractiveParser(self, parser_state, parser_state.lexer) - return self.parse_from_state(parser_state) - - - def parse_from_state(self, state: ParserState, last_token: Optional[Token]=None): - """Run the main LALR parser loop 
- - Parameters: - state - the initial state. Changed in-place. - last_token - Used only for line information in case of an empty lexer. - """ - try: - token = last_token - for token in state.lexer.lex(state): - assert token is not None - state.feed_token(token) - - end_token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1) - return state.feed_token(end_token, True) - except UnexpectedInput as e: - try: - e.interactive_parser = InteractiveParser(self, state, state.lexer) - except NameError: - pass - raise e - except Exception as e: - if self.debug: - print("") - print("STATE STACK DUMP") - print("----------------") - for i, s in enumerate(state.state_stack): - print('%d)' % i , s) - print("") - - raise -###} diff --git a/lark/parsers/lalr_parser_state.py b/lark/parsers/lalr_parser_state.py deleted file mode 100644 index 273bc00..0000000 --- a/lark/parsers/lalr_parser_state.py +++ /dev/null @@ -1,110 +0,0 @@ -from copy import deepcopy, copy -from typing import Dict, Any, Generic, List -from ..lexer import Token, LexerThread -from ..common import ParserCallbacks - -from .lalr_analysis import Shift, ParseTableBase, StateT -from lark.exceptions import UnexpectedToken - -###{standalone - -class ParseConf(Generic[StateT]): - __slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states' - - parse_table: ParseTableBase[StateT] - callbacks: ParserCallbacks - start: str - - start_state: StateT - end_state: StateT - states: Dict[StateT, Dict[str, tuple]] - - def __init__(self, parse_table: ParseTableBase[StateT], callbacks: ParserCallbacks, start: str): - self.parse_table = parse_table - - self.start_state = self.parse_table.start_states[start] - self.end_state = self.parse_table.end_states[start] - self.states = self.parse_table.states - - self.callbacks = callbacks - self.start = start - -class ParserState(Generic[StateT]): - __slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack' - - parse_conf: ParseConf[StateT] - lexer: LexerThread - state_stack: List[StateT] - value_stack: list - - def __init__(self, parse_conf: ParseConf[StateT], lexer: LexerThread, state_stack=None, value_stack=None): - self.parse_conf = parse_conf - self.lexer = lexer - self.state_stack = state_stack or [self.parse_conf.start_state] - self.value_stack = value_stack or [] - - @property - def position(self) -> StateT: - return self.state_stack[-1] - - # Necessary for match_examples() to work - def __eq__(self, other) -> bool: - if not isinstance(other, ParserState): - return NotImplemented - return len(self.state_stack) == len(other.state_stack) and self.position == other.position - - def __copy__(self): - return self.copy() - - def copy(self, deepcopy_values=True) -> 'ParserState[StateT]': - return type(self)( - self.parse_conf, - self.lexer, # XXX copy - copy(self.state_stack), - deepcopy(self.value_stack) if deepcopy_values else copy(self.value_stack), - ) - - def feed_token(self, token: Token, is_end=False) -> Any: - state_stack = self.state_stack - value_stack = self.value_stack - states = self.parse_conf.states - end_state = self.parse_conf.end_state - callbacks = self.parse_conf.callbacks - - while True: - state = state_stack[-1] - try: - action, arg = states[state][token.type] - except KeyError: - expected = {s for s in states[state].keys() if s.isupper()} - raise UnexpectedToken(token, expected, state=self, interactive_parser=None) - - assert arg != end_state - - if action is Shift: - # shift once and return - assert not is_end - state_stack.append(arg) 
- value_stack.append(token if token.type not in callbacks else callbacks[token.type](token)) - return - else: - # reduce+shift as many times as necessary - rule = arg - size = len(rule.expansion) - if size: - s = value_stack[-size:] - del state_stack[-size:] - del value_stack[-size:] - else: - s = [] - - value = callbacks[rule](s) if callbacks else s - - _action, new_state = states[state_stack[-1]][rule.origin.name] - assert _action is Shift - state_stack.append(new_state) - value_stack.append(value) - - if is_end and state_stack[-1] == end_state: - return value_stack[-1] -###} diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py deleted file mode 100644 index 13d592d..0000000 --- a/lark/parsers/xearley.py +++ /dev/null @@ -1,166 +0,0 @@ -"""This module implements an Earley parser with a dynamic lexer - -The core Earley algorithm used here is based on Elizabeth Scott's implementation, here: - https://www.sciencedirect.com/science/article/pii/S1571066108001497 - -That is probably the best reference for understanding the algorithm here. - -The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format -is better documented here: - http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ - -Instead of running a lexer beforehand, or using a costy char-by-char method, this parser -uses regular expressions by necessity, achieving high-performance while maintaining all of -Earley's power in parsing any CFG. -""" - -from typing import TYPE_CHECKING, Callable, Optional, List, Any -from collections import defaultdict - -from ..tree import Tree -from ..exceptions import UnexpectedCharacters -from ..lexer import Token -from ..grammar import Terminal -from .earley import Parser as BaseParser -from .earley_forest import TokenNode - -if TYPE_CHECKING: - from ..common import LexerConf, ParserConf - -class Parser(BaseParser): - def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matcher: Callable, - resolve_ambiguity: bool=True, complete_lex: bool=False, debug: bool=False, - tree_class: Optional[Callable[[str, List], Any]]=Tree, ordered_sets: bool=True): - BaseParser.__init__(self, lexer_conf, parser_conf, term_matcher, resolve_ambiguity, - debug, tree_class, ordered_sets) - self.ignore = [Terminal(t) for t in lexer_conf.ignore] - self.complete_lex = complete_lex - - def _parse(self, stream, columns, to_scan, start_symbol=None): - - def scan(i, to_scan): - """The core Earley Scanner. - - This is a custom implementation of the scanner that uses the - Lark lexer to match tokens. The scan list is built by the - Earley predictor, based on the previously completed tokens. - This ensures that at each phase of the parse we have a custom - lexer context, allowing for more complex ambiguities.""" - - node_cache = {} - - # 1) Loop the expectations and ask the lexer to match. - # Since regexp is forward looking on the input stream, and we only - # want to process tokens when we hit the point in the stream at which - # they complete, we push all tokens into a buffer (delayed_matches), to - # be held possibly for a later parse step when we reach the point in the - # input stream at which they complete. 
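At the API level, this scanner is what the `lexer='dynamic'` and `lexer='dynamic_complete'` modes of the Earley front end use; `dynamic_complete` appears to correspond to `complete_lex=True` above, keeping every prefix match of a terminal rather than only the longest one. A minimal sketch (grammar and input are illustrative only):

    from lark import Lark

    grammar = '''
        start: WORD+
        %import common.WORD
        %import common.WS
        %ignore WS
    '''

    parser = Lark(grammar, parser='earley', lexer='dynamic_complete')
    print(parser.parse("ab cd").pretty())
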
- for item in self.Set(to_scan): - m = match(item.expect, stream, i) - if m: - t = Token(item.expect.name, m.group(0), i, text_line, text_column) - delayed_matches[m.end()].append( (item, i, t) ) - - if self.complete_lex: - s = m.group(0) - for j in range(1, len(s)): - m = match(item.expect, s[:-j]) - if m: - t = Token(item.expect.name, m.group(0), i, text_line, text_column) - delayed_matches[i+m.end()].append( (item, i, t) ) - - # XXX The following 3 lines were commented out for causing a bug. See issue #768 - # # Remove any items that successfully matched in this pass from the to_scan buffer. - # # This ensures we don't carry over tokens that already matched, if we're ignoring below. - # to_scan.remove(item) - - # 3) Process any ignores. This is typically used for e.g. whitespace. - # We carry over any unmatched items from the to_scan buffer to be matched again after - # the ignore. This should allow us to use ignored symbols in non-terminals to implement - # e.g. mandatory spacing. - for x in self.ignore: - m = match(x, stream, i) - if m: - # Carry over any items still in the scan buffer, to past the end of the ignored items. - delayed_matches[m.end()].extend([(item, i, None) for item in to_scan ]) - - # If we're ignoring up to the end of the file, # carry over the start symbol if it already completed. - delayed_matches[m.end()].extend([(item, i, None) for item in columns[i] if item.is_complete and item.s == start_symbol]) - - next_to_scan = self.Set() - next_set = self.Set() - columns.append(next_set) - transitives.append({}) - - ## 4) Process Tokens from delayed_matches. - # This is the core of the Earley scanner. Create an SPPF node for each Token, - # and create the symbol node in the SPPF tree. Advance the item that completed, - # and add the resulting new item to either the Earley set (for processing by the - # completer/predictor) or the to_scan buffer for the next parse step. - for item, start, token in delayed_matches[i+1]: - if token is not None: - token.end_line = text_line - token.end_column = text_column + 1 - token.end_pos = i + 1 - - new_item = item.advance() - label = (new_item.s, new_item.start, i + 1) - token_node = TokenNode(token, terminals[token.type]) - new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label)) - new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token_node) - else: - new_item = item - - if new_item.expect in self.TERMINALS: - # add (B ::= Aai+1.B, h, y) to Q' - next_to_scan.add(new_item) - else: - # add (B ::= Aa+1.B, h, y) to Ei+1 - next_set.add(new_item) - - del delayed_matches[i+1] # No longer needed, so unburden memory - - if not next_set and not delayed_matches and not next_to_scan: - considered_rules = list(sorted(to_scan, key=lambda key: key.rule.origin.name)) - raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan}, - set(to_scan), state=frozenset(i.s for i in to_scan), - considered_rules=considered_rules - ) - - return next_to_scan, node_cache - - - delayed_matches = defaultdict(list) - match = self.term_matcher - terminals = self.lexer_conf.terminals_by_name - - # Cache for nodes & tokens created in a particular parse step. - transitives = [{}] - - text_line = 1 - text_column = 1 - - ## The main Earley loop. - # Run the Prediction/Completion cycle for any Items in the current Earley set. 
- # Completions will be added to the SPPF tree, and predictions will be recursively - # processed down to terminals/empty nodes to be added to the scanner for the next - # step. - i = 0 - node_cache = {} - for token in stream: - self.predict_and_complete(i, to_scan, columns, transitives, node_cache) - - to_scan, node_cache = scan(i, to_scan) - - if token == '\n': - text_line += 1 - text_column = 1 - else: - text_column += 1 - i += 1 - - self.predict_and_complete(i, to_scan, columns, transitives, node_cache) - - ## Column is now the final column in the parse. - assert i == len(columns)-1 - return to_scan diff --git a/lark/py.typed b/lark/py.typed deleted file mode 100644 index e69de29..0000000 diff --git a/lark/reconstruct.py b/lark/reconstruct.py deleted file mode 100644 index 2d8423a..0000000 --- a/lark/reconstruct.py +++ /dev/null @@ -1,107 +0,0 @@ -"""This is an experimental tool for reconstructing text from a shaped tree, based on a Lark grammar. -""" - -from typing import Dict, Callable, Iterable, Optional - -from .lark import Lark -from .tree import Tree, ParseTree -from .visitors import Transformer_InPlace -from .lexer import Token, PatternStr, TerminalDef -from .grammar import Terminal, NonTerminal, Symbol - -from .tree_matcher import TreeMatcher, is_discarded_terminal -from .utils import is_id_continue - -def is_iter_empty(i): - try: - _ = next(i) - return False - except StopIteration: - return True - - -class WriteTokensTransformer(Transformer_InPlace): - "Inserts discarded tokens into their correct place, according to the rules of grammar" - - tokens: Dict[str, TerminalDef] - term_subs: Dict[str, Callable[[Symbol], str]] - - def __init__(self, tokens: Dict[str, TerminalDef], term_subs: Dict[str, Callable[[Symbol], str]]) -> None: - self.tokens = tokens - self.term_subs = term_subs - - def __default__(self, data, children, meta): - if not getattr(meta, 'match_tree', False): - return Tree(data, children) - - iter_args = iter(children) - to_write = [] - for sym in meta.orig_expansion: - if is_discarded_terminal(sym): - try: - v = self.term_subs[sym.name](sym) - except KeyError: - t = self.tokens[sym.name] - if not isinstance(t.pattern, PatternStr): - raise NotImplementedError("Reconstructing regexps not supported yet: %s" % t) - - v = t.pattern.value - to_write.append(v) - else: - x = next(iter_args) - if isinstance(x, list): - to_write += x - else: - if isinstance(x, Token): - assert Terminal(x.type) == sym, x - else: - assert NonTerminal(x.data) == sym, (sym, x) - to_write.append(x) - - assert is_iter_empty(iter_args) - return to_write - - -class Reconstructor(TreeMatcher): - """ - A Reconstructor that will, given a full parse Tree, generate source code. - - Note: - The reconstructor cannot generate values from regexps. If you need to produce discarded - regexes, such as newlines, use `term_subs` and provide default values for them. 
- - Parameters: - parser: a Lark instance - term_subs: a dictionary of [Terminal name as str] to [output text as str] - """ - - write_tokens: WriteTokensTransformer - - def __init__(self, parser: Lark, term_subs: Optional[Dict[str, Callable[[Symbol], str]]]=None) -> None: - TreeMatcher.__init__(self, parser) - - self.write_tokens = WriteTokensTransformer({t.name:t for t in self.tokens}, term_subs or {}) - - def _reconstruct(self, tree): - unreduced_tree = self.match_tree(tree, tree.data) - - res = self.write_tokens.transform(unreduced_tree) - for item in res: - if isinstance(item, Tree): - # TODO use orig_expansion.rulename to support templates - yield from self._reconstruct(item) - else: - yield item - - def reconstruct(self, tree: ParseTree, postproc: Optional[Callable[[Iterable[str]], Iterable[str]]]=None, insert_spaces: bool=True) -> str: - x = self._reconstruct(tree) - if postproc: - x = postproc(x) - y = [] - prev_item = '' - for item in x: - if insert_spaces and prev_item and item and is_id_continue(prev_item[-1]) and is_id_continue(item[0]): - y.append(' ') - y.append(item) - prev_item = item - return ''.join(y) diff --git a/lark/tools/__init__.py b/lark/tools/__init__.py deleted file mode 100644 index eeb40e1..0000000 --- a/lark/tools/__init__.py +++ /dev/null @@ -1,70 +0,0 @@ -import sys -from argparse import ArgumentParser, FileType -from textwrap import indent -from logging import DEBUG, INFO, WARN, ERROR -from typing import Optional -import warnings - -from lark import Lark, logger -try: - from interegular import logger as interegular_logger - has_interegular = True -except ImportError: - has_interegular = False - -lalr_argparser = ArgumentParser(add_help=False, epilog='Look at the Lark documentation for more info on the options') - -flags = [ - ('d', 'debug'), - 'keep_all_tokens', - 'regex', - 'propagate_positions', - 'maybe_placeholders', - 'use_bytes' -] - -options = ['start', 'lexer'] - -lalr_argparser.add_argument('-v', '--verbose', action='count', default=0, help="Increase Logger output level, up to three times") -lalr_argparser.add_argument('-s', '--start', action='append', default=[]) -lalr_argparser.add_argument('-l', '--lexer', default='contextual', choices=('basic', 'contextual')) -lalr_argparser.add_argument('-o', '--out', type=FileType('w', encoding='utf-8'), default=sys.stdout, help='the output file (default=stdout)') -lalr_argparser.add_argument('grammar_file', type=FileType('r', encoding='utf-8'), help='A valid .lark file') - -for flag in flags: - if isinstance(flag, tuple): - options.append(flag[1]) - lalr_argparser.add_argument('-' + flag[0], '--' + flag[1], action='store_true') - elif isinstance(flag, str): - options.append(flag) - lalr_argparser.add_argument('--' + flag, action='store_true') - else: - raise NotImplementedError("flags must only contain strings or tuples of strings") - - -def build_lalr(namespace): - logger.setLevel((ERROR, WARN, INFO, DEBUG)[min(namespace.verbose, 3)]) - if has_interegular: - interegular_logger.setLevel(logger.getEffectiveLevel()) - if len(namespace.start) == 0: - namespace.start.append('start') - kwargs = {n: getattr(namespace, n) for n in options} - return Lark(namespace.grammar_file, parser='lalr', **kwargs), namespace.out - - -def showwarning_as_comment(message, category, filename, lineno, file=None, line=None): - # Based on warnings._showwarnmsg_impl - text = warnings.formatwarning(message, category, filename, lineno, line) - text = indent(text, '# ') - if file is None: - file = sys.stderr - if file is None: - return - 
try: - file.write(text) - except OSError: - pass - - -def make_warnings_comments(): - warnings.showwarning = showwarning_as_comment diff --git a/lark/tools/nearley.py b/lark/tools/nearley.py deleted file mode 100644 index 1fc27d5..0000000 --- a/lark/tools/nearley.py +++ /dev/null @@ -1,202 +0,0 @@ -"Converts Nearley grammars to Lark" - -import os.path -import sys -import codecs -import argparse - - -from lark import Lark, Transformer, v_args - -nearley_grammar = r""" - start: (ruledef|directive)+ - - directive: "@" NAME (STRING|NAME) - | "@" JS -> js_code - ruledef: NAME "->" expansions - | NAME REGEXP "->" expansions -> macro - expansions: expansion ("|" expansion)* - - expansion: expr+ js - - ?expr: item (":" /[+*?]/)? - - ?item: rule|string|regexp|null - | "(" expansions ")" - - rule: NAME - string: STRING - regexp: REGEXP - null: "null" - JS: /{%.*?%}/s - js: JS? - - NAME: /[a-zA-Z_$]\w*/ - COMMENT: /#[^\n]*/ - REGEXP: /\[.*?\]/ - - STRING: _STRING "i"? - - %import common.ESCAPED_STRING -> _STRING - %import common.WS - %ignore WS - %ignore COMMENT - - """ - -nearley_grammar_parser = Lark(nearley_grammar, parser='earley', lexer='basic') - -def _get_rulename(name): - name = {'_': '_ws_maybe', '__': '_ws'}.get(name, name) - return 'n_' + name.replace('$', '__DOLLAR__').lower() - -@v_args(inline=True) -class NearleyToLark(Transformer): - def __init__(self): - self._count = 0 - self.extra_rules = {} - self.extra_rules_rev = {} - self.alias_js_code = {} - - def _new_function(self, code): - name = 'alias_%d' % self._count - self._count += 1 - - self.alias_js_code[name] = code - return name - - def _extra_rule(self, rule): - if rule in self.extra_rules_rev: - return self.extra_rules_rev[rule] - - name = 'xrule_%d' % len(self.extra_rules) - assert name not in self.extra_rules - self.extra_rules[name] = rule - self.extra_rules_rev[rule] = name - return name - - def rule(self, name): - return _get_rulename(name) - - def ruledef(self, name, exps): - return '!%s: %s' % (_get_rulename(name), exps) - - def expr(self, item, op): - rule = '(%s)%s' % (item, op) - return self._extra_rule(rule) - - def regexp(self, r): - return '/%s/' % r - - def null(self): - return '' - - def string(self, s): - return self._extra_rule(s) - - def expansion(self, *x): - x, js = x[:-1], x[-1] - if js.children: - js_code ,= js.children - js_code = js_code[2:-2] - alias = '-> ' + self._new_function(js_code) - else: - alias = '' - return ' '.join(x) + alias - - def expansions(self, *x): - return '%s' % ('\n |'.join(x)) - - def start(self, *rules): - return '\n'.join(filter(None, rules)) - -def _nearley_to_lark(g, builtin_path, n2l, js_code, folder_path, includes): - rule_defs = [] - - tree = nearley_grammar_parser.parse(g) - for statement in tree.children: - if statement.data == 'directive': - directive, arg = statement.children - if directive in ('builtin', 'include'): - folder = builtin_path if directive == 'builtin' else folder_path - path = os.path.join(folder, arg[1:-1]) - if path not in includes: - includes.add(path) - with codecs.open(path, encoding='utf8') as f: - text = f.read() - rule_defs += _nearley_to_lark(text, builtin_path, n2l, js_code, os.path.abspath(os.path.dirname(path)), includes) - else: - assert False, directive - elif statement.data == 'js_code': - code ,= statement.children - code = code[2:-2] - js_code.append(code) - elif statement.data == 'macro': - pass # TODO Add support for macros! 
- elif statement.data == 'ruledef': - rule_defs.append(n2l.transform(statement)) - else: - raise Exception("Unknown statement: %s" % statement) - - return rule_defs - - -def create_code_for_nearley_grammar(g, start, builtin_path, folder_path, es6=False): - import js2py - - emit_code = [] - def emit(x=None): - if x: - emit_code.append(x) - emit_code.append('\n') - - js_code = ['function id(x) {return x[0];}'] - n2l = NearleyToLark() - rule_defs = _nearley_to_lark(g, builtin_path, n2l, js_code, folder_path, set()) - lark_g = '\n'.join(rule_defs) - lark_g += '\n'+'\n'.join('!%s: %s' % item for item in n2l.extra_rules.items()) - - emit('from lark import Lark, Transformer') - emit() - emit('grammar = ' + repr(lark_g)) - emit() - - for alias, code in n2l.alias_js_code.items(): - js_code.append('%s = (%s);' % (alias, code)) - - if es6: - emit(js2py.translate_js6('\n'.join(js_code))) - else: - emit(js2py.translate_js('\n'.join(js_code))) - emit('class TransformNearley(Transformer):') - for alias in n2l.alias_js_code: - emit(" %s = var.get('%s').to_python()" % (alias, alias)) - emit(" __default__ = lambda self, n, c, m: c if c else None") - - emit() - emit('parser = Lark(grammar, start="n_%s", maybe_placeholders=False)' % start) - emit('def parse(text):') - emit(' return TransformNearley().transform(parser.parse(text))') - - return ''.join(emit_code) - -def main(fn, start, nearley_lib, es6=False): - with codecs.open(fn, encoding='utf8') as f: - grammar = f.read() - return create_code_for_nearley_grammar(grammar, start, os.path.join(nearley_lib, 'builtin'), os.path.abspath(os.path.dirname(fn)), es6=es6) - -def get_arg_parser(): - parser = argparse.ArgumentParser(description='Reads a Nearley grammar (with js functions), and outputs an equivalent lark parser.') - parser.add_argument('nearley_grammar', help='Path to the file containing the nearley grammar') - parser.add_argument('start_rule', help='Rule within the nearley grammar to make the base rule') - parser.add_argument('nearley_lib', help='Path to root directory of nearley codebase (used for including builtins)') - parser.add_argument('--es6', help='Enable experimental ES6 support', action='store_true') - return parser - -if __name__ == '__main__': - parser = get_arg_parser() - if len(sys.argv) == 1: - parser.print_help(sys.stderr) - sys.exit(1) - args = parser.parse_args() - print(main(fn=args.nearley_grammar, start=args.start_rule, nearley_lib=args.nearley_lib, es6=args.es6)) diff --git a/lark/tools/serialize.py b/lark/tools/serialize.py deleted file mode 100644 index eb28824..0000000 --- a/lark/tools/serialize.py +++ /dev/null @@ -1,32 +0,0 @@ -import sys -import json - -from lark.grammar import Rule -from lark.lexer import TerminalDef -from lark.tools import lalr_argparser, build_lalr - -import argparse - -argparser = argparse.ArgumentParser(prog='python -m lark.tools.serialize', parents=[lalr_argparser], - description="Lark Serialization Tool - Stores Lark's internal state & LALR analysis as a JSON file", - epilog='Look at the Lark documentation for more info on the options') - - -def serialize(lark_inst, outfile): - data, memo = lark_inst.memo_serialize([TerminalDef, Rule]) - outfile.write('{\n') - outfile.write(' "data": %s,\n' % json.dumps(data)) - outfile.write(' "memo": %s\n' % json.dumps(memo)) - outfile.write('}\n') - - -def main(): - if len(sys.argv)==1: - argparser.print_help(sys.stderr) - sys.exit(1) - ns = argparser.parse_args() - serialize(*build_lalr(ns)) - - -if __name__ == '__main__': - main() diff --git 
a/lark/tools/standalone.py b/lark/tools/standalone.py deleted file mode 100644 index 9940ccb..0000000 --- a/lark/tools/standalone.py +++ /dev/null @@ -1,196 +0,0 @@ -###{standalone -# -# -# Lark Stand-alone Generator Tool -# ---------------------------------- -# Generates a stand-alone LALR(1) parser -# -# Git: https://github.com/erezsh/lark -# Author: Erez Shinan (erezshin@gmail.com) -# -# -# >>> LICENSE -# -# This tool and its generated code use a separate license from Lark, -# and are subject to the terms of the Mozilla Public License, v. 2.0. -# If a copy of the MPL was not distributed with this -# file, You can obtain one at https://mozilla.org/MPL/2.0/. -# -# If you wish to purchase a commercial license for this tool and its -# generated code, you may contact me via email or otherwise. -# -# If MPL2 is incompatible with your free or open-source project, -# contact me and we'll work it out. -# -# - -from copy import deepcopy -from abc import ABC, abstractmethod -from types import ModuleType -from typing import ( - TypeVar, Generic, Type, Tuple, List, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any, - Union, Iterable, IO, TYPE_CHECKING, overload, Sequence, - Pattern as REPattern, ClassVar, Set, Mapping -) -###} - -import sys -import token, tokenize -import os -from os import path -from collections import defaultdict -from functools import partial -from argparse import ArgumentParser - -import lark -from lark.tools import lalr_argparser, build_lalr, make_warnings_comments - - -from lark.grammar import Rule -from lark.lexer import TerminalDef - -_dir = path.dirname(__file__) -_larkdir = path.join(_dir, path.pardir) - - -EXTRACT_STANDALONE_FILES = [ - 'tools/standalone.py', - 'exceptions.py', - 'utils.py', - 'tree.py', - 'visitors.py', - 'grammar.py', - 'lexer.py', - 'common.py', - 'parse_tree_builder.py', - 'parsers/lalr_analysis.py', - 'parsers/lalr_parser_state.py', - 'parsers/lalr_parser.py', - 'parsers/lalr_interactive_parser.py', - 'parser_frontends.py', - 'lark.py', - 'indenter.py', -] - -def extract_sections(lines): - section = None - text = [] - sections = defaultdict(list) - for line in lines: - if line.startswith('###'): - if line[3] == '{': - section = line[4:].strip() - elif line[3] == '}': - sections[section] += text - section = None - text = [] - else: - raise ValueError(line) - elif section: - text.append(line) - - return {name: ''.join(text) for name, text in sections.items()} - - -def strip_docstrings(line_gen): - """ Strip comments and docstrings from a file. 
- Based on code from: https://stackoverflow.com/questions/1769332/script-to-remove-python-comments-docstrings - """ - res = [] - - prev_toktype = token.INDENT - last_lineno = -1 - last_col = 0 - - tokgen = tokenize.generate_tokens(line_gen) - for toktype, ttext, (slineno, scol), (elineno, ecol), ltext in tokgen: - if slineno > last_lineno: - last_col = 0 - if scol > last_col: - res.append(" " * (scol - last_col)) - if toktype == token.STRING and prev_toktype == token.INDENT: - # Docstring - res.append("#--") - elif toktype == tokenize.COMMENT: - # Comment - res.append("##\n") - else: - res.append(ttext) - prev_toktype = toktype - last_col = ecol - last_lineno = elineno - - return ''.join(res) - - -def gen_standalone(lark_inst, output=None, out=sys.stdout, compress=False): - if output is None: - output = partial(print, file=out) - - import pickle, zlib, base64 - def compressed_output(obj): - s = pickle.dumps(obj, pickle.HIGHEST_PROTOCOL) - c = zlib.compress(s) - output(repr(base64.b64encode(c))) - - def output_decompress(name): - output('%(name)s = pickle.loads(zlib.decompress(base64.b64decode(%(name)s)))' % locals()) - - output('# The file was automatically generated by Lark v%s' % lark.__version__) - output('__version__ = "%s"' % lark.__version__) - output() - - for i, pyfile in enumerate(EXTRACT_STANDALONE_FILES): - with open(os.path.join(_larkdir, pyfile)) as f: - code = extract_sections(f)['standalone'] - if i: # if not this file - code = strip_docstrings(partial(next, iter(code.splitlines(True)))) - output(code) - - data, m = lark_inst.memo_serialize([TerminalDef, Rule]) - output('import pickle, zlib, base64') - if compress: - output('DATA = (') - compressed_output(data) - output(')') - output_decompress('DATA') - output('MEMO = (') - compressed_output(m) - output(')') - output_decompress('MEMO') - else: - output('DATA = (') - output(data) - output(')') - output('MEMO = (') - output(m) - output(')') - - - output('Shift = 0') - output('Reduce = 1') - output("def Lark_StandAlone(**kwargs):") - output(" return Lark._load_from_dict(DATA, MEMO, **kwargs)") - - - - -def main(): - make_warnings_comments() - parser = ArgumentParser(prog="prog='python -m lark.tools.standalone'", description="Lark Stand-alone Generator Tool", - parents=[lalr_argparser], epilog='Look at the Lark documentation for more info on the options') - parser.add_argument('-c', '--compress', action='store_true', default=0, help="Enable compression") - if len(sys.argv) == 1: - parser.print_help(sys.stderr) - sys.exit(1) - ns = parser.parse_args() - - lark_inst, out = build_lalr(ns) - gen_standalone(lark_inst, out=out, compress=ns.compress) - - ns.out.close() - ns.grammar_file.close() - - -if __name__ == '__main__': - main() diff --git a/lark/tree.py b/lark/tree.py deleted file mode 100644 index 76f8738..0000000 --- a/lark/tree.py +++ /dev/null @@ -1,267 +0,0 @@ -import sys -from copy import deepcopy - -from typing import List, Callable, Iterator, Union, Optional, Generic, TypeVar, TYPE_CHECKING - -if TYPE_CHECKING: - from .lexer import TerminalDef, Token - try: - import rich - except ImportError: - pass - from typing import Literal - -###{standalone - -class Meta: - - empty: bool - line: int - column: int - start_pos: int - end_line: int - end_column: int - end_pos: int - orig_expansion: 'List[TerminalDef]' - match_tree: bool - - def __init__(self): - self.empty = True - - -_Leaf_T = TypeVar("_Leaf_T") -Branch = Union[_Leaf_T, 'Tree[_Leaf_T]'] - - -class Tree(Generic[_Leaf_T]): - """The main tree class. 
- - Creates a new tree, and stores "data" and "children" in attributes of the same name. - Trees can be hashed and compared. - - Parameters: - data: The name of the rule or alias - children: List of matched sub-rules and terminals - meta: Line & Column numbers (if ``propagate_positions`` is enabled). - meta attributes: (line, column, end_line, end_column, start_pos, end_pos, - container_line, container_column, container_end_line, container_end_column) - container_* attributes consider all symbols, including those that have been inlined in the tree. - For example, in the rule 'a: _A B _C', the regular attributes will mark the start and end of B, - but the container_* attributes will also include _A and _C in the range. However, rules that - contain 'a' will consider it in full, including _A and _C for all attributes. - """ - - data: str - children: 'List[Branch[_Leaf_T]]' - - def __init__(self, data: str, children: 'List[Branch[_Leaf_T]]', meta: Optional[Meta]=None) -> None: - self.data = data - self.children = children - self._meta = meta - - @property - def meta(self) -> Meta: - if self._meta is None: - self._meta = Meta() - return self._meta - - def __repr__(self): - return 'Tree(%r, %r)' % (self.data, self.children) - - def _pretty_label(self): - return self.data - - def _pretty(self, level, indent_str): - yield f'{indent_str*level}{self._pretty_label()}' - if len(self.children) == 1 and not isinstance(self.children[0], Tree): - yield f'\t{self.children[0]}\n' - else: - yield '\n' - for n in self.children: - if isinstance(n, Tree): - yield from n._pretty(level+1, indent_str) - else: - yield f'{indent_str*(level+1)}{n}\n' - - def pretty(self, indent_str: str=' ') -> str: - """Returns an indented string representation of the tree. - - Great for debugging. - """ - return ''.join(self._pretty(0, indent_str)) - - def __rich__(self, parent:Optional['rich.tree.Tree']=None) -> 'rich.tree.Tree': - """Returns a tree widget for the 'rich' library. - - Example: - :: - from rich import print - from lark import Tree - - tree = Tree('root', ['node1', 'node2']) - print(tree) - """ - return self._rich(parent) - - def _rich(self, parent): - if parent: - tree = parent.add(f'[bold]{self.data}[/bold]') - else: - import rich.tree - tree = rich.tree.Tree(self.data) - - for c in self.children: - if isinstance(c, Tree): - c._rich(tree) - else: - tree.add(f'[green]{c}[/green]') - - return tree - - def __eq__(self, other): - try: - return self.data == other.data and self.children == other.children - except AttributeError: - return False - - def __ne__(self, other): - return not (self == other) - - def __hash__(self) -> int: - return hash((self.data, tuple(self.children))) - - def iter_subtrees(self) -> 'Iterator[Tree[_Leaf_T]]': - """Depth-first iteration. - - Iterates over all the subtrees, never returning to the same node twice (Lark's parse-tree is actually a DAG). - """ - queue = [self] - subtrees = dict() - for subtree in queue: - subtrees[id(subtree)] = subtree - queue += [c for c in reversed(subtree.children) - if isinstance(c, Tree) and id(c) not in subtrees] - - del queue - return reversed(list(subtrees.values())) - - def iter_subtrees_topdown(self): - """Breadth-first iteration. - - Iterates over all the subtrees, return nodes in order like pretty() does. 
- """ - stack = [self] - stack_append = stack.append - stack_pop = stack.pop - while stack: - node = stack_pop() - if not isinstance(node, Tree): - continue - yield node - for child in reversed(node.children): - stack_append(child) - - def find_pred(self, pred: 'Callable[[Tree[_Leaf_T]], bool]') -> 'Iterator[Tree[_Leaf_T]]': - """Returns all nodes of the tree that evaluate pred(node) as true.""" - return filter(pred, self.iter_subtrees()) - - def find_data(self, data: str) -> 'Iterator[Tree[_Leaf_T]]': - """Returns all nodes of the tree whose data equals the given data.""" - return self.find_pred(lambda t: t.data == data) - -###} - - def expand_kids_by_data(self, *data_values): - """Expand (inline) children with any of the given data values. Returns True if anything changed""" - changed = False - for i in range(len(self.children)-1, -1, -1): - child = self.children[i] - if isinstance(child, Tree) and child.data in data_values: - self.children[i:i+1] = child.children - changed = True - return changed - - - def scan_values(self, pred: 'Callable[[Branch[_Leaf_T]], bool]') -> Iterator[_Leaf_T]: - """Return all values in the tree that evaluate pred(value) as true. - - This can be used to find all the tokens in the tree. - - Example: - >>> all_tokens = tree.scan_values(lambda v: isinstance(v, Token)) - """ - for c in self.children: - if isinstance(c, Tree): - for t in c.scan_values(pred): - yield t - else: - if pred(c): - yield c - - def __deepcopy__(self, memo): - return type(self)(self.data, deepcopy(self.children, memo), meta=self._meta) - - def copy(self) -> 'Tree[_Leaf_T]': - return type(self)(self.data, self.children) - - def set(self, data: str, children: 'List[Branch[_Leaf_T]]') -> None: - self.data = data - self.children = children - - -ParseTree = Tree['Token'] - - -class SlottedTree(Tree): - __slots__ = 'data', 'children', 'rule', '_meta' - - -def pydot__tree_to_png(tree: Tree, filename: str, rankdir: 'Literal["TB", "LR", "BT", "RL"]'="LR", **kwargs) -> None: - graph = pydot__tree_to_graph(tree, rankdir, **kwargs) - graph.write_png(filename) - - -def pydot__tree_to_dot(tree: Tree, filename, rankdir="LR", **kwargs): - graph = pydot__tree_to_graph(tree, rankdir, **kwargs) - graph.write(filename) - - -def pydot__tree_to_graph(tree: Tree, rankdir="LR", **kwargs): - """Creates a colorful image that represents the tree (data+children, without meta) - - Possible values for `rankdir` are "TB", "LR", "BT", "RL", corresponding to - directed graphs drawn from top to bottom, from left to right, from bottom to - top, and from right to left, respectively. - - `kwargs` can be any graph attribute (e. g. `dpi=200`). For a list of - possible attributes, see https://www.graphviz.org/doc/info/attrs.html. 
- """ - - import pydot # type: ignore[import-not-found] - graph = pydot.Dot(graph_type='digraph', rankdir=rankdir, **kwargs) - - i = [0] - - def new_leaf(leaf): - node = pydot.Node(i[0], label=repr(leaf)) - i[0] += 1 - graph.add_node(node) - return node - - def _to_pydot(subtree): - color = hash(subtree.data) & 0xffffff - color |= 0x808080 - - subnodes = [_to_pydot(child) if isinstance(child, Tree) else new_leaf(child) - for child in subtree.children] - node = pydot.Node(i[0], style="filled", fillcolor="#%x" % color, label=subtree.data) - i[0] += 1 - graph.add_node(node) - - for subnode in subnodes: - graph.add_edge(pydot.Edge(node, subnode)) - - return node - - _to_pydot(tree) - return graph diff --git a/lark/tree_matcher.py b/lark/tree_matcher.py deleted file mode 100644 index 0f42652..0000000 --- a/lark/tree_matcher.py +++ /dev/null @@ -1,186 +0,0 @@ -"""Tree matcher based on Lark grammar""" - -import re -from collections import defaultdict - -from . import Tree, Token -from .common import ParserConf -from .parsers import earley -from .grammar import Rule, Terminal, NonTerminal - - -def is_discarded_terminal(t): - return t.is_term and t.filter_out - - -class _MakeTreeMatch: - def __init__(self, name, expansion): - self.name = name - self.expansion = expansion - - def __call__(self, args): - t = Tree(self.name, args) - t.meta.match_tree = True - t.meta.orig_expansion = self.expansion - return t - - -def _best_from_group(seq, group_key, cmp_key): - d = {} - for item in seq: - key = group_key(item) - if key in d: - v1 = cmp_key(item) - v2 = cmp_key(d[key]) - if v2 > v1: - d[key] = item - else: - d[key] = item - return list(d.values()) - - -def _best_rules_from_group(rules): - rules = _best_from_group(rules, lambda r: r, lambda r: -len(r.expansion)) - rules.sort(key=lambda r: len(r.expansion)) - return rules - - -def _match(term, token): - if isinstance(token, Tree): - name, _args = parse_rulename(term.name) - return token.data == name - elif isinstance(token, Token): - return term == Terminal(token.type) - assert False, (term, token) - - -def make_recons_rule(origin, expansion, old_expansion): - return Rule(origin, expansion, alias=_MakeTreeMatch(origin.name, old_expansion)) - - -def make_recons_rule_to_term(origin, term): - return make_recons_rule(origin, [Terminal(term.name)], [term]) - - -def parse_rulename(s): - "Parse rule names that may contain a template syntax (like rule{a, b, ...})" - name, args_str = re.match(r'(\w+)(?:{(.+)})?', s).groups() - args = args_str and [a.strip() for a in args_str.split(',')] - return name, args - - - -class ChildrenLexer: - def __init__(self, children): - self.children = children - - def lex(self, parser_state): - return self.children - -class TreeMatcher: - """Match the elements of a tree node, based on an ontology - provided by a Lark grammar. - - Supports templates and inlined rules (`rule{a, b,..}` and `_rule`) - - Initialize with an instance of Lark. - """ - - def __init__(self, parser): - # XXX TODO calling compile twice returns different results! - assert not parser.options.maybe_placeholders - # XXX TODO: we just ignore the potential existence of a postlexer - self.tokens, rules, _extra = parser.grammar.compile(parser.options.start, set()) - - self.rules_for_root = defaultdict(list) - - self.rules = list(self._build_recons_rules(rules)) - self.rules.reverse() - - # Choose the best rule from each group of {rule => [rule.alias]}, since we only really need one derivation. 
- self.rules = _best_rules_from_group(self.rules) - - self.parser = parser - self._parser_cache = {} - - def _build_recons_rules(self, rules): - "Convert tree-parsing/construction rules to tree-matching rules" - expand1s = {r.origin for r in rules if r.options.expand1} - - aliases = defaultdict(list) - for r in rules: - if r.alias: - aliases[r.origin].append(r.alias) - - rule_names = {r.origin for r in rules} - nonterminals = {sym for sym in rule_names - if sym.name.startswith('_') or sym in expand1s or sym in aliases} - - seen = set() - for r in rules: - recons_exp = [sym if sym in nonterminals else Terminal(sym.name) - for sym in r.expansion if not is_discarded_terminal(sym)] - - # Skip self-recursive constructs - if recons_exp == [r.origin] and r.alias is None: - continue - - sym = NonTerminal(r.alias) if r.alias else r.origin - rule = make_recons_rule(sym, recons_exp, r.expansion) - - if sym in expand1s and len(recons_exp) != 1: - self.rules_for_root[sym.name].append(rule) - - if sym.name not in seen: - yield make_recons_rule_to_term(sym, sym) - seen.add(sym.name) - else: - if sym.name.startswith('_') or sym in expand1s: - yield rule - else: - self.rules_for_root[sym.name].append(rule) - - for origin, rule_aliases in aliases.items(): - for alias in rule_aliases: - yield make_recons_rule_to_term(origin, NonTerminal(alias)) - yield make_recons_rule_to_term(origin, origin) - - def match_tree(self, tree, rulename): - """Match the elements of `tree` to the symbols of rule `rulename`. - - Parameters: - tree (Tree): the tree node to match - rulename (str): The expected full rule name (including template args) - - Returns: - Tree: an unreduced tree that matches `rulename` - - Raises: - UnexpectedToken: If no match was found. - - Note: - It's the callers' responsibility match the tree recursively. - """ - if rulename: - # validate - name, _args = parse_rulename(rulename) - assert tree.data == name - else: - rulename = tree.data - - # TODO: ambiguity? - try: - parser = self._parser_cache[rulename] - except KeyError: - rules = self.rules + _best_rules_from_group(self.rules_for_root[rulename]) - - # TODO pass callbacks through dict, instead of alias? - callbacks = {rule: rule.alias for rule in rules} - conf = ParserConf(rules, callbacks, [rulename]) - parser = earley.Parser(self.parser.lexer_conf, conf, _match, resolve_ambiguity=True) - self._parser_cache[rulename] = parser - - # find a full derivation - unreduced_tree = parser.parse(ChildrenLexer(tree.children), rulename) - assert unreduced_tree.data == rulename - return unreduced_tree diff --git a/lark/tree_templates.py b/lark/tree_templates.py deleted file mode 100644 index bc067c5..0000000 --- a/lark/tree_templates.py +++ /dev/null @@ -1,180 +0,0 @@ -"""This module defines utilities for matching and translation tree templates. - -A tree templates is a tree that contains nodes that are template variables. - -""" - -from typing import Union, Optional, Mapping, Dict, Tuple, Iterator - -from lark import Tree, Transformer -from lark.exceptions import MissingVariableError - -Branch = Union[Tree[str], str] -TreeOrCode = Union[Tree[str], str] -MatchResult = Dict[str, Tree] -_TEMPLATE_MARKER = '$' - - -class TemplateConf: - """Template Configuration - - Allows customization for different uses of Template - - parse() must return a Tree instance. - """ - - def __init__(self, parse=None): - self._parse = parse - - def test_var(self, var: Union[Tree[str], str]) -> Optional[str]: - """Given a tree node, if it is a template variable return its name. 
Otherwise, return None. - - This method may be overridden for customization - - Parameters: - var: Tree | str - The tree node to test - - """ - if isinstance(var, str): - return _get_template_name(var) - - if ( - isinstance(var, Tree) - and var.data == "var" - and len(var.children) > 0 - and isinstance(var.children[0], str) - ): - return _get_template_name(var.children[0]) - - return None - - def _get_tree(self, template: TreeOrCode) -> Tree[str]: - if isinstance(template, str): - assert self._parse - template = self._parse(template) - - if not isinstance(template, Tree): - raise TypeError("template parser must return a Tree instance") - - return template - - def __call__(self, template: Tree[str]) -> 'Template': - return Template(template, conf=self) - - def _match_tree_template(self, template: TreeOrCode, tree: Branch) -> Optional[MatchResult]: - """Returns dict of {var: match} if found a match, else None - """ - template_var = self.test_var(template) - if template_var: - if not isinstance(tree, Tree): - raise TypeError(f"Template variables can only match Tree instances. Not {tree!r}") - return {template_var: tree} - - if isinstance(template, str): - if template == tree: - return {} - return None - - assert isinstance(template, Tree) and isinstance(tree, Tree), f"template={template} tree={tree}" - - if template.data == tree.data and len(template.children) == len(tree.children): - res = {} - for t1, t2 in zip(template.children, tree.children): - matches = self._match_tree_template(t1, t2) - if matches is None: - return None - - res.update(matches) - - return res - - return None - - -class _ReplaceVars(Transformer[str, Tree[str]]): - def __init__(self, conf: TemplateConf, vars: Mapping[str, Tree[str]]) -> None: - super().__init__() - self._conf = conf - self._vars = vars - - def __default__(self, data, children, meta) -> Tree[str]: - tree = super().__default__(data, children, meta) - - var = self._conf.test_var(tree) - if var: - try: - return self._vars[var] - except KeyError: - raise MissingVariableError(f"No mapping for template variable ({var})") - return tree - - -class Template: - """Represents a tree template, tied to a specific configuration - - A tree template is a tree that contains nodes that are template variables. - Those variables will match any tree. - (future versions may support annotations on the variables, to allow more complex templates) - """ - - def __init__(self, tree: Tree[str], conf: TemplateConf = TemplateConf()): - self.conf = conf - self.tree = conf._get_tree(tree) - - def match(self, tree: TreeOrCode) -> Optional[MatchResult]: - """Match a tree template to a tree. - - A tree template without variables will only match ``tree`` if it is equal to the template. - - Parameters: - tree (Tree): The tree to match to the template - - Returns: - Optional[Dict[str, Tree]]: If match is found, returns a dictionary mapping - template variable names to their matching tree nodes. - If no match was found, returns None. - """ - tree = self.conf._get_tree(tree) - return self.conf._match_tree_template(self.tree, tree) - - def search(self, tree: TreeOrCode) -> Iterator[Tuple[Tree[str], MatchResult]]: - """Search for all occurrences of the tree template inside ``tree``. 
- """ - tree = self.conf._get_tree(tree) - for subtree in tree.iter_subtrees(): - res = self.match(subtree) - if res: - yield subtree, res - - def apply_vars(self, vars: Mapping[str, Tree[str]]) -> Tree[str]: - """Apply vars to the template tree - """ - return _ReplaceVars(self.conf, vars).transform(self.tree) - - -def translate(t1: Template, t2: Template, tree: TreeOrCode): - """Search tree and translate each occurrence of t1 into t2. - """ - tree = t1.conf._get_tree(tree) # ensure it's a tree, parse if necessary and possible - for subtree, vars in t1.search(tree): - res = t2.apply_vars(vars) - subtree.set(res.data, res.children) - return tree - - -class TemplateTranslator: - """Utility class for translating a collection of patterns - """ - - def __init__(self, translations: Mapping[Template, Template]): - assert all(isinstance(k, Template) and isinstance(v, Template) for k, v in translations.items()) - self.translations = translations - - def translate(self, tree: Tree[str]): - for k, v in self.translations.items(): - tree = translate(k, v, tree) - return tree - - -def _get_template_name(value: str) -> Optional[str]: - return value.lstrip(_TEMPLATE_MARKER) if value.startswith(_TEMPLATE_MARKER) else None diff --git a/lark/utils.py b/lark/utils.py deleted file mode 100644 index 2d33f69..0000000 --- a/lark/utils.py +++ /dev/null @@ -1,346 +0,0 @@ -import unicodedata -import os -from itertools import product -from collections import deque -from typing import Callable, Iterator, List, Optional, Tuple, Type, TypeVar, Union, Dict, Any, Sequence, Iterable, AbstractSet - -###{standalone -import sys, re -import logging - -logger: logging.Logger = logging.getLogger("lark") -logger.addHandler(logging.StreamHandler()) -# Set to highest level, since we have some warnings amongst the code -# By default, we should not output any log messages -logger.setLevel(logging.CRITICAL) - - -NO_VALUE = object() - -T = TypeVar("T") - - -def classify(seq: Iterable, key: Optional[Callable] = None, value: Optional[Callable] = None) -> Dict: - d: Dict[Any, Any] = {} - for item in seq: - k = key(item) if (key is not None) else item - v = value(item) if (value is not None) else item - try: - d[k].append(v) - except KeyError: - d[k] = [v] - return d - - -def _deserialize(data: Any, namespace: Dict[str, Any], memo: Dict) -> Any: - if isinstance(data, dict): - if '__type__' in data: # Object - class_ = namespace[data['__type__']] - return class_.deserialize(data, memo) - elif '@' in data: - return memo[data['@']] - return {key:_deserialize(value, namespace, memo) for key, value in data.items()} - elif isinstance(data, list): - return [_deserialize(value, namespace, memo) for value in data] - return data - - -_T = TypeVar("_T", bound="Serialize") - -class Serialize: - """Safe-ish serialization interface that doesn't rely on Pickle - - Attributes: - __serialize_fields__ (List[str]): Fields (aka attributes) to serialize. - __serialize_namespace__ (list): List of classes that deserialization is allowed to instantiate. - Should include all field types that aren't builtin types. 
- """ - - def memo_serialize(self, types_to_memoize: List) -> Any: - memo = SerializeMemoizer(types_to_memoize) - return self.serialize(memo), memo.serialize() - - def serialize(self, memo = None) -> Dict[str, Any]: - if memo and memo.in_types(self): - return {'@': memo.memoized.get(self)} - - fields = getattr(self, '__serialize_fields__') - res = {f: _serialize(getattr(self, f), memo) for f in fields} - res['__type__'] = type(self).__name__ - if hasattr(self, '_serialize'): - self._serialize(res, memo) - return res - - @classmethod - def deserialize(cls: Type[_T], data: Dict[str, Any], memo: Dict[int, Any]) -> _T: - namespace = getattr(cls, '__serialize_namespace__', []) - namespace = {c.__name__:c for c in namespace} - - fields = getattr(cls, '__serialize_fields__') - - if '@' in data: - return memo[data['@']] - - inst = cls.__new__(cls) - for f in fields: - try: - setattr(inst, f, _deserialize(data[f], namespace, memo)) - except KeyError as e: - raise KeyError("Cannot find key for class", cls, e) - - if hasattr(inst, '_deserialize'): - inst._deserialize() - - return inst - - -class SerializeMemoizer(Serialize): - "A version of serialize that memoizes objects to reduce space" - - __serialize_fields__ = 'memoized', - - def __init__(self, types_to_memoize: List) -> None: - self.types_to_memoize = tuple(types_to_memoize) - self.memoized = Enumerator() - - def in_types(self, value: Serialize) -> bool: - return isinstance(value, self.types_to_memoize) - - def serialize(self) -> Dict[int, Any]: # type: ignore[override] - return _serialize(self.memoized.reversed(), None) - - @classmethod - def deserialize(cls, data: Dict[int, Any], namespace: Dict[str, Any], memo: Dict[Any, Any]) -> Dict[int, Any]: # type: ignore[override] - return _deserialize(data, namespace, memo) - - -try: - import regex - _has_regex = True -except ImportError: - _has_regex = False - -if sys.version_info >= (3, 11): - import re._parser as sre_parse - import re._constants as sre_constants -else: - import sre_parse - import sre_constants - -categ_pattern = re.compile(r'\\p{[A-Za-z_]+}') - -def get_regexp_width(expr: str) -> Union[Tuple[int, int], List[int]]: - if _has_regex: - # Since `sre_parse` cannot deal with Unicode categories of the form `\p{Mn}`, we replace these with - # a simple letter, which makes no difference as we are only trying to get the possible lengths of the regex - # match here below. - regexp_final = re.sub(categ_pattern, 'A', expr) - else: - if re.search(categ_pattern, expr): - raise ImportError('`regex` module must be installed in order to use Unicode categories.', expr) - regexp_final = expr - try: - # Fixed in next version (past 0.960) of typeshed - return [int(x) for x in sre_parse.parse(regexp_final).getwidth()] - except sre_constants.error: - if not _has_regex: - raise ValueError(expr) - else: - # sre_parse does not support the new features in regex. 
To not completely fail in that case, - # we manually test for the most important info (whether the empty string is matched) - c = regex.compile(regexp_final) - # Python 3.11.7 introducded sre_parse.MAXWIDTH that is used instead of MAXREPEAT - # See lark-parser/lark#1376 and python/cpython#109859 - MAXWIDTH = getattr(sre_parse, "MAXWIDTH", sre_constants.MAXREPEAT) - if c.match('') is None: - # MAXREPEAT is a none pickable subclass of int, therefore needs to be converted to enable caching - return 1, int(MAXWIDTH) - else: - return 0, int(MAXWIDTH) - -###} - - -_ID_START = 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc' -_ID_CONTINUE = _ID_START + ('Nd', 'Nl',) - -def _test_unicode_category(s: str, categories: Sequence[str]) -> bool: - if len(s) != 1: - return all(_test_unicode_category(char, categories) for char in s) - return s == '_' or unicodedata.category(s) in categories - -def is_id_continue(s: str) -> bool: - """ - Checks if all characters in `s` are alphanumeric characters (Unicode standard, so diacritics, indian vowels, non-latin - numbers, etc. all pass). Synonymous with a Python `ID_CONTINUE` identifier. See PEP 3131 for details. - """ - return _test_unicode_category(s, _ID_CONTINUE) - -def is_id_start(s: str) -> bool: - """ - Checks if all characters in `s` are alphabetic characters (Unicode standard, so diacritics, indian vowels, non-latin - numbers, etc. all pass). Synonymous with a Python `ID_START` identifier. See PEP 3131 for details. - """ - return _test_unicode_category(s, _ID_START) - - -def dedup_list(l: Iterable[T]) -> List[T]: - """Given a list (l) will removing duplicates from the list, - preserving the original order of the list. Assumes that - the list entries are hashable.""" - return list(dict.fromkeys(l)) - - -class Enumerator(Serialize): - def __init__(self) -> None: - self.enums: Dict[Any, int] = {} - - def get(self, item) -> int: - if item not in self.enums: - self.enums[item] = len(self.enums) - return self.enums[item] - - def __len__(self): - return len(self.enums) - - def reversed(self) -> Dict[int, Any]: - r = {v: k for k, v in self.enums.items()} - assert len(r) == len(self.enums) - return r - - - -def combine_alternatives(lists): - """ - Accepts a list of alternatives, and enumerates all their possible concatenations. 
- - Examples: - >>> combine_alternatives([range(2), [4,5]]) - [[0, 4], [0, 5], [1, 4], [1, 5]] - - >>> combine_alternatives(["abc", "xy", '$']) - [['a', 'x', '$'], ['a', 'y', '$'], ['b', 'x', '$'], ['b', 'y', '$'], ['c', 'x', '$'], ['c', 'y', '$']] - - >>> combine_alternatives([]) - [[]] - """ - if not lists: - return [[]] - assert all(l for l in lists), lists - return list(product(*lists)) - -try: - import atomicwrites - _has_atomicwrites = True -except ImportError: - _has_atomicwrites = False - -class FS: - exists = staticmethod(os.path.exists) - - @staticmethod - def open(name, mode="r", **kwargs): - if _has_atomicwrites and "w" in mode: - return atomicwrites.atomic_write(name, mode=mode, overwrite=True, **kwargs) - else: - return open(name, mode, **kwargs) - - -class fzset(frozenset): - def __repr__(self): - return '{%s}' % ', '.join(map(repr, self)) - - -def classify_bool(seq: Iterable, pred: Callable) -> Any: - false_elems = [] - true_elems = [elem for elem in seq if pred(elem) or false_elems.append(elem)] # type: ignore[func-returns-value] - return true_elems, false_elems - - -def bfs(initial: Iterable, expand: Callable) -> Iterator: - open_q = deque(list(initial)) - visited = set(open_q) - while open_q: - node = open_q.popleft() - yield node - for next_node in expand(node): - if next_node not in visited: - visited.add(next_node) - open_q.append(next_node) - -def bfs_all_unique(initial, expand): - "bfs, but doesn't keep track of visited (aka seen), because there can be no repetitions" - open_q = deque(list(initial)) - while open_q: - node = open_q.popleft() - yield node - open_q += expand(node) - - -def _serialize(value: Any, memo: Optional[SerializeMemoizer]) -> Any: - if isinstance(value, Serialize): - return value.serialize(memo) - elif isinstance(value, list): - return [_serialize(elem, memo) for elem in value] - elif isinstance(value, frozenset): - return list(value) # TODO reversible? - elif isinstance(value, dict): - return {key:_serialize(elem, memo) for key, elem in value.items()} - # assert value is None or isinstance(value, (int, float, str, tuple)), value - return value - - - - -def small_factors(n: int, max_factor: int) -> List[Tuple[int, int]]: - """ - Splits n up into smaller factors and summands <= max_factor. - Returns a list of [(a, b), ...] - so that the following code returns n: - - n = 1 - for a, b in values: - n = n * a + b - - Currently, we also keep a + b <= max_factor, but that might change - """ - assert n >= 0 - assert max_factor > 2 - if n <= max_factor: - return [(n, 0)] - - for a in range(max_factor, 1, -1): - r, b = divmod(n, a) - if a + b <= max_factor: - return small_factors(r, max_factor) + [(a, b)] - assert False, "Failed to factorize %s" % n - - -class OrderedSet(AbstractSet[T]): - """A minimal OrderedSet implementation, using a dictionary. 
- - (relies on the dictionary being ordered) - """ - def __init__(self, items: Iterable[T] =()): - self.d = dict.fromkeys(items) - - def __contains__(self, item: Any) -> bool: - return item in self.d - - def add(self, item: T): - self.d[item] = None - - def __iter__(self) -> Iterator[T]: - return iter(self.d) - - def remove(self, item: T): - del self.d[item] - - def __bool__(self): - return bool(self.d) - - def __len__(self) -> int: - return len(self.d) - - def __repr__(self): - return f"{type(self).__name__}({', '.join(map(repr,self))})" diff --git a/lark/visitors.py b/lark/visitors.py deleted file mode 100644 index 0e051ed..0000000 --- a/lark/visitors.py +++ /dev/null @@ -1,596 +0,0 @@ -from typing import TypeVar, Tuple, List, Callable, Generic, Type, Union, Optional, Any, cast -from abc import ABC - -from .utils import combine_alternatives -from .tree import Tree, Branch -from .exceptions import VisitError, GrammarError -from .lexer import Token - -###{standalone -from functools import wraps, update_wrapper -from inspect import getmembers, getmro - -_Return_T = TypeVar('_Return_T') -_Return_V = TypeVar('_Return_V') -_Leaf_T = TypeVar('_Leaf_T') -_Leaf_U = TypeVar('_Leaf_U') -_R = TypeVar('_R') -_FUNC = Callable[..., _Return_T] -_DECORATED = Union[_FUNC, type] - -class _DiscardType: - """When the Discard value is returned from a transformer callback, - that node is discarded and won't appear in the parent. - - Note: - This feature is disabled when the transformer is provided to Lark - using the ``transformer`` keyword (aka Tree-less LALR mode). - - Example: - :: - - class T(Transformer): - def ignore_tree(self, children): - return Discard - - def IGNORE_TOKEN(self, token): - return Discard - """ - - def __repr__(self): - return "lark.visitors.Discard" - -Discard = _DiscardType() - -# Transformers - -class _Decoratable: - "Provides support for decorating methods with @v_args" - - @classmethod - def _apply_v_args(cls, visit_wrapper): - mro = getmro(cls) - assert mro[0] is cls - libmembers = {name for _cls in mro[1:] for name, _ in getmembers(_cls)} - for name, value in getmembers(cls): - - # Make sure the function isn't inherited (unless it's overwritten) - if name.startswith('_') or (name in libmembers and name not in cls.__dict__): - continue - if not callable(value): - continue - - # Skip if v_args already applied (at the function level) - if isinstance(cls.__dict__[name], _VArgsWrapper): - continue - - setattr(cls, name, _VArgsWrapper(cls.__dict__[name], visit_wrapper)) - return cls - - def __class_getitem__(cls, _): - return cls - - -class Transformer(_Decoratable, ABC, Generic[_Leaf_T, _Return_T]): - """Transformers work bottom-up (or depth-first), starting with visiting the leaves and working - their way up until ending at the root of the tree. - - For each node visited, the transformer will call the appropriate method (callbacks), according to the - node's ``data``, and use the returned value to replace the node, thereby creating a new tree structure. - - Transformers can be used to implement map & reduce patterns. Because nodes are reduced from leaf to root, - at any point the callbacks may assume the children have already been transformed (if applicable). - - If the transformer cannot find a method with the right name, it will instead call ``__default__``, which by - default creates a copy of the node. - - To discard a node, return Discard (``lark.visitors.Discard``). 
- - ``Transformer`` can do anything ``Visitor`` can do, but because it reconstructs the tree, - it is slightly less efficient. - - A transformer without methods essentially performs a non-memoized partial deepcopy. - - All these classes implement the transformer interface: - - - ``Transformer`` - Recursively transforms the tree. This is the one you probably want. - - ``Transformer_InPlace`` - Non-recursive. Changes the tree in-place instead of returning new instances - - ``Transformer_InPlaceRecursive`` - Recursive. Changes the tree in-place instead of returning new instances - - Parameters: - visit_tokens (bool, optional): Should the transformer visit tokens in addition to rules. - Setting this to ``False`` is slightly faster. Defaults to ``True``. - (For processing ignored tokens, use the ``lexer_callbacks`` options) - - """ - __visit_tokens__ = True # For backwards compatibility - - def __init__(self, visit_tokens: bool=True) -> None: - self.__visit_tokens__ = visit_tokens - - def _call_userfunc(self, tree, new_children=None): - # Assumes tree is already transformed - children = new_children if new_children is not None else tree.children - try: - f = getattr(self, tree.data) - except AttributeError: - return self.__default__(tree.data, children, tree.meta) - else: - try: - wrapper = getattr(f, 'visit_wrapper', None) - if wrapper is not None: - return f.visit_wrapper(f, tree.data, children, tree.meta) - else: - return f(children) - except GrammarError: - raise - except Exception as e: - raise VisitError(tree.data, tree, e) - - def _call_userfunc_token(self, token): - try: - f = getattr(self, token.type) - except AttributeError: - return self.__default_token__(token) - else: - try: - return f(token) - except GrammarError: - raise - except Exception as e: - raise VisitError(token.type, token, e) - - def _transform_children(self, children): - for c in children: - if isinstance(c, Tree): - res = self._transform_tree(c) - elif self.__visit_tokens__ and isinstance(c, Token): - res = self._call_userfunc_token(c) - else: - res = c - - if res is not Discard: - yield res - - def _transform_tree(self, tree): - children = list(self._transform_children(tree.children)) - return self._call_userfunc(tree, children) - - def transform(self, tree: Tree[_Leaf_T]) -> _Return_T: - "Transform the given tree, and return the final result" - res = list(self._transform_children([tree])) - if not res: - return None # type: ignore[return-value] - assert len(res) == 1 - return res[0] - - def __mul__( - self: 'Transformer[_Leaf_T, Tree[_Leaf_U]]', - other: 'Union[Transformer[_Leaf_U, _Return_V], TransformerChain[_Leaf_U, _Return_V,]]' - ) -> 'TransformerChain[_Leaf_T, _Return_V]': - """Chain two transformers together, returning a new transformer. - """ - return TransformerChain(self, other) - - def __default__(self, data, children, meta): - """Default function that is called if there is no attribute matching ``data`` - - Can be overridden. Defaults to creating a new copy of the tree node (i.e. ``return Tree(data, children, meta)``) - """ - return Tree(data, children, meta) - - def __default_token__(self, token): - """Default function that is called if there is no attribute matching ``token.type`` - - Can be overridden. Defaults to returning the token as-is. - """ - return token - - -def merge_transformers(base_transformer=None, **transformers_to_merge): - """Merge a collection of transformers into the base_transformer, each into its own 'namespace'. 
- - When called, it will collect the methods from each transformer, and assign them to base_transformer, - with their name prefixed with the given keyword, as ``prefix__methodname``. - - This function is especially useful for processing grammars that import other grammars, - thereby creating some of their rules in a 'namespace'. (i.e with a consistent name prefix). - In this case, the key for the transformer should match the name of the imported grammar. - - Parameters: - base_transformer (Transformer, optional): The transformer that all other transformers will be added to. - **transformers_to_merge: Keyword arguments, in the form of ``name_prefix = transformer``. - - Raises: - AttributeError: In case of a name collision in the merged methods - - Example: - :: - - class TBase(Transformer): - def start(self, children): - return children[0] + 'bar' - - class TImportedGrammar(Transformer): - def foo(self, children): - return "foo" - - composed_transformer = merge_transformers(TBase(), imported=TImportedGrammar()) - - t = Tree('start', [ Tree('imported__foo', []) ]) - - assert composed_transformer.transform(t) == 'foobar' - - """ - if base_transformer is None: - base_transformer = Transformer() - for prefix, transformer in transformers_to_merge.items(): - for method_name in dir(transformer): - method = getattr(transformer, method_name) - if not callable(method): - continue - if method_name.startswith("_") or method_name == "transform": - continue - prefixed_method = prefix + "__" + method_name - if hasattr(base_transformer, prefixed_method): - raise AttributeError("Cannot merge: method '%s' appears more than once" % prefixed_method) - - setattr(base_transformer, prefixed_method, method) - - return base_transformer - - -class InlineTransformer(Transformer): # XXX Deprecated - def _call_userfunc(self, tree, new_children=None): - # Assumes tree is already transformed - children = new_children if new_children is not None else tree.children - try: - f = getattr(self, tree.data) - except AttributeError: - return self.__default__(tree.data, children, tree.meta) - else: - return f(*children) - - -class TransformerChain(Generic[_Leaf_T, _Return_T]): - - transformers: 'Tuple[Union[Transformer, TransformerChain], ...]' - - def __init__(self, *transformers: 'Union[Transformer, TransformerChain]') -> None: - self.transformers = transformers - - def transform(self, tree: Tree[_Leaf_T]) -> _Return_T: - for t in self.transformers: - tree = t.transform(tree) - return cast(_Return_T, tree) - - def __mul__( - self: 'TransformerChain[_Leaf_T, Tree[_Leaf_U]]', - other: 'Union[Transformer[_Leaf_U, _Return_V], TransformerChain[_Leaf_U, _Return_V]]' - ) -> 'TransformerChain[_Leaf_T, _Return_V]': - return TransformerChain(*self.transformers + (other,)) - - -class Transformer_InPlace(Transformer[_Leaf_T, _Return_T]): - """Same as Transformer, but non-recursive, and changes the tree in-place instead of returning new instances - - Useful for huge trees. Conservative in memory. - """ - def _transform_tree(self, tree): # Cancel recursion - return self._call_userfunc(tree) - - def transform(self, tree: Tree[_Leaf_T]) -> _Return_T: - for subtree in tree.iter_subtrees(): - subtree.children = list(self._transform_children(subtree.children)) - - return self._transform_tree(tree) - - -class Transformer_NonRecursive(Transformer[_Leaf_T, _Return_T]): - """Same as Transformer but non-recursive. - - Like Transformer, it doesn't change the original tree. - - Useful for huge trees. 
- """ - - def transform(self, tree: Tree[_Leaf_T]) -> _Return_T: - # Tree to postfix - rev_postfix = [] - q: List[Branch[_Leaf_T]] = [tree] - while q: - t = q.pop() - rev_postfix.append(t) - if isinstance(t, Tree): - q += t.children - - # Postfix to tree - stack: List = [] - for x in reversed(rev_postfix): - if isinstance(x, Tree): - size = len(x.children) - if size: - args = stack[-size:] - del stack[-size:] - else: - args = [] - - res = self._call_userfunc(x, args) - if res is not Discard: - stack.append(res) - - elif self.__visit_tokens__ and isinstance(x, Token): - res = self._call_userfunc_token(x) - if res is not Discard: - stack.append(res) - else: - stack.append(x) - - result, = stack # We should have only one tree remaining - # There are no guarantees on the type of the value produced by calling a user func for a - # child will produce. This means type system can't statically know that the final result is - # _Return_T. As a result a cast is required. - return cast(_Return_T, result) - - -class Transformer_InPlaceRecursive(Transformer): - "Same as Transformer, recursive, but changes the tree in-place instead of returning new instances" - def _transform_tree(self, tree): - tree.children = list(self._transform_children(tree.children)) - return self._call_userfunc(tree) - - -# Visitors - -class VisitorBase: - def _call_userfunc(self, tree): - return getattr(self, tree.data, self.__default__)(tree) - - def __default__(self, tree): - """Default function that is called if there is no attribute matching ``tree.data`` - - Can be overridden. Defaults to doing nothing. - """ - return tree - - def __class_getitem__(cls, _): - return cls - - -class Visitor(VisitorBase, ABC, Generic[_Leaf_T]): - """Tree visitor, non-recursive (can handle huge trees). - - Visiting a node calls its methods (provided by the user via inheritance) according to ``tree.data`` - """ - - def visit(self, tree: Tree[_Leaf_T]) -> Tree[_Leaf_T]: - "Visits the tree, starting with the leaves and finally the root (bottom-up)" - for subtree in tree.iter_subtrees(): - self._call_userfunc(subtree) - return tree - - def visit_topdown(self, tree: Tree[_Leaf_T]) -> Tree[_Leaf_T]: - "Visit the tree, starting at the root, and ending at the leaves (top-down)" - for subtree in tree.iter_subtrees_topdown(): - self._call_userfunc(subtree) - return tree - - -class Visitor_Recursive(VisitorBase, Generic[_Leaf_T]): - """Bottom-up visitor, recursive. - - Visiting a node calls its methods (provided by the user via inheritance) according to ``tree.data`` - - Slightly faster than the non-recursive version. - """ - - def visit(self, tree: Tree[_Leaf_T]) -> Tree[_Leaf_T]: - "Visits the tree, starting with the leaves and finally the root (bottom-up)" - for child in tree.children: - if isinstance(child, Tree): - self.visit(child) - - self._call_userfunc(tree) - return tree - - def visit_topdown(self,tree: Tree[_Leaf_T]) -> Tree[_Leaf_T]: - "Visit the tree, starting at the root, and ending at the leaves (top-down)" - self._call_userfunc(tree) - - for child in tree.children: - if isinstance(child, Tree): - self.visit_topdown(child) - - return tree - - -class Interpreter(_Decoratable, ABC, Generic[_Leaf_T, _Return_T]): - """Interpreter walks the tree starting at the root. - - Visits the tree, starting with the root and finally the leaves (top-down) - - For each tree node, it calls its methods (provided by user via inheritance) according to ``tree.data``. 
- - Unlike ``Transformer`` and ``Visitor``, the Interpreter doesn't automatically visit its sub-branches. - The user has to explicitly call ``visit``, ``visit_children``, or use the ``@visit_children_decor``. - This allows the user to implement branching and loops. - """ - - def visit(self, tree: Tree[_Leaf_T]) -> _Return_T: - # There are no guarantees on the type of the value produced by calling a user func for a - # child will produce. So only annotate the public method and use an internal method when - # visiting child trees. - return self._visit_tree(tree) - - def _visit_tree(self, tree: Tree[_Leaf_T]): - f = getattr(self, tree.data) - wrapper = getattr(f, 'visit_wrapper', None) - if wrapper is not None: - return f.visit_wrapper(f, tree.data, tree.children, tree.meta) - else: - return f(tree) - - def visit_children(self, tree: Tree[_Leaf_T]) -> List: - return [self._visit_tree(child) if isinstance(child, Tree) else child - for child in tree.children] - - def __getattr__(self, name): - return self.__default__ - - def __default__(self, tree): - return self.visit_children(tree) - - -_InterMethod = Callable[[Type[Interpreter], _Return_T], _R] - -def visit_children_decor(func: _InterMethod) -> _InterMethod: - "See Interpreter" - @wraps(func) - def inner(cls, tree): - values = cls.visit_children(tree) - return func(cls, values) - return inner - -# Decorators - -def _apply_v_args(obj, visit_wrapper): - try: - _apply = obj._apply_v_args - except AttributeError: - return _VArgsWrapper(obj, visit_wrapper) - else: - return _apply(visit_wrapper) - - -class _VArgsWrapper: - """ - A wrapper around a Callable. It delegates `__call__` to the Callable. - If the Callable has a `__get__`, that is also delegate and the resulting function is wrapped. - Otherwise, we use the original function mirroring the behaviour without a __get__. - We also have the visit_wrapper attribute to be used by Transformers. - """ - base_func: Callable - - def __init__(self, func: Callable, visit_wrapper: Callable[[Callable, str, list, Any], Any]): - if isinstance(func, _VArgsWrapper): - func = func.base_func - self.base_func = func - self.visit_wrapper = visit_wrapper - update_wrapper(self, func) - - def __call__(self, *args, **kwargs): - return self.base_func(*args, **kwargs) - - def __get__(self, instance, owner=None): - try: - # Use the __get__ attribute of the type instead of the instance - # to fully mirror the behavior of getattr - g = type(self.base_func).__get__ - except AttributeError: - return self - else: - return _VArgsWrapper(g(self.base_func, instance, owner), self.visit_wrapper) - - def __set_name__(self, owner, name): - try: - f = type(self.base_func).__set_name__ - except AttributeError: - return - else: - f(self.base_func, owner, name) - - -def _vargs_inline(f, _data, children, _meta): - return f(*children) -def _vargs_meta_inline(f, _data, children, meta): - return f(meta, *children) -def _vargs_meta(f, _data, children, meta): - return f(meta, children) -def _vargs_tree(f, data, children, meta): - return f(Tree(data, children, meta)) - - -def v_args(inline: bool = False, meta: bool = False, tree: bool = False, wrapper: Optional[Callable] = None) -> Callable[[_DECORATED], _DECORATED]: - """A convenience decorator factory for modifying the behavior of user-supplied callback methods of ``Transformer`` classes. - - By default, transformer callback methods accept one argument - a list of the node's children. - - ``v_args`` can modify this behavior. 
When used on a ``Transformer`` class definition, it applies to - all the callback methods inside it. - - ``v_args`` can be applied to a single method, or to an entire class. When applied to both, - the options given to the method take precedence. - - Parameters: - inline (bool, optional): Children are provided as ``*args`` instead of a list argument (not recommended for very long lists). - meta (bool, optional): Provides two arguments: ``meta`` and ``children`` (instead of just the latter) - tree (bool, optional): Provides the entire tree as the argument, instead of the children. - wrapper (function, optional): Provide a function to decorate all methods. - - Example: - :: - - @v_args(inline=True) - class SolveArith(Transformer): - def add(self, left, right): - return left + right - - @v_args(meta=True) - def mul(self, meta, children): - logger.info(f'mul at line {meta.line}') - left, right = children - return left * right - - - class ReverseNotation(Transformer_InPlace): - @v_args(tree=True) - def tree_node(self, tree): - tree.children = tree.children[::-1] - """ - if tree and (meta or inline): - raise ValueError("Visitor functions cannot combine 'tree' with 'meta' or 'inline'.") - - func = None - if meta: - if inline: - func = _vargs_meta_inline - else: - func = _vargs_meta - elif inline: - func = _vargs_inline - elif tree: - func = _vargs_tree - - if wrapper is not None: - if func is not None: - raise ValueError("Cannot use 'wrapper' along with 'tree', 'meta' or 'inline'.") - func = wrapper - - def _visitor_args_dec(obj): - return _apply_v_args(obj, func) - return _visitor_args_dec - - -###} - - -# --- Visitor Utilities --- - -class CollapseAmbiguities(Transformer): - """ - Transforms a tree that contains any number of _ambig nodes into a list of trees, - each one containing an unambiguous tree. - - The length of the resulting list is the product of the length of all _ambig nodes. - - Warning: This may quickly explode for highly ambiguous trees. - - """ - def _ambig(self, options): - return sum(options, []) - - def __default__(self, data, children_lists, meta): - return [Tree(data, children, meta) for children in combine_alternatives(children_lists)] - - def __default_token__(self, t): - return [t] diff --git a/lib/List.pf b/lib/List.pf index f82e3ab..ab6dbe4 100644 --- a/lib/List.pf +++ b/lib/List.pf @@ -713,6 +713,23 @@ proof } end +theorem drop_append: all E:type. all xs:List. all ys:List. + drop(length(xs), xs ++ ys) = ys +proof + arbitrary E:type + induction List + case empty { + arbitrary ys:List + definition {length, operator++, drop} + } + case node(x, xs') suppose IH { + arbitrary ys:List + definition {length, operator++, operator+, drop} + and rewrite zero_add[length(xs')] + | IH[ys] + } +end + theorem nth_drop: all T:type. all n:Nat. all xs:List, i:Nat, d:T. nth(drop(n, xs), d)(i) = nth(xs, d)(n + i) proof diff --git a/lib/Nat.pf b/lib/Nat.pf index 4b3bef7..1f19725 100644 --- a/lib/Nat.pf +++ b/lib/Nat.pf @@ -591,6 +591,12 @@ proof ls, sl end +theorem not_less_zero: + all x:Nat. not (x < 0) +proof + definition {operator<, operator≤} +end + theorem less_equal_implies_less_or_equal: all x:Nat. all y:Nat. 
  if x ≤ y then x < y or x = y
diff --git a/rec_desc_parser.py b/rec_desc_parser.py
index fa21df0..95f230a 100644
--- a/rec_desc_parser.py
+++ b/rec_desc_parser.py
@@ -682,7 +682,12 @@ def parse_proof_hi(token_list, i):
       error(meta_from_tokens(token_list[i], token_list[i]),
             'expected the keyword `by` after formula of `have`, ' \
             + 'not\n\t' + token_list[i].value)
-    body,i = parse_proof(token_list, i)
+    try:
+      body,i = parse_proof(token_list, i)
+    except Exception as e:
+      raise Exception(str(e) + '\nwhile parsing: ' \
+                      + '"have" label ":" formula "by" proof proof\n' \
+                      + '                                    ^^^^^')
     return PLet(meta_from_tokens(token, token_list[i-1]),
                 label, proved, because, body), i
 
@@ -691,8 +696,13 @@ def parse_proof_hi(token_list, i):
       typ, i = parse_type(token_list, i)
       cases = []
       while token_list[i].type == 'CASE':
-        c, i = parse_induction_case(token_list, i)
-        cases.append(c)
+        try:
+          c, i = parse_induction_case(token_list, i)
+        except Exception as e:
+          raise Exception(str(e) + '\nwhile parsing: ' \
+                          + '\t"case" pattern "{" proof "}"\n'\
+                          + '\t        ^^^^^')
+        cases.append(c)
       return (Induction(meta_from_tokens(token, token_list[i-1]),
                         typ, cases), i)
     elif token.type == 'INJECTIVE':
@@ -784,7 +794,12 @@ def parse_proof_hi(token_list, i):
       i = i + 1
       proof, i = parse_proof(token_list, i)
       meta = meta_from_tokens(token, token_list[i-1])
-      body, i = parse_proof(token_list, i)
+      try:
+        body, i = parse_proof(token_list, i)
+      except Exception as e:
+        raise Exception(str(e) + '\nwhile parsing: ' \
+                        + '"suffices" formula "by" proof proof\n' \
+                        + '                              ^^^^^')
       return (Suffices(meta, formula, proof, body), i)
 
     elif token.type == 'SUPPOSE' or token.type == 'ASSUME':
@@ -852,7 +867,7 @@ def parse_proof_hi(token_list, i):
         name, i = parse_identifier(token_list, i)
       except Exception as e:
         error(meta_from_tokens(token, token_list[i]),
-              'expected a proof\n' + str(e))
+              'expected a proof, not `' + token_list[i].value + '`')
       return (PVar(meta_from_tokens(token, token), name), i)
 
 def parse_proof_list(token_list, i):
@@ -873,8 +888,7 @@ def parse_case(token_list, i):
     error(meta_from_tokens(token_list[start],token_list[i]),
           'expected a `{` after assumption of `case`, not\n\t' \
           + token_list[i].value \
-          + '\nwhile parsing\n' \
-          + '\tcase ::= "case" identifier ":" formula "{" proof "}"')
+          + '\nwhile parsing:\n\t"case" label ":" formula "{" proof "}"')
     i = i + 1
     body, i = parse_proof(token_list, i)
     if token_list[i].type != 'RBRACE':