diff --git a/lark/lark.py b/lark/lark.py
index 53fa05ae..7883816a 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -56,6 +56,7 @@ class LarkOptions(Serialize):
     propagate_positions: Union[bool, str]
     maybe_placeholders: bool
     cache: Union[bool, str]
+    cache_grammar: bool
     regex: bool
     g_regex_flags: int
     keep_all_tokens: bool
@@ -99,6 +100,10 @@ class LarkOptions(Serialize):
             - When ``False``, does nothing (default)
             - When ``True``, caches to a temporary file in the local directory
             - When given a string, caches to the path pointed by the string
+    cache_grammar
+            For use with the ``cache`` option. When ``True``, the unanalyzed grammar is also included in the cache.
+            Useful for classes that require ``Lark.grammar`` to be present (e.g. ``Reconstructor``).
+            (Default: ``False``)
     regex
             When True, uses the ``regex`` module instead of the stdlib ``re``.
     g_regex_flags
@@ -165,6 +170,7 @@ class LarkOptions(Serialize):
         'keep_all_tokens': False,
         'tree_class': None,
         'cache': False,
+        'cache_grammar': False,
         'postlex': None,
         'parser': 'earley',
         'lexer': 'auto',
@@ -211,6 +217,9 @@ def __init__(self, options_dict: Dict[str, Any]) -> None:
             raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm. '
                                      'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)')
 
+        if self.cache_grammar and not self.cache:
+            raise ConfigurationError('cache_grammar cannot be set when cache is disabled')
+
         if o:
             raise ConfigurationError("Unknown options: %s" % o.keys())
 
@@ -264,10 +273,16 @@ class Lark(Serialize):
     parser: 'ParsingFrontend'
     terminals: Collection[TerminalDef]
 
+    __serialize_fields__ = ['parser', 'rules', 'options']
+
     def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None:
         self.options = LarkOptions(options)
         re_module: types.ModuleType
 
+        # Update which fields are serialized
+        if self.options.cache_grammar:
+            self.__serialize_fields__ = self.__serialize_fields__ + ['grammar']
+
         # Set regex or re module
         use_regex = self.options.regex
         if use_regex:
@@ -327,7 +342,9 @@
                     # specific reason - we just want a username.
                     username = "unknown"
 
-                cache_fn = tempfile.gettempdir() + "/.lark_cache_%s_%s_%s_%s.tmp" % (username, cache_sha256, *sys.version_info[:2])
+
+                cache_fn = tempfile.gettempdir() + "/.lark_%s_%s_%s_%s_%s.tmp" % (
+                    "cache_grammar" if self.options.cache_grammar else "cache", username, cache_sha256, *sys.version_info[:2])
 
             old_options = self.options
             try:
@@ -397,7 +414,7 @@ def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None:
                 raise ConfigurationError("invalid ambiguity option: %r. Must be one of %r" % (self.options.ambiguity, _VALID_AMBIGUITY_OPTIONS))
 
         if self.options.parser is None:
-            terminals_to_keep = '*'
+            terminals_to_keep = '*'  # For lexer-only mode, keep all terminals
         elif self.options.postlex is not None:
             terminals_to_keep = set(self.options.postlex.always_accept)
         else:
@@ -454,8 +471,6 @@
     if __doc__:
         __doc__ += "\n\n" + LarkOptions.OPTIONS_DOC
 
-    __serialize_fields__ = 'parser', 'rules', 'options'
-
     def _build_lexer(self, dont_ignore: bool=False) -> BasicLexer:
         lexer_conf = self.lexer_conf
         if dont_ignore:
@@ -531,6 +546,8 @@ def _load(self: _T, f: Any, **kwargs) -> _T:
         assert memo_json
         memo = SerializeMemoizer.deserialize(memo_json, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
+        if 'grammar' in data:
+            self.grammar = Grammar.deserialize(data['grammar'], memo)
         options = dict(data['options'])
         if (set(kwargs) - _LOAD_ALLOWED_OPTIONS) & set(LarkOptions._defaults):
             raise ConfigurationError("Some options are not allowed when loading a Parser: {}"
                                      .format(set(kwargs) - _LOAD_ALLOWED_OPTIONS))
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 362a845d..a2968e92 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -11,7 +11,7 @@
 from contextlib import suppress
 from typing import List, Tuple, Union, Callable, Dict, Optional, Sequence, Generator
 
-from .utils import bfs, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors, OrderedSet
+from .utils import bfs, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors, OrderedSet, Serialize
 from .lexer import Token, TerminalDef, PatternStr, PatternRE, Pattern
 
 from .parse_tree_builder import ParseTreeBuilder
@@ -676,7 +676,7 @@ def nr_deepcopy_tree(t):
     return Transformer_NonRecursive(False).transform(t)
 
 
-class Grammar:
+class Grammar(Serialize):
     term_defs: List[Tuple[str, Tuple[Tree, int]]]
     rule_defs: List[Tuple[str, Tuple[str, ...], Tree, RuleOptions]]
 
@@ -687,6 +687,8 @@ def __init__(self, rule_defs: List[Tuple[str, Tuple[str, ...], Tree, RuleOptions
         self.rule_defs = rule_defs
         self.ignore = ignore
 
+    __serialize_fields__ = 'term_defs', 'rule_defs', 'ignore'
+
     def compile(self, start, terminals_to_keep) -> Tuple[List[TerminalDef], List[Rule], List[str]]:
         # We change the trees in-place (to support huge grammars)
         # So deepcopy allows calling compile more than once.
@@ -977,7 +979,6 @@ def _parse_grammar(text, name, start='start'):
     return PrepareGrammar().transform(tree)
 
 
-
 def _error_repr(error):
     if isinstance(error, UnexpectedToken):
         error2 = _translate_parser_exception(_get_parser().parse, error)
diff --git a/lark/tree_matcher.py b/lark/tree_matcher.py
index 0f42652e..a7ae6b24 100644
--- a/lark/tree_matcher.py
+++ b/lark/tree_matcher.py
@@ -1,10 +1,12 @@
 """Tree matcher based on Lark grammar"""
 
 import re
+from typing import List, Dict
 from collections import defaultdict
 
-from . import Tree, Token
+from . import Tree, Token, Lark
 from .common import ParserConf
+from .exceptions import ConfigurationError
 from .parsers import earley
 from .grammar import Rule, Terminal, NonTerminal
 
@@ -39,7 +41,7 @@ def _best_from_group(seq, group_key, cmp_key):
     return list(d.values())
 
 
-def _best_rules_from_group(rules):
+def _best_rules_from_group(rules: List[Rule]) -> List[Rule]:
     rules = _best_from_group(rules, lambda r: r, lambda r: -len(r.expansion))
     rules.sort(key=lambda r: len(r.expansion))
     return rules
@@ -85,12 +87,23 @@ class TreeMatcher:
 
     Initialize with an instance of Lark.
     """
+    rules_for_root: Dict[str, List[Rule]]
+    rules: List[Rule]
+    parser: Lark
 
-    def __init__(self, parser):
+    def __init__(self, parser: Lark):
         # XXX TODO calling compile twice returns different results!
         assert not parser.options.maybe_placeholders
-        # XXX TODO: we just ignore the potential existence of a postlexer
-        self.tokens, rules, _extra = parser.grammar.compile(parser.options.start, set())
+
+        if parser.options.postlex and parser.options.postlex.always_accept:
+            # If the postlexer's always_accept is used, we must recompile the grammar with an empty terminals-to-keep set
+            if not hasattr(parser, 'grammar'):
+                raise ConfigurationError('Source grammar not available from cached parser, use cache_grammar=True'
+                                         if parser.options.cache else "Source grammar not available!")
+            self.tokens, rules, _extra = parser.grammar.compile(parser.options.start, set())
+        else:
+            self.tokens = list(parser.terminals)
+            rules = list(parser.rules)
 
         self.rules_for_root = defaultdict(list)
 
@@ -101,9 +114,9 @@
         self.rules = _best_rules_from_group(self.rules)
 
         self.parser = parser
-        self._parser_cache = {}
+        self._parser_cache: Dict[str, earley.Parser] = {}
 
-    def _build_recons_rules(self, rules):
+    def _build_recons_rules(self, rules: List[Rule]):
         "Convert tree-parsing/construction rules to tree-matching rules"
         expand1s = {r.origin for r in rules if r.options.expand1}
 
@@ -145,7 +158,7 @@
             yield make_recons_rule_to_term(origin, NonTerminal(alias))
         yield make_recons_rule_to_term(origin, origin)
 
-    def match_tree(self, tree, rulename):
+    def match_tree(self, tree: Tree, rulename: str) -> Tree:
         """Match the elements of `tree` to the symbols of rule `rulename`.
 
         Parameters:
@@ -159,7 +172,7 @@
             UnexpectedToken: If no match was found.
 
         Note:
-            It's the callers' responsibility match the tree recursively.
+            It's the caller's responsibility to match the tree recursively.
         """
         if rulename:
             # validate
@@ -176,11 +189,11 @@
             # TODO pass callbacks through dict, instead of alias?
             callbacks = {rule: rule.alias for rule in rules}
 
-            conf = ParserConf(rules, callbacks, [rulename])
+            conf = ParserConf(rules, callbacks, [rulename])  # type: ignore[arg-type]
             parser = earley.Parser(self.parser.lexer_conf, conf, _match, resolve_ambiguity=True)
             self._parser_cache[rulename] = parser
 
         # find a full derivation
-        unreduced_tree = parser.parse(ChildrenLexer(tree.children), rulename)
+        unreduced_tree: Tree = parser.parse(ChildrenLexer(tree.children), rulename)
         assert unreduced_tree.data == rulename
         return unreduced_tree
diff --git a/tests/test_cache.py b/tests/test_cache.py
index e10a17b6..49d68f9c 100644
--- a/tests/test_cache.py
+++ b/tests/test_cache.py
@@ -4,8 +4,11 @@
 from unittest import TestCase, main, skipIf
 
 from lark import Lark, Tree, Transformer, UnexpectedInput
+from lark.exceptions import ConfigurationError
 from lark.lexer import Lexer, Token
 import lark.lark as lark_module
+from lark.reconstruct import Reconstructor
+from . import test_reconstructor
 
 from io import BytesIO
 
@@ -186,5 +189,45 @@ def test_error_message(self):
             parser2.parse(text)
         self.assertEqual(str(cm1.exception), str(cm2.exception))
 
+    def test_cache_grammar(self):
+        with self.assertRaises(ConfigurationError):
+            Lark(self.g, parser='lalr', cache=False, cache_grammar=True)
+
+        assert len(self.mock_fs.files) == 0
+        parser1 = Lark(self.g, parser='lalr', cache=True, cache_grammar=True)
+        parser2 = Lark(self.g, parser='lalr', cache=True, cache_grammar=True)
+        assert parser2.parse('a') == Tree('start', [])
+
+        # Assert that the cache file was created, and uses a different name than the regular cache
+        assert len(self.mock_fs.files) == 1
+        assert 'cache_grammar' in list(self.mock_fs.files)[0]
+
+        # Assert the cached grammar is equal to the original grammar
+        assert parser1.grammar is not parser2.grammar
+        assert parser1.grammar.term_defs == parser2.grammar.term_defs
+        # Using repr() because RuleOptions doesn't implement __eq__
+        assert repr(parser1.grammar.rule_defs) == repr(parser2.grammar.rule_defs)
+
+    def test_reconstruct(self):
+        # Test that Reconstructor works with cached parsers (using cache_grammar)
+        grammar = """
+        start: (rule | NL)*
+        rule: WORD ":" NUMBER
+        NL: /(\\r?\\n)+\\s*/
+        """ + test_reconstructor.common
+
+        code = """
+        Elephants: 12
+        """
+
+        _parser = Lark(grammar, parser='lalr', maybe_placeholders=False, cache=True, cache_grammar=True)
+        assert len(self.mock_fs.files) == 1
+        parser = Lark(grammar, parser='lalr', maybe_placeholders=False, cache=True, cache_grammar=True)
+        assert _parser.grammar is not parser.grammar
+        tree = parser.parse(code)
+        new = Reconstructor(parser).reconstruct(tree)
+        self.assertEqual(test_reconstructor._remove_ws(code), test_reconstructor._remove_ws(new))
+
+
 if __name__ == '__main__':
     main()
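
For reference, a minimal usage sketch of the new option (not part of the patch). The grammar and input are adapted from the test above; the exact whitespace of the reconstructed output is an assumption, since the reconstructor does not preserve ignored tokens:

    from lark import Lark
    from lark.reconstruct import Reconstructor

    grammar = r"""
    start: (rule | NL)*
    rule: WORD ":" NUMBER
    NL: /(\r?\n)+\s*/

    %import common (WS_INLINE, NUMBER, WORD)
    %ignore WS_INLINE
    """

    # cache_grammar=True stores the unanalyzed grammar alongside the analyzed
    # parser data, so Lark.grammar is still available when a later run loads
    # the parser from the cache file instead of rebuilding it.
    parser = Lark(grammar, parser='lalr', maybe_placeholders=False,
                  cache=True, cache_grammar=True)

    tree = parser.parse("Elephants: 12\n")
    # Reconstructor works on the cache-loaded parser; the output matches the
    # input up to ignored whitespace.
    print(Reconstructor(parser).reconstruct(tree))

Storing the grammar is opt-in, and cache files written with cache_grammar=True use a distinct file name, presumably so that regular caches stay small and the two formats are never mixed.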