Merged
25 changes: 21 additions & 4 deletions lark/lark.py
@@ -56,6 +56,7 @@
propagate_positions: Union[bool, str]
maybe_placeholders: bool
cache: Union[bool, str]
cache_grammar: bool
regex: bool
g_regex_flags: int
keep_all_tokens: bool
@@ -99,6 +100,10 @@
- When ``False``, does nothing (default)
- When ``True``, caches to a temporary file in the local directory
- When given a string, caches to the path pointed by the string
cache_grammar
For use with the ``cache`` option. When ``True``, the unanalyzed grammar is also included in the cache.
Useful for classes that require ``Lark.grammar`` to be present (e.g. Reconstructor).
(default= ``False``)
regex
When True, uses the ``regex`` module instead of the stdlib ``re``.
g_regex_flags
@@ -165,6 +170,7 @@
'keep_all_tokens': False,
'tree_class': None,
'cache': False,
'cache_grammar': False,
'postlex': None,
'parser': 'earley',
'lexer': 'auto',
@@ -211,6 +217,9 @@
raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm. '
'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)')

if self.cache_grammar and not self.cache:
raise ConfigurationError('cache_grammar cannot be set when cache is disabled')

if o:
raise ConfigurationError("Unknown options: %s" % o.keys())

@@ -264,10 +273,16 @@
parser: 'ParsingFrontend'
terminals: Collection[TerminalDef]

__serialize_fields__ = ['parser', 'rules', 'options']

def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None:
self.options = LarkOptions(options)
re_module: types.ModuleType

# Update which fields are serialized
if self.options.cache_grammar:
self.__serialize_fields__ = self.__serialize_fields__ + ['grammar']

# Set regex or re module
use_regex = self.options.regex
if use_regex:
@@ -327,7 +342,9 @@
# specific reason - we just want a username.
username = "unknown"

cache_fn = tempfile.gettempdir() + "/.lark_cache_%s_%s_%s_%s.tmp" % (username, cache_sha256, *sys.version_info[:2])

cache_fn = tempfile.gettempdir() + "/.lark_%s_%s_%s_%s_%s.tmp" % (
"cache_grammar" if self.options.cache_grammar else "cache", username, cache_sha256, *sys.version_info[:2])

old_options = self.options
try:
@@ -397,7 +414,7 @@
raise ConfigurationError("invalid ambiguity option: %r. Must be one of %r" % (self.options.ambiguity, _VALID_AMBIGUITY_OPTIONS))

if self.options.parser is None:
terminals_to_keep = '*'
terminals_to_keep = '*' # For lexer-only mode, keep all terminals

Codecov / codecov/patch: Added line #L417 in lark/lark.py was not covered by tests.
elif self.options.postlex is not None:
terminals_to_keep = set(self.options.postlex.always_accept)
else:
@@ -454,8 +471,6 @@
if __doc__:
__doc__ += "\n\n" + LarkOptions.OPTIONS_DOC

__serialize_fields__ = 'parser', 'rules', 'options'

def _build_lexer(self, dont_ignore: bool=False) -> BasicLexer:
lexer_conf = self.lexer_conf
if dont_ignore:
@@ -531,6 +546,8 @@

assert memo_json
memo = SerializeMemoizer.deserialize(memo_json, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
if 'grammar' in data:
self.grammar = Grammar.deserialize(data['grammar'], memo)
options = dict(data['options'])
if (set(kwargs) - _LOAD_ALLOWED_OPTIONS) & set(LarkOptions._defaults):
raise ConfigurationError("Some options are not allowed when loading a Parser: {}"
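Taken together, the lark/lark.py changes let a parser restored from cache keep its source grammar. A minimal usage sketch of the new option (the grammar text and input below are illustrative, not taken from this PR):

from lark import Lark
from lark.reconstruct import Reconstructor

grammar = r'''
start: WORD ("," WORD)*
WORD: /\w+/
%ignore " "
'''

# First construction compiles the grammar and writes the cache file
# (whose name now contains "cache_grammar" instead of "cache");
# the unanalyzed grammar is stored alongside the parser tables.
Lark(grammar, parser='lalr', maybe_placeholders=False,
     cache=True, cache_grammar=True)

# Second construction is a cache hit, but parser.grammar is restored,
# so grammar-dependent tools such as Reconstructor keep working.
parser = Lark(grammar, parser='lalr', maybe_placeholders=False,
              cache=True, cache_grammar=True)

tree = parser.parse("hello, world")
print(Reconstructor(parser).reconstruct(tree))  # ignored whitespace is dropped: hello,world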
7 changes: 4 additions & 3 deletions lark/load_grammar.py
@@ -11,7 +11,7 @@
from contextlib import suppress
from typing import List, Tuple, Union, Callable, Dict, Optional, Sequence, Generator

from .utils import bfs, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors, OrderedSet
from .utils import bfs, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors, OrderedSet, Serialize
from .lexer import Token, TerminalDef, PatternStr, PatternRE, Pattern

from .parse_tree_builder import ParseTreeBuilder
@@ -676,7 +676,7 @@ def nr_deepcopy_tree(t):
return Transformer_NonRecursive(False).transform(t)


class Grammar:
class Grammar(Serialize):

term_defs: List[Tuple[str, Tuple[Tree, int]]]
rule_defs: List[Tuple[str, Tuple[str, ...], Tree, RuleOptions]]
@@ -687,6 +687,8 @@ def __init__(self, rule_defs: List[Tuple[str, Tuple[str, ...], Tree, RuleOptions
self.rule_defs = rule_defs
self.ignore = ignore

__serialize_fields__ = 'term_defs', 'rule_defs', 'ignore'
Copilot AI commented on Jul 6, 2025:

[nitpick] Use a consistent container type for __serialize_fields__ (e.g., a list) to match the pattern in other classes and avoid confusion.

Suggested change:
-__serialize_fields__ = 'term_defs', 'rule_defs', 'ignore'
+__serialize_fields__ = ['term_defs', 'rule_defs', 'ignore']

def compile(self, start, terminals_to_keep) -> Tuple[List[TerminalDef], List[Rule], List[str]]:
# We change the trees in-place (to support huge grammars)
# So deepcopy allows calling compile more than once.
@@ -977,7 +979,6 @@ def _parse_grammar(text, name, start='start'):

return PrepareGrammar().transform(tree)


def _error_repr(error):
if isinstance(error, UnexpectedToken):
error2 = _translate_parser_exception(_get_parser().parse, error)
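For context on the change above: Serialize drives (de)serialization off the class-level __serialize_fields__ declaration, which is why Grammar only needs to subclass it and list its three fields. A simplified sketch of the pattern — an illustration, not the actual lark.utils.Serialize, which also threads a memo through nested objects:

class Serialize:
    # Subclasses declare which attribute names to round-trip.
    __serialize_fields__ = ()

    def serialize(self):
        # Collect only the declared fields into a plain dict.
        data = {f: getattr(self, f) for f in self.__serialize_fields__}
        data['__type__'] = type(self).__name__
        return data

    @classmethod
    def deserialize(cls, data):
        # Rebuild the instance without invoking __init__.
        inst = cls.__new__(cls)
        for f in cls.__serialize_fields__:
            setattr(inst, f, data[f])
        return inst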
35 changes: 24 additions & 11 deletions lark/tree_matcher.py
@@ -1,10 +1,12 @@
"""Tree matcher based on Lark grammar"""

import re
from typing import List, Dict
from collections import defaultdict

from . import Tree, Token
from . import Tree, Token, Lark
from .common import ParserConf
from .exceptions import ConfigurationError
from .parsers import earley
from .grammar import Rule, Terminal, NonTerminal

@@ -39,7 +41,7 @@
return list(d.values())


def _best_rules_from_group(rules):
def _best_rules_from_group(rules: List[Rule]) -> List[Rule]:
rules = _best_from_group(rules, lambda r: r, lambda r: -len(r.expansion))
rules.sort(key=lambda r: len(r.expansion))
return rules
@@ -85,12 +87,23 @@
Initialize with an instance of Lark.
"""
rules_for_root: Dict[str, List[Rule]]
rules: List[Rule]
parser: Lark

def __init__(self, parser):
def __init__(self, parser: Lark):
# XXX TODO calling compile twice returns different results!
assert not parser.options.maybe_placeholders
# XXX TODO: we just ignore the potential existence of a postlexer
self.tokens, rules, _extra = parser.grammar.compile(parser.options.start, set())

if parser.options.postlex and parser.options.postlex.always_accept:
# If postlexer's always_accept is used, we need to recompile the grammar with empty terminals-to-keep
if not hasattr(parser, 'grammar'):
raise ConfigurationError('Source grammar not available from cached parser, use cache_grammar=True'
                         if parser.options.cache else "Source grammar not available!")

Codecov / codecov/patch: Added lines #L100-L101 in lark/tree_matcher.py were not covered by tests.
self.tokens, rules, _extra = parser.grammar.compile(parser.options.start, set())

Codecov / codecov/patch: Added line #L103 in lark/tree_matcher.py was not covered by tests.
else:
self.tokens = list(parser.terminals)
rules = list(parser.rules)

self.rules_for_root = defaultdict(list)

@@ -101,9 +114,9 @@
self.rules = _best_rules_from_group(self.rules)

self.parser = parser
self._parser_cache = {}
self._parser_cache: Dict[str, earley.Parser] = {}

def _build_recons_rules(self, rules):
def _build_recons_rules(self, rules: List[Rule]):
Copilot AI commented on Jul 6, 2025:

[nitpick] Add an explicit return type annotation for this generator method (e.g., -> Generator[Rule, None, None] or -> Iterator[Rule]) to improve readability and static analysis.
"Convert tree-parsing/construction rules to tree-matching rules"
expand1s = {r.origin for r in rules if r.options.expand1}

@@ -145,7 +158,7 @@
yield make_recons_rule_to_term(origin, NonTerminal(alias))
yield make_recons_rule_to_term(origin, origin)

def match_tree(self, tree, rulename):
def match_tree(self, tree: Tree, rulename: str) -> Tree:
"""Match the elements of `tree` to the symbols of rule `rulename`.
Parameters:
@@ -159,7 +172,7 @@
UnexpectedToken: If no match was found.
Note:
It's the callers' responsibility match the tree recursively.
It's the callers' responsibility to match the tree recursively.
"""
if rulename:
# validate
@@ -176,11 +189,11 @@

# TODO pass callbacks through dict, instead of alias?
callbacks = {rule: rule.alias for rule in rules}
conf = ParserConf(rules, callbacks, [rulename])
conf = ParserConf(rules, callbacks, [rulename]) # type: ignore[arg-type]
parser = earley.Parser(self.parser.lexer_conf, conf, _match, resolve_ambiguity=True)
self._parser_cache[rulename] = parser

# find a full derivation
unreduced_tree = parser.parse(ChildrenLexer(tree.children), rulename)
unreduced_tree: Tree = parser.parse(ChildrenLexer(tree.children), rulename)
assert unreduced_tree.data == rulename
return unreduced_tree
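The guard added to TreeMatcher.__init__ matters when a cached parser is combined with a postlexer that declares always_accept. A sketch of the failure mode it reports, assuming a hypothetical duck-typed postlexer (standing in for lark's PostLex ABC) and default caching behavior:

from lark import Lark
from lark.exceptions import ConfigurationError
from lark.reconstruct import Reconstructor

class PassthroughPostLex:
    # Hypothetical no-op postlexer; a non-empty always_accept is what
    # forces TreeMatcher to recompile the source grammar.
    always_accept = ('WORD',)
    def process(self, stream):
        return stream

grammar = r'''
start: WORD+
WORD: /\w+/
%ignore " "
'''

# Construct twice so the second call is a cache hit; note: no cache_grammar.
for _ in range(2):
    parser = Lark(grammar, parser='lalr', postlex=PassthroughPostLex(),
                  maybe_placeholders=False, cache=True)

try:
    Reconstructor(parser)  # needs parser.grammar, which a plain cache hit drops
except ConfigurationError as e:
    print(e)  # Source grammar not available from cached parser, use cache_grammar=True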
43 changes: 43 additions & 0 deletions tests/test_cache.py
@@ -4,8 +4,11 @@
from unittest import TestCase, main, skipIf

from lark import Lark, Tree, Transformer, UnexpectedInput
from lark.exceptions import ConfigurationError
from lark.lexer import Lexer, Token
import lark.lark as lark_module
from lark.reconstruct import Reconstructor
from . import test_reconstructor

from io import BytesIO

@@ -186,5 +189,45 @@ def test_error_message(self):
parser2.parse(text)
self.assertEqual(str(cm1.exception), str(cm2.exception))

def test_cache_grammar(self):
with self.assertRaises(ConfigurationError):
Lark(self.g, parser='lalr', cache=False, cache_grammar=True)

assert len(self.mock_fs.files) == 0
parser1 = Lark(self.g, parser='lalr', cache=True, cache_grammar=True)
parser2 = Lark(self.g, parser='lalr', cache=True, cache_grammar=True)
assert parser2.parse('a') == Tree('start', [])

# Assert that the cache file was created, and uses a different name than regular cache
assert len(self.mock_fs.files) == 1
assert 'cache_grammar' in list(self.mock_fs.files)[0]

# Assert the cached grammar is equal to the original grammar
assert parser1.grammar is not parser2.grammar
assert parser1.grammar.term_defs == parser2.grammar.term_defs
# Using repr() because RuleOptions doesn't implement __eq__
assert repr(parser1.grammar.rule_defs) == repr(parser2.grammar.rule_defs)

def test_reconstruct(self):
# Test that Reconstructor works with cached parsers (using cache_grammar)
grammar = """
start: (rule | NL)*
rule: WORD ":" NUMBER
NL: /(\\r?\\n)+\\s*/
""" + test_reconstructor.common

code = """
Elephants: 12
"""

_parser = Lark(grammar, parser='lalr', maybe_placeholders=False, cache=True, cache_grammar=True)
assert len(self.mock_fs.files) == 1
parser = Lark(grammar, parser='lalr', maybe_placeholders=False, cache=True, cache_grammar=True)
assert _parser.grammar is not parser.grammar
tree = parser.parse(code)
new = Reconstructor(parser).reconstruct(tree)
self.assertEqual(test_reconstructor._remove_ws(code), test_reconstructor._remove_ws(new))


if __name__ == '__main__':
main()