Merged
25 changes: 21 additions & 4 deletions lark/lark.py
@@ -56,6 +56,7 @@
propagate_positions: Union[bool, str]
maybe_placeholders: bool
cache: Union[bool, str]
cache_grammar: bool
regex: bool
g_regex_flags: int
keep_all_tokens: bool
@@ -99,6 +100,10 @@
- When ``False``, does nothing (default)
- When ``True``, caches to a temporary file in the local directory
- When given a string, caches to the path pointed by the string
cache_grammar
For use with the ``cache`` option. When ``True``, the unanalyzed grammar is also included in the cache.
Useful for classes that require ``Lark.grammar`` to be present (e.g. Reconstructor).
(default= ``False``)
regex
When True, uses the ``regex`` module instead of the stdlib ``re``.
g_regex_flags
@@ -165,6 +170,7 @@
'keep_all_tokens': False,
'tree_class': None,
'cache': False,
'cache_grammar': False,
'postlex': None,
'parser': 'earley',
'lexer': 'auto',
@@ -211,6 +217,9 @@
raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm. '
'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)')

if self.cache_grammar and not self.cache:
raise ConfigurationError('cache_grammar cannot be set when cache is disabled')

if o:
raise ConfigurationError("Unknown options: %s" % o.keys())

@@ -264,10 +273,16 @@
parser: 'ParsingFrontend'
terminals: Collection[TerminalDef]

__serialize_fields__ = ['parser', 'rules', 'options']

def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None:
self.options = LarkOptions(options)
re_module: types.ModuleType

# Update which fields are serialized
if self.options.cache_grammar:
self.__serialize_fields__ = self.__serialize_fields__ + ['grammar']

# Set regex or re module
use_regex = self.options.regex
if use_regex:
@@ -327,7 +342,9 @@
# specific reason - we just want a username.
username = "unknown"

cache_fn = tempfile.gettempdir() + "/.lark_cache_%s_%s_%s_%s.tmp" % (username, cache_sha256, *sys.version_info[:2])

cache_fn = tempfile.gettempdir() + "/.lark_%s_%s_%s_%s_%s.tmp" % (
"cache_grammar" if self.options.cache_grammar else "cache", username, cache_sha256, *sys.version_info[:2])

old_options = self.options
try:
@@ -397,7 +414,7 @@
raise ConfigurationError("invalid ambiguity option: %r. Must be one of %r" % (self.options.ambiguity, _VALID_AMBIGUITY_OPTIONS))

if self.options.parser is None:
terminals_to_keep = '*'
terminals_to_keep = '*' # For lexer-only mode, keep all terminals

Codecov / codecov/patch: Added line #L417 in lark/lark.py was not covered by tests.
elif self.options.postlex is not None:
terminals_to_keep = set(self.options.postlex.always_accept)
else:
@@ -454,8 +471,6 @@
if __doc__:
__doc__ += "\n\n" + LarkOptions.OPTIONS_DOC

__serialize_fields__ = 'parser', 'rules', 'options'

def _build_lexer(self, dont_ignore: bool=False) -> BasicLexer:
lexer_conf = self.lexer_conf
if dont_ignore:
@@ -531,6 +546,8 @@

assert memo_json
memo = SerializeMemoizer.deserialize(memo_json, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
if 'grammar' in data:
self.grammar = Grammar.deserialize(data['grammar'], memo)
options = dict(data['options'])
if (set(kwargs) - _LOAD_ALLOWED_OPTIONS) & set(LarkOptions._defaults):
raise ConfigurationError("Some options are not allowed when loading a Parser: {}"
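Taken together, the lark/lark.py changes let a parser restored from cache keep its source grammar. A minimal usage sketch of the new option (the grammar text and input below are illustrative, not taken from this PR):

from lark import Lark
from lark.reconstruct import Reconstructor

grammar = r'''
start: WORD ("," WORD)*
WORD: /\w+/
%ignore " "
'''

# First construction compiles the grammar and writes the cache file
# (whose name now contains "cache_grammar" instead of "cache");
# the unanalyzed grammar is stored alongside the parser tables.
Lark(grammar, parser='lalr', maybe_placeholders=False,
     cache=True, cache_grammar=True)

# Second construction is a cache hit, but parser.grammar is restored,
# so grammar-dependent tools such as Reconstructor keep working.
parser = Lark(grammar, parser='lalr', maybe_placeholders=False,
              cache=True, cache_grammar=True)

tree = parser.parse("hello, world")
print(Reconstructor(parser).reconstruct(tree))  # ignored whitespace is dropped: hello,world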
7 changes: 4 additions & 3 deletions lark/load_grammar.py
@@ -11,7 +11,7 @@
from contextlib import suppress
from typing import List, Tuple, Union, Callable, Dict, Optional, Sequence, Generator

from .utils import bfs, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors, OrderedSet
from .utils import bfs, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors, OrderedSet, Serialize
from .lexer import Token, TerminalDef, PatternStr, PatternRE, Pattern

from .parse_tree_builder import ParseTreeBuilder
@@ -676,7 +676,7 @@ def nr_deepcopy_tree(t):
return Transformer_NonRecursive(False).transform(t)


class Grammar:
class Grammar(Serialize):

term_defs: List[Tuple[str, Tuple[Tree, int]]]
rule_defs: List[Tuple[str, Tuple[str, ...], Tree, RuleOptions]]
@@ -687,6 +687,8 @@ def __init__(self, rule_defs: List[Tuple[str, Tuple[str, ...], Tree, RuleOptions
self.rule_defs = rule_defs
self.ignore = ignore

__serialize_fields__ = 'term_defs', 'rule_defs', 'ignore'
Copilot AI commented on Jul 6, 2025:

[nitpick] Use a consistent container type for __serialize_fields__ (e.g., a list) to match the pattern in other classes and avoid confusion.

Suggested change:
-__serialize_fields__ = 'term_defs', 'rule_defs', 'ignore'
+__serialize_fields__ = ['term_defs', 'rule_defs', 'ignore']

def compile(self, start, terminals_to_keep) -> Tuple[List[TerminalDef], List[Rule], List[str]]:
# We change the trees in-place (to support huge grammars)
# So deepcopy allows calling compile more than once.
@@ -977,7 +979,6 @@ def _parse_grammar(text, name, start='start'):

return PrepareGrammar().transform(tree)


def _error_repr(error):
if isinstance(error, UnexpectedToken):
error2 = _translate_parser_exception(_get_parser().parse, error)
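For context on the change above: Serialize drives (de)serialization off the class-level __serialize_fields__ declaration, which is why Grammar only needs to subclass it and list its three fields. A simplified sketch of the pattern — an illustration, not the actual lark.utils.Serialize, which also threads a memo through nested objects:

class Serialize:
    # Subclasses declare which attribute names to round-trip.
    __serialize_fields__ = ()

    def serialize(self):
        # Collect only the declared fields into a plain dict.
        data = {f: getattr(self, f) for f in self.__serialize_fields__}
        data['__type__'] = type(self).__name__
        return data

    @classmethod
    def deserialize(cls, data):
        # Rebuild the instance without invoking __init__.
        inst = cls.__new__(cls)
        for f in cls.__serialize_fields__:
            setattr(inst, f, data[f])
        return inst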
35 changes: 24 additions & 11 deletions lark/tree_matcher.py
@@ -1,10 +1,12 @@
"""Tree matcher based on Lark grammar"""

import re
from typing import List, Dict
from collections import defaultdict

from . import Tree, Token
from . import Tree, Token, Lark
from .common import ParserConf
from .exceptions import ConfigurationError
from .parsers import earley
from .grammar import Rule, Terminal, NonTerminal

@@ -39,7 +41,7 @@
return list(d.values())


def _best_rules_from_group(rules):
def _best_rules_from_group(rules: List[Rule]) -> List[Rule]:
rules = _best_from_group(rules, lambda r: r, lambda r: -len(r.expansion))
rules.sort(key=lambda r: len(r.expansion))
return rules
@@ -85,12 +87,23 @@
Initialize with an instance of Lark.
"""
rules_for_root: Dict[str, List[Rule]]
rules: List[Rule]
parser: Lark

def __init__(self, parser):
def __init__(self, parser: Lark):
# XXX TODO calling compile twice returns different results!
assert not parser.options.maybe_placeholders
# XXX TODO: we just ignore the potential existence of a postlexer
self.tokens, rules, _extra = parser.grammar.compile(parser.options.start, set())

if parser.options.postlex and parser.options.postlex.always_accept:
# If postlexer's always_accept is used, we need to recompile the grammar with empty terminals-to-keep
if not hasattr(parser, 'grammar'):
raise ConfigurationError('Source grammar not available from cached parser, use cache_grammar=True'
                         if parser.options.cache else "Source grammar not available!")

Codecov / codecov/patch: Added lines #L100-L101 in lark/tree_matcher.py were not covered by tests.
self.tokens, rules, _extra = parser.grammar.compile(parser.options.start, set())

Codecov / codecov/patch: Added line #L103 in lark/tree_matcher.py was not covered by tests.
else:
self.tokens = list(parser.terminals)
rules = list(parser.rules)

self.rules_for_root = defaultdict(list)

@@ -101,9 +114,9 @@
self.rules = _best_rules_from_group(self.rules)

self.parser = parser
self._parser_cache = {}
self._parser_cache: Dict[str, earley.Parser] = {}

def _build_recons_rules(self, rules):
def _build_recons_rules(self, rules: List[Rule]):
Copilot AI commented on Jul 6, 2025:

[nitpick] Add an explicit return type annotation for this generator method (e.g., -> Generator[Rule, None, None] or -> Iterator[Rule]) to improve readability and static analysis.
"Convert tree-parsing/construction rules to tree-matching rules"
expand1s = {r.origin for r in rules if r.options.expand1}

@@ -145,7 +158,7 @@
yield make_recons_rule_to_term(origin, NonTerminal(alias))
yield make_recons_rule_to_term(origin, origin)

def match_tree(self, tree, rulename):
def match_tree(self, tree: Tree, rulename: str) -> Tree:
"""Match the elements of `tree` to the symbols of rule `rulename`.
Parameters:
@@ -159,7 +172,7 @@
UnexpectedToken: If no match was found.
Note:
It's the callers' responsibility match the tree recursively.
It's the callers' responsibility to match the tree recursively.
"""
if rulename:
# validate
@@ -176,11 +189,11 @@

# TODO pass callbacks through dict, instead of alias?
callbacks = {rule: rule.alias for rule in rules}
conf = ParserConf(rules, callbacks, [rulename])
conf = ParserConf(rules, callbacks, [rulename]) # type: ignore[arg-type]
parser = earley.Parser(self.parser.lexer_conf, conf, _match, resolve_ambiguity=True)
self._parser_cache[rulename] = parser

# find a full derivation
unreduced_tree = parser.parse(ChildrenLexer(tree.children), rulename)
unreduced_tree: Tree = parser.parse(ChildrenLexer(tree.children), rulename)
assert unreduced_tree.data == rulename
return unreduced_tree
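The guard added to TreeMatcher.__init__ matters when a cached parser is combined with a postlexer that declares always_accept. A sketch of the failure mode it reports, assuming a hypothetical duck-typed postlexer (standing in for lark's PostLex ABC) and default caching behavior:

from lark import Lark
from lark.exceptions import ConfigurationError
from lark.reconstruct import Reconstructor

class PassthroughPostLex:
    # Hypothetical no-op postlexer; a non-empty always_accept is what
    # forces TreeMatcher to recompile the source grammar.
    always_accept = ('WORD',)
    def process(self, stream):
        return stream

grammar = r'''
start: WORD+
WORD: /\w+/
%ignore " "
'''

# Construct twice so the second call is a cache hit; note: no cache_grammar.
for _ in range(2):
    parser = Lark(grammar, parser='lalr', postlex=PassthroughPostLex(),
                  maybe_placeholders=False, cache=True)

try:
    Reconstructor(parser)  # needs parser.grammar, which a plain cache hit drops
except ConfigurationError as e:
    print(e)  # Source grammar not available from cached parser, use cache_grammar=True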
43 changes: 43 additions & 0 deletions tests/test_cache.py
@@ -4,8 +4,11 @@
from unittest import TestCase, main, skipIf

from lark import Lark, Tree, Transformer, UnexpectedInput
from lark.exceptions import ConfigurationError
from lark.lexer import Lexer, Token
import lark.lark as lark_module
from lark.reconstruct import Reconstructor
from . import test_reconstructor

from io import BytesIO

@@ -186,5 +189,45 @@ def test_error_message(self):
parser2.parse(text)
self.assertEqual(str(cm1.exception), str(cm2.exception))

def test_cache_grammar(self):
with self.assertRaises(ConfigurationError):
Lark(self.g, parser='lalr', cache=False, cache_grammar=True)

assert len(self.mock_fs.files) == 0
parser1 = Lark(self.g, parser='lalr', cache=True, cache_grammar=True)
parser2 = Lark(self.g, parser='lalr', cache=True, cache_grammar=True)
assert parser2.parse('a') == Tree('start', [])

# Assert that the cache file was created, and uses a different name than regular cache
assert len(self.mock_fs.files) == 1
assert 'cache_grammar' in list(self.mock_fs.files)[0]

# Assert the cached grammar is equal to the original grammar
assert parser1.grammar is not parser2.grammar
assert parser1.grammar.term_defs == parser2.grammar.term_defs
# Using repr() because RuleOptions doesn't implement __eq__
assert repr(parser1.grammar.rule_defs) == repr(parser2.grammar.rule_defs)

def test_reconstruct(self):
# Test that Reconstructor works with cached parsers (using cache_grammar)
grammar = """
start: (rule | NL)*
rule: WORD ":" NUMBER
NL: /(\\r?\\n)+\\s*/
""" + test_reconstructor.common

code = """
Elephants: 12
"""

_parser = Lark(grammar, parser='lalr', maybe_placeholders=False, cache=True, cache_grammar=True)
assert len(self.mock_fs.files) == 1
parser = Lark(grammar, parser='lalr', maybe_placeholders=False, cache=True, cache_grammar=True)
assert _parser.grammar is not parser.grammar
tree = parser.parse(code)
new = Reconstructor(parser).reconstruct(tree)
self.assertEqual(test_reconstructor._remove_ws(code), test_reconstructor._remove_ws(new))


if __name__ == '__main__':
main()