Speed up new backtracking parser (#2728)
parent 521d1b8129
commit 3e731527e4
CHANGES.md
@@ -24,6 +24,8 @@
   at least one pre-existing blank line (#2736)
 - Verbose mode also now describes how a project root was discovered and which paths will
   be formatted. (#2526)
+- Speed up the new backtracking parser about 4X in general (enabled when
+  `--target-version` is set to 3.10 and higher). (#2728)
 
 ### Packaging
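
Note: the faster path above is only exercised when a 3.10+ target is selected explicitly, e.g.:

    black --target-version py310 example.py

(`example.py` is a placeholder file name.)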
src/blib2to3/pgen2/parse.py
@@ -46,6 +46,17 @@ def lam_sub(grammar: Grammar, node: RawNode) -> NL:
     return Node(type=node[0], children=node[3], context=node[2])
 
 
+# A placeholder node, used when parser is backtracking.
+DUMMY_NODE = (-1, None, None, None)
+
+
+def stack_copy(
+    stack: List[Tuple[DFAS, int, RawNode]]
+) -> List[Tuple[DFAS, int, RawNode]]:
+    """Nodeless stack copy."""
+    return [(copy.deepcopy(dfa), label, DUMMY_NODE) for dfa, label, _ in stack]
+
+
 class Recorder:
     def __init__(self, parser: "Parser", ilabels: List[int], context: Context) -> None:
         self.parser = parser
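
The speed-up comes from what stack_copy avoids: only the small DFA tuples are deep-copied, while the potentially large partial parse trees hanging off each stack entry are replaced by one shared placeholder. A standalone sketch of the idea (names here are ours, not Black's):

    import copy
    from typing import Any, List, Tuple

    DUMMY: Tuple[int, None, None, None] = (-1, None, None, None)

    def nodeless_copy(stack: List[Tuple[Any, int, Any]]) -> List[Tuple[Any, int, Any]]:
        # Copy the small DFA/state bookkeeping, but share one placeholder
        # instead of deep-copying each partially built parse tree.
        return [(copy.deepcopy(dfa), state, DUMMY) for dfa, state, _ in stack]

    big_node = (256, None, None, ["leaf"] * 100_000)  # stand-in for a large tree
    stack = [(("dfa",), 0, big_node)]
    snapshot = nodeless_copy(stack)  # cheap: the tree itself is never copied
    assert snapshot[0][2] is DUMMY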
@@ -54,7 +65,7 @@ def __init__(self, parser: "Parser", ilabels: List[int], context: Context) -> None:
 
         self._dead_ilabels: Set[int] = set()
         self._start_point = self.parser.stack
-        self._points = {ilabel: copy.deepcopy(self._start_point) for ilabel in ilabels}
+        self._points = {ilabel: stack_copy(self._start_point) for ilabel in ilabels}
 
     @property
     def ilabels(self) -> Set[int]:
@@ -62,13 +73,32 @@ def ilabels(self) -> Set[int]:
 
     @contextmanager
     def switch_to(self, ilabel: int) -> Iterator[None]:
-        self.parser.stack = self._points[ilabel]
-        try:
-            yield
-        except ParseError:
-            self._dead_ilabels.add(ilabel)
-        finally:
-            self.parser.stack = self._start_point
+        with self.backtrack():
+            self.parser.stack = self._points[ilabel]
+            try:
+                yield
+            except ParseError:
+                self._dead_ilabels.add(ilabel)
+            finally:
+                self.parser.stack = self._start_point
+
+    @contextmanager
+    def backtrack(self) -> Iterator[None]:
+        """
+        Use node-level invariant operations for basic parsing (push/pop/shift).
+        These still operate on the stack, but they won't create any new nodes or
+        modify the contents of any other existing nodes.
+
+        This saves us a ton of time when we are backtracking, since we
+        want to restore to the initial state as quickly as possible, which
+        can only be done by making as few mutations as possible.
+        """
+        is_backtracking = self.parser.is_backtracking
+        try:
+            self.parser.is_backtracking = True
+            yield
+        finally:
+            self.parser.is_backtracking = is_backtracking
 
     def add_token(self, tok_type: int, tok_val: Text, raw: bool = False) -> None:
         func: Callable[..., Any]
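
The save/set/restore shape of backtrack() is what makes nesting safe: restoring the saved value rather than resetting to False means an inner backtrack() cannot clear a flag an outer one still needs. A minimal standalone sketch (our names, not Black's API):

    from contextlib import contextmanager
    from typing import Iterator

    class Flagged:
        def __init__(self) -> None:
            self.is_backtracking = False

        @contextmanager
        def backtrack(self) -> Iterator[None]:
            saved = self.is_backtracking      # remember the caller's value
            try:
                self.is_backtracking = True
                yield
            finally:
                self.is_backtracking = saved  # restored even if parsing raised

    p = Flagged()
    with p.backtrack():
        with p.backtrack():
            assert p.is_backtracking
        assert p.is_backtracking  # inner exit did not clear the outer flag
    assert not p.is_backtracking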
@@ -179,6 +209,7 @@ def __init__(self, grammar: Grammar, convert: Optional[Convert] = None) -> None:
         self.grammar = grammar
         # See note in docstring above. TL;DR this is ignored.
         self.convert = convert or lam_sub
+        self.is_backtracking = False
 
     def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None:
         """Prepare for parsing.
@@ -319,28 +350,40 @@ def classify(self, type: int, value: Text, context: Context) -> List[int]:
 
     def shift(self, type: int, value: Text, newstate: int, context: Context) -> None:
         """Shift a token. (Internal)"""
-        dfa, state, node = self.stack[-1]
-        rawnode: RawNode = (type, value, context, None)
-        newnode = convert(self.grammar, rawnode)
-        assert node[-1] is not None
-        node[-1].append(newnode)
-        self.stack[-1] = (dfa, newstate, node)
+        if self.is_backtracking:
+            dfa, state, _ = self.stack[-1]
+            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
+        else:
+            dfa, state, node = self.stack[-1]
+            rawnode: RawNode = (type, value, context, None)
+            newnode = convert(self.grammar, rawnode)
+            assert node[-1] is not None
+            node[-1].append(newnode)
+            self.stack[-1] = (dfa, newstate, node)
 
     def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None:
         """Push a nonterminal. (Internal)"""
-        dfa, state, node = self.stack[-1]
-        newnode: RawNode = (type, None, context, [])
-        self.stack[-1] = (dfa, newstate, node)
-        self.stack.append((newdfa, 0, newnode))
+        if self.is_backtracking:
+            dfa, state, _ = self.stack[-1]
+            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
+            self.stack.append((newdfa, 0, DUMMY_NODE))
+        else:
+            dfa, state, node = self.stack[-1]
+            newnode: RawNode = (type, None, context, [])
+            self.stack[-1] = (dfa, newstate, node)
+            self.stack.append((newdfa, 0, newnode))
 
     def pop(self) -> None:
         """Pop a nonterminal. (Internal)"""
-        popdfa, popstate, popnode = self.stack.pop()
-        newnode = convert(self.grammar, popnode)
-        if self.stack:
-            dfa, state, node = self.stack[-1]
-            assert node[-1] is not None
-            node[-1].append(newnode)
-        else:
-            self.rootnode = newnode
-            self.rootnode.used_names = self.used_names
+        if self.is_backtracking:
+            self.stack.pop()
+        else:
+            popdfa, popstate, popnode = self.stack.pop()
+            newnode = convert(self.grammar, popnode)
+            if self.stack:
+                dfa, state, node = self.stack[-1]
+                assert node[-1] is not None
+                node[-1].append(newnode)
+            else:
+                self.rootnode = newnode
+                self.rootnode.used_names = self.used_names
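
In all three methods the backtracking branch advances only the (dfa, state) bookkeeping and substitutes DUMMY_NODE for real nodes, so exploring a dead end allocates and mutates nothing that later has to be torn down. A toy model of the pattern (hypothetical names, not Black's API):

    from typing import Any, List, Tuple

    PLACEHOLDER: Tuple[int, None, None, None] = (-1, None, None, None)

    class MiniParser:
        def __init__(self) -> None:
            # (state, node) pairs; the real parser also carries a DFA per entry.
            self.stack: List[Tuple[int, Any]] = [(0, [])]
            self.is_backtracking = False

        def shift(self, newstate: int, token: str) -> None:
            if self.is_backtracking:
                # State bookkeeping only: no node is created or touched.
                self.stack[-1] = (newstate, PLACEHOLDER)
            else:
                state, node = self.stack[-1]
                node.append(token)  # real tree mutation
                self.stack[-1] = (newstate, node)

    p = MiniParser()
    p.shift(1, "a")           # normal mode: token recorded in the tree
    p.is_backtracking = True
    p.shift(2, "b")           # backtracking: stack advances, tree untouched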
tests/data/pattern_matching_generic.py (new file, 107 lines)
@@ -0,0 +1,107 @@
+re.match()
+match = a
+with match() as match:
+    match = f"{match}"
+
+re.match()
+match = a
+with match() as match:
+    match = f"{match}"
+
+
+def get_grammars(target_versions: Set[TargetVersion]) -> List[Grammar]:
+    if not target_versions:
+        # No target_version specified, so try all grammars.
+        return [
+            # Python 3.7+
+            pygram.python_grammar_no_print_statement_no_exec_statement_async_keywords,
+            # Python 3.0-3.6
+            pygram.python_grammar_no_print_statement_no_exec_statement,
+            # Python 2.7 with future print_function import
+            pygram.python_grammar_no_print_statement,
+            # Python 2.7
+            pygram.python_grammar,
+        ]
+
+    match match:
+        case case:
+            match match:
+                case case:
+                    pass
+
+    if all(version.is_python2() for version in target_versions):
+        # Python 2-only code, so try Python 2 grammars.
+        return [
+            # Python 2.7 with future print_function import
+            pygram.python_grammar_no_print_statement,
+            # Python 2.7
+            pygram.python_grammar,
+        ]
+
+    re.match()
+    match = a
+    with match() as match:
+        match = f"{match}"
+
+    def test_patma_139(self):
+        x = False
+        match x:
+            case bool(z):
+                y = 0
+        self.assertIs(x, False)
+        self.assertEqual(y, 0)
+        self.assertIs(z, x)
+
+    # Python 3-compatible code, so only try Python 3 grammar.
+    grammars = []
+    if supports_feature(target_versions, Feature.PATTERN_MATCHING):
+        # Python 3.10+
+        grammars.append(pygram.python_grammar_soft_keywords)
+    # If we have to parse both, try to parse async as a keyword first
+    if not supports_feature(
+        target_versions, Feature.ASYNC_IDENTIFIERS
+    ) and not supports_feature(target_versions, Feature.PATTERN_MATCHING):
+        # Python 3.7-3.9
+        grammars.append(
+            pygram.python_grammar_no_print_statement_no_exec_statement_async_keywords
+        )
+    if not supports_feature(target_versions, Feature.ASYNC_KEYWORDS):
+        # Python 3.0-3.6
+        grammars.append(pygram.python_grammar_no_print_statement_no_exec_statement)
+
+    def test_patma_155(self):
+        x = 0
+        y = None
+        match x:
+            case 1e1000:
+                y = 0
+        self.assertEqual(x, 0)
+        self.assertIs(y, None)
+
+    x = range(3)
+    match x:
+        case [y, case as x, z]:
+            w = 0
+
+    # At least one of the above branches must have been taken, because every Python
+    # version has exactly one of the two 'ASYNC_*' flags
+    return grammars
+
+
+def lib2to3_parse(src_txt: str, target_versions: Iterable[TargetVersion] = ()) -> Node:
+    """Given a string with source, return the lib2to3 Node."""
+    if not src_txt.endswith("\n"):
+        src_txt += "\n"
+
+    grammars = get_grammars(set(target_versions))
+
+
+re.match()
+match = a
+with match() as match:
+    match = f"{match}"
+
+re.match()
+match = a
+with match() as match:
+    match = f"{match}"
tests/test_format.py
@@ -69,6 +69,7 @@
     "pattern_matching_complex",
     "pattern_matching_extras",
     "pattern_matching_style",
+    "pattern_matching_generic",
     "parenthesized_context_managers",
 ]
 