Speed up new backtracking parser (#2728)

Batuhan Taskaya 2022-01-10 21:22:00 +03:00 committed by GitHub
parent 521d1b8129
commit 3e731527e4
4 changed files with 176 additions and 23 deletions

CHANGES.md

@@ -24,6 +24,8 @@
   at least one pre-existing blank line (#2736)
 - Verbose mode also now describes how a project root was discovered and which paths will
   be formatted. (#2526)
+- Speed up the new backtracking parser by about 4X in general (enabled when
+  `--target-version` is set to 3.10 or higher). (#2728)
 
 ### Packaging
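Note: the faster path is only exercised when the soft-keyword grammar is selected, i.e. when the target version is 3.10 or newer. An illustrative invocation (not part of this commit):

    black --target-version py310 example.py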

src/blib2to3/pgen2/parse.py

@@ -46,6 +46,17 @@ def lam_sub(grammar: Grammar, node: RawNode) -> NL:
     return Node(type=node[0], children=node[3], context=node[2])
 
 
+# A placeholder node, used when the parser is backtracking.
+DUMMY_NODE = (-1, None, None, None)
+
+
+def stack_copy(
+    stack: List[Tuple[DFAS, int, RawNode]]
+) -> List[Tuple[DFAS, int, RawNode]]:
+    """Nodeless stack copy."""
+    return [(copy.deepcopy(dfa), label, DUMMY_NODE) for dfa, label, _ in stack]
+
+
 class Recorder:
     def __init__(self, parser: "Parser", ilabels: List[int], context: Context) -> None:
         self.parser = parser
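For context: stack_copy deep-copies only the small per-entry DFA and swaps the accumulated parse node for the shared DUMMY_NODE placeholder, so copy cost no longer grows with the amount of input already parsed. A minimal standalone sketch of the effect (illustrative values, not from this commit):

    import copy
    import timeit

    DUMMY_NODE = (-1, None, None, None)

    # Stand-in stack entries: (dfa, label, node); the node carries many children.
    dfa = {0: [(1, 2), (3, 4)]}
    node = (257, None, None, [(1, "x", None, None)] * 5000)
    stack = [(dfa, i, node) for i in range(10)]

    full = timeit.timeit(lambda: copy.deepcopy(stack), number=100)
    nodeless = timeit.timeit(
        lambda: [(copy.deepcopy(d), label, DUMMY_NODE) for d, label, _ in stack],
        number=100,
    )
    print(f"full deepcopy: {full:.3f}s  nodeless copy: {nodeless:.3f}s")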
@@ -54,7 +65,7 @@ def __init__(self, parser: "Parser", ilabels: List[int], context: Context) -> None:
         self._dead_ilabels: Set[int] = set()
         self._start_point = self.parser.stack
-        self._points = {ilabel: copy.deepcopy(self._start_point) for ilabel in ilabels}
+        self._points = {ilabel: stack_copy(self._start_point) for ilabel in ilabels}
 
     @property
     def ilabels(self) -> Set[int]:
@@ -62,13 +73,32 @@ def ilabels(self) -> Set[int]:
     @contextmanager
     def switch_to(self, ilabel: int) -> Iterator[None]:
-        self.parser.stack = self._points[ilabel]
-
-        try:
-            yield
-        except ParseError:
-            self._dead_ilabels.add(ilabel)
-        finally:
-            self.parser.stack = self._start_point
+        with self.backtrack():
+            self.parser.stack = self._points[ilabel]
+            try:
+                yield
+            except ParseError:
+                self._dead_ilabels.add(ilabel)
+            finally:
+                self.parser.stack = self._start_point
+
+    @contextmanager
+    def backtrack(self) -> Iterator[None]:
+        """
+        Switch the basic parsing operations (push/pop/shift) to node-preserving
+        variants. These will still operate on the stack, but they won't create
+        any new nodes or modify the contents of any existing nodes.
+
+        This saves us a ton of time when we are backtracking, since we
+        want to restore the initial state as quickly as possible, which
+        can only be done by making as few mutations as possible.
+        """
+        is_backtracking = self.parser.is_backtracking
+        try:
+            self.parser.is_backtracking = True
+            yield
+        finally:
+            self.parser.is_backtracking = is_backtracking
 
     def add_token(self, tok_type: int, tok_val: Text, raw: bool = False) -> None:
         func: Callable[..., Any]
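For context: backtrack() saves and restores the previous value of is_backtracking rather than resetting it to False, which keeps nested speculative attempts safe. A minimal standalone sketch of that save/restore pattern (illustrative names, not Black's API):

    from contextlib import contextmanager
    from typing import Iterator

    class Toy:
        def __init__(self) -> None:
            self.is_backtracking = False

        @contextmanager
        def backtrack(self) -> Iterator[None]:
            saved = self.is_backtracking      # remember the outer state
            try:
                self.is_backtracking = True
                yield
            finally:
                self.is_backtracking = saved  # restore, not reset

    t = Toy()
    with t.backtrack():
        with t.backtrack():                   # nested speculative attempt
            assert t.is_backtracking
        assert t.is_backtracking              # still True after the inner exit
    assert not t.is_backtracking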
@@ -179,6 +209,7 @@ def __init__(self, grammar: Grammar, convert: Optional[Convert] = None) -> None:
         self.grammar = grammar
         # See note in docstring above. TL;DR this is ignored.
         self.convert = convert or lam_sub
+        self.is_backtracking = False
 
     def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None:
         """Prepare for parsing.
@@ -319,28 +350,40 @@ def classify(self, type: int, value: Text, context: Context) -> List[int]:
     def shift(self, type: int, value: Text, newstate: int, context: Context) -> None:
         """Shift a token. (Internal)"""
-        dfa, state, node = self.stack[-1]
-        rawnode: RawNode = (type, value, context, None)
-        newnode = convert(self.grammar, rawnode)
-        assert node[-1] is not None
-        node[-1].append(newnode)
-        self.stack[-1] = (dfa, newstate, node)
+        if self.is_backtracking:
+            dfa, state, _ = self.stack[-1]
+            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
+        else:
+            dfa, state, node = self.stack[-1]
+            rawnode: RawNode = (type, value, context, None)
+            newnode = convert(self.grammar, rawnode)
+            assert node[-1] is not None
+            node[-1].append(newnode)
+            self.stack[-1] = (dfa, newstate, node)
 
     def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None:
         """Push a nonterminal. (Internal)"""
-        dfa, state, node = self.stack[-1]
-        newnode: RawNode = (type, None, context, [])
-        self.stack[-1] = (dfa, newstate, node)
-        self.stack.append((newdfa, 0, newnode))
+        if self.is_backtracking:
+            dfa, state, _ = self.stack[-1]
+            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
+            self.stack.append((newdfa, 0, DUMMY_NODE))
+        else:
+            dfa, state, node = self.stack[-1]
+            newnode: RawNode = (type, None, context, [])
+            self.stack[-1] = (dfa, newstate, node)
+            self.stack.append((newdfa, 0, newnode))
 
     def pop(self) -> None:
         """Pop a nonterminal. (Internal)"""
-        popdfa, popstate, popnode = self.stack.pop()
-        newnode = convert(self.grammar, popnode)
-        if self.stack:
-            dfa, state, node = self.stack[-1]
-            assert node[-1] is not None
-            node[-1].append(newnode)
-        else:
-            self.rootnode = newnode
-            self.rootnode.used_names = self.used_names
+        if self.is_backtracking:
+            self.stack.pop()
+        else:
+            popdfa, popstate, popnode = self.stack.pop()
+            newnode = convert(self.grammar, popnode)
+            if self.stack:
+                dfa, state, node = self.stack[-1]
+                assert node[-1] is not None
+                node[-1].append(newnode)
+            else:
+                self.rootnode = newnode
+                self.rootnode.used_names = self.used_names
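Taken together, the guards above implement a two-phase scheme: while is_backtracking is set, shift/push/pop only update the (dfa, state) bookkeeping and never build tree nodes; once a branch wins, the token is applied again normally so real nodes are built exactly once. A self-contained toy of the same idea (illustrative, not Black's API):

    from contextlib import contextmanager
    from typing import Iterator, List

    class ParseError(Exception):
        pass

    class ToyParser:
        def __init__(self) -> None:
            self.is_backtracking = False
            self.nodes: List[str] = []        # stands in for the parse tree

        @contextmanager
        def backtrack(self) -> Iterator[None]:
            saved, self.is_backtracking = self.is_backtracking, True
            try:
                yield
            finally:
                self.is_backtracking = saved

        def shift(self, token: str) -> None:
            if token == "bad":
                raise ParseError(token)
            if not self.is_backtracking:
                self.nodes.append(token)      # skipped while speculating

    parser = ToyParser()
    for candidate in ("bad", "good"):
        with parser.backtrack():              # cheap, node-free attempt
            try:
                parser.shift(candidate)
            except ParseError:
                continue
        parser.shift(candidate)               # replay the winner for real
        break
    assert parser.nodes == ["good"]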

tests/data/pattern_matching_generic.py (new file; exercises `match` and `case` as both soft keywords and ordinary identifiers)

@@ -0,0 +1,107 @@
+re.match()
+match = a
+with match() as match:
+    match = f"{match}"
+
+re.match()
+match = a
+with match() as match:
+    match = f"{match}"
+
+
+def get_grammars(target_versions: Set[TargetVersion]) -> List[Grammar]:
+    if not target_versions:
+        # No target_version specified, so try all grammars.
+        return [
+            # Python 3.7+
+            pygram.python_grammar_no_print_statement_no_exec_statement_async_keywords,
+            # Python 3.0-3.6
+            pygram.python_grammar_no_print_statement_no_exec_statement,
+            # Python 2.7 with future print_function import
+            pygram.python_grammar_no_print_statement,
+            # Python 2.7
+            pygram.python_grammar,
+        ]
+
+    match match:
+        case case:
+            match match:
+                case case:
+                    pass
+
+    if all(version.is_python2() for version in target_versions):
+        # Python 2-only code, so try Python 2 grammars.
+        return [
+            # Python 2.7 with future print_function import
+            pygram.python_grammar_no_print_statement,
+            # Python 2.7
+            pygram.python_grammar,
+        ]
+
+    re.match()
+    match = a
+    with match() as match:
+        match = f"{match}"
+
+    def test_patma_139(self):
+        x = False
+        match x:
+            case bool(z):
+                y = 0
+        self.assertIs(x, False)
+        self.assertEqual(y, 0)
+        self.assertIs(z, x)
+
+    # Python 3-compatible code, so only try Python 3 grammar.
+    grammars = []
+    if supports_feature(target_versions, Feature.PATTERN_MATCHING):
+        # Python 3.10+
+        grammars.append(pygram.python_grammar_soft_keywords)
+    # If we have to parse both, try to parse async as a keyword first
+    if not supports_feature(
+        target_versions, Feature.ASYNC_IDENTIFIERS
+    ) and not supports_feature(target_versions, Feature.PATTERN_MATCHING):
+        # Python 3.7-3.9
+        grammars.append(
+            pygram.python_grammar_no_print_statement_no_exec_statement_async_keywords
+        )
+    if not supports_feature(target_versions, Feature.ASYNC_KEYWORDS):
+        # Python 3.0-3.6
+        grammars.append(pygram.python_grammar_no_print_statement_no_exec_statement)
+
+    def test_patma_155(self):
+        x = 0
+        y = None
+        match x:
+            case 1e1000:
+                y = 0
+        self.assertEqual(x, 0)
+        self.assertIs(y, None)
+
+    x = range(3)
+    match x:
+        case [y, case as x, z]:
+            w = 0
+
+    # At least one of the above branches must have been taken, because every Python
+    # version has exactly one of the two 'ASYNC_*' flags
+    return grammars
+
+
+def lib2to3_parse(src_txt: str, target_versions: Iterable[TargetVersion] = ()) -> Node:
+    """Given a string with source, return the lib2to3 Node."""
+    if not src_txt.endswith("\n"):
+        src_txt += "\n"
+
+    grammars = get_grammars(set(target_versions))
+
+
+re.match()
+match = a
+with match() as match:
+    match = f"{match}"
+
+re.match()
+match = a
+with match() as match:
+    match = f"{match}"
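To run just this new case locally, something like the following should work, assuming the standard Black test setup:

    pytest tests/test_format.py -k pattern_matching_generic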

tests/test_format.py

@@ -69,6 +69,7 @@
     "pattern_matching_complex",
     "pattern_matching_extras",
     "pattern_matching_style",
+    "pattern_matching_generic",
     "parenthesized_context_managers",
 ]