Replace the blib2to3 tokenizer with pytokens (#4536)

Authored by Tushar Sadhwani on 2025-03-16 06:11:19 +05:30; committed by GitHub
parent 9f38928414
commit 5342d2eeda
9 changed files with 204 additions and 1109 deletions


@@ -51,6 +51,7 @@ repos:
- click >= 8.1.0, != 8.1.4, != 8.1.5
- packaging >= 22.0
- platformdirs >= 2.1.0
- pytokens >= 0.1.10
- pytest
- hypothesis
- aiohttp >= 3.7.4


@@ -29,6 +29,7 @@
<!-- Changes to the parser or to version autodetection -->
- Rewrite tokenizer to improve performance and compliance (#4536)
- Fix bug where certain unusual expressions (e.g., lambdas) were not accepted
in type parameter bounds and defaults. (#4602)


@@ -69,6 +69,7 @@ dependencies = [
"packaging>=22.0",
"pathspec>=0.9.0",
"platformdirs>=2",
"pytokens>=0.1.10",
"tomli>=1.1.0; python_version < '3.11'",
"typing_extensions>=4.0.1; python_version < '3.11'",
]
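
The runtime requirement added here matches the pin in the pre-commit mypy hook above. As a purely hypothetical sanity check (not part of this commit), the installed version can be inspected with importlib.metadata:

# Hypothetical check, not part of this diff: confirm the installed pytokens
# satisfies the lower bound pinned above.
from importlib.metadata import version

print(version("pytokens"))  # expected to be >= 0.1.10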


@@ -28,7 +28,7 @@
from typing import IO, Any, Optional, Union, cast
from blib2to3.pgen2.grammar import Grammar
from blib2to3.pgen2.tokenize import GoodTokenInfo
from blib2to3.pgen2.tokenize import TokenInfo
from blib2to3.pytree import NL
# Pgen imports
@@ -112,7 +112,7 @@ def __init__(self, grammar: Grammar, logger: Optional[Logger] = None) -> None:
logger = logging.getLogger(__name__)
self.logger = logger
def parse_tokens(self, tokens: Iterable[GoodTokenInfo], debug: bool = False) -> NL:
def parse_tokens(self, tokens: Iterable[TokenInfo], debug: bool = False) -> NL:
"""Parse a series of tokens and return the syntax tree."""
# XXX Move the prefix computation into a wrapper around tokenize.
proxy = TokenProxy(tokens)
@@ -180,27 +180,17 @@ def parse_tokens(self, tokens: Iterable[GoodTokenInfo], debug: bool = False) ->
assert p.rootnode is not None
return p.rootnode
def parse_stream_raw(self, stream: IO[str], debug: bool = False) -> NL:
"""Parse a stream and return the syntax tree."""
tokens = tokenize.generate_tokens(stream.readline, grammar=self.grammar)
return self.parse_tokens(tokens, debug)
def parse_stream(self, stream: IO[str], debug: bool = False) -> NL:
"""Parse a stream and return the syntax tree."""
return self.parse_stream_raw(stream, debug)
def parse_file(
self, filename: Path, encoding: Optional[str] = None, debug: bool = False
) -> NL:
"""Parse a file and return the syntax tree."""
with open(filename, encoding=encoding) as stream:
return self.parse_stream(stream, debug)
text = stream.read()
return self.parse_string(text, debug)
def parse_string(self, text: str, debug: bool = False) -> NL:
"""Parse a string and return the syntax tree."""
tokens = tokenize.generate_tokens(
io.StringIO(text).readline, grammar=self.grammar
)
tokens = tokenize.tokenize(text, grammar=self.grammar)
return self.parse_tokens(tokens, debug)
def _partially_consume_prefix(self, prefix: str, column: int) -> tuple[str, str]:
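
The Driver keeps its public entry points; only how tokens are produced changes. A minimal sketch of the new path from Black's side, assuming the usual call chain in which black.lib2to3_parse constructs a Driver and calls parse_string:

# Sketch: parse_string now tokenizes the whole source in one call via
# tokenize.tokenize(text, grammar=...) instead of feeding a readline callback
# to the removed generate_tokens/parse_stream path.
import black

tree = black.lib2to3_parse("x = 1\n")
print(type(tree))  # a blib2to3 pytree node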


@@ -6,7 +6,7 @@
from typing import IO, Any, NoReturn, Optional, Union
from blib2to3.pgen2 import grammar, token, tokenize
from blib2to3.pgen2.tokenize import GoodTokenInfo
from blib2to3.pgen2.tokenize import TokenInfo
Path = Union[str, "os.PathLike[str]"]
@@ -18,7 +18,7 @@ class PgenGrammar(grammar.Grammar):
class ParserGenerator:
filename: Path
stream: IO[str]
generator: Iterator[GoodTokenInfo]
generator: Iterator[TokenInfo]
first: dict[str, Optional[dict[str, int]]]
def __init__(self, filename: Path, stream: Optional[IO[str]] = None) -> None:
@@ -27,8 +27,7 @@ def __init__(self, filename: Path, stream: Optional[IO[str]] = None) -> None:
stream = open(filename, encoding="utf-8")
close_stream = stream.close
self.filename = filename
self.stream = stream
self.generator = tokenize.generate_tokens(stream.readline)
self.generator = tokenize.tokenize(stream.read())
self.gettoken() # Initialize lookahead
self.dfas, self.startsymbol = self.parse()
if close_stream is not None:

File diff suppressed because it is too large.
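
The rewritten blib2to3.pgen2.tokenize module (now backed by pytokens) is too large to show here, but its public surface, as exercised by the driver above and the tests below, is a single full-text entry point. A minimal sketch based on that usage:

# tokenize.tokenize takes the complete source string (no readline callback) and
# yields TokenInfo 5-tuples: (type, string, start (row, col), end (row, col), line).
from blib2to3.pgen2 import token, tokenize

for tok_type, string, start, end, _line in tokenize.tokenize("x = 1\n"):
    print(token.tok_name[tok_type], repr(string), start, end)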


@@ -232,8 +232,6 @@ file_input
fstring
FSTRING_START
"f'"
FSTRING_MIDDLE
''
fstring_replacement_field
LBRACE
'{'
@@ -242,8 +240,6 @@ file_input
RBRACE
'}'
/fstring_replacement_field
FSTRING_MIDDLE
''
fstring_replacement_field
LBRACE
'{'
@@ -252,8 +248,6 @@ file_input
RBRACE
'}'
/fstring_replacement_field
FSTRING_MIDDLE
''
FSTRING_END
"'"
/fstring
@@ -399,8 +393,6 @@ file_input
fstring
FSTRING_START
"f'"
FSTRING_MIDDLE
''
fstring_replacement_field
LBRACE
'{'
@@ -419,8 +411,6 @@ file_input
RBRACE
'}'
/fstring_replacement_field
FSTRING_MIDDLE
''
FSTRING_END
"'"
/fstring
@@ -549,8 +539,6 @@ file_input
fstring
FSTRING_START
"f'"
FSTRING_MIDDLE
''
fstring_replacement_field
LBRACE
'{'
@@ -559,8 +547,6 @@ file_input
RBRACE
'}'
/fstring_replacement_field
FSTRING_MIDDLE
''
fstring_replacement_field
LBRACE
'{'
@@ -569,8 +555,6 @@ file_input
RBRACE
'}'
/fstring_replacement_field
FSTRING_MIDDLE
''
FSTRING_END
"'"
/fstring
@@ -660,8 +644,6 @@ file_input
RBRACE
'}'
/fstring_replacement_field
FSTRING_MIDDLE
''
FSTRING_END
"'"
/fstring
@@ -744,8 +726,6 @@ file_input
RBRACE
'}'
/fstring_replacement_field
FSTRING_MIDDLE
''
FSTRING_END
"'"
/fstring
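
These debug-tree expectations shrink because the new tokenizer no longer emits empty FSTRING_MIDDLE tokens around replacement fields, and the braces arrive as plain OP tokens (see the updated tokenizer tests below). A small check illustrating the new stream, using the same input as those tests:

# The token stream for f"{x}" contains no empty FSTRING_MIDDLE entries, and the
# '{' / '}' of the replacement field are reported as OP rather than LBRACE/RBRACE.
from blib2to3.pgen2 import token, tokenize

names = [token.tok_name[tok_type] for tok_type, *_ in tokenize.tokenize('f"{x}"')]
assert names == ["FSTRING_START", "OP", "NAME", "OP", "FSTRING_END", "ENDMARKER"]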


@@ -463,17 +463,6 @@ def test_tab_comment_indentation(self) -> None:
self.assertFormatEqual(contents_spc, fs(contents_spc))
self.assertFormatEqual(contents_spc, fs(contents_tab))
# mixed tabs and spaces (valid Python 2 code)
contents_tab = "if 1:\n if 2:\n\t\tpass\n\t# comment\n pass\n"
contents_spc = "if 1:\n if 2:\n pass\n # comment\n pass\n"
self.assertFormatEqual(contents_spc, fs(contents_spc))
self.assertFormatEqual(contents_spc, fs(contents_tab))
contents_tab = "if 1:\n if 2:\n\t\tpass\n\t\t# comment\n pass\n"
contents_spc = "if 1:\n if 2:\n pass\n # comment\n pass\n"
self.assertFormatEqual(contents_spc, fs(contents_spc))
self.assertFormatEqual(contents_spc, fs(contents_tab))
def test_false_positive_symlink_output_issue_3384(self) -> None:
# Emulate the behavior when using the CLI (`black ./child --verbose`), which
# involves patching some `pathlib.Path` methods. In particular, `is_dir` is
@@ -1980,7 +1969,7 @@ def test_for_handled_unexpected_eof_error(self) -> None:
with pytest.raises(black.parsing.InvalidInput) as exc_info:
black.lib2to3_parse("print(", {})
exc_info.match("Cannot parse: 2:0: EOF in multi-line statement")
exc_info.match("Cannot parse: 1:6: Unexpected EOF in multi-line statement")
def test_line_ranges_with_code_option(self) -> None:
code = textwrap.dedent("""\


@@ -1,6 +1,5 @@
"""Tests for the blib2to3 tokenizer."""
import io
import sys
import textwrap
from dataclasses import dataclass
@@ -19,16 +18,10 @@ class Token:
def get_tokens(text: str) -> list[Token]:
"""Return the tokens produced by the tokenizer."""
readline = io.StringIO(text).readline
tokens: list[Token] = []
def tokeneater(
type: int, string: str, start: tokenize.Coord, end: tokenize.Coord, line: str
) -> None:
tokens.append(Token(token.tok_name[type], string, start, end))
tokenize.tokenize(readline, tokeneater)
return tokens
return [
Token(token.tok_name[tok_type], string, start, end)
for tok_type, string, start, end, _ in tokenize.tokenize(text)
]
def assert_tokenizes(text: str, tokens: list[Token]) -> None:
@@ -69,11 +62,9 @@ def test_fstring() -> None:
'f"{x}"',
[
Token("FSTRING_START", 'f"', (1, 0), (1, 2)),
Token("FSTRING_MIDDLE", "", (1, 2), (1, 2)),
Token("LBRACE", "{", (1, 2), (1, 3)),
Token("OP", "{", (1, 2), (1, 3)),
Token("NAME", "x", (1, 3), (1, 4)),
Token("RBRACE", "}", (1, 4), (1, 5)),
Token("FSTRING_MIDDLE", "", (1, 5), (1, 5)),
Token("OP", "}", (1, 4), (1, 5)),
Token("FSTRING_END", '"', (1, 5), (1, 6)),
Token("ENDMARKER", "", (2, 0), (2, 0)),
],
@@ -82,13 +73,11 @@ def test_fstring() -> None:
'f"{x:y}"\n',
[
Token(type="FSTRING_START", string='f"', start=(1, 0), end=(1, 2)),
Token(type="FSTRING_MIDDLE", string="", start=(1, 2), end=(1, 2)),
Token(type="LBRACE", string="{", start=(1, 2), end=(1, 3)),
Token(type="OP", string="{", start=(1, 2), end=(1, 3)),
Token(type="NAME", string="x", start=(1, 3), end=(1, 4)),
Token(type="OP", string=":", start=(1, 4), end=(1, 5)),
Token(type="FSTRING_MIDDLE", string="y", start=(1, 5), end=(1, 6)),
Token(type="RBRACE", string="}", start=(1, 6), end=(1, 7)),
Token(type="FSTRING_MIDDLE", string="", start=(1, 7), end=(1, 7)),
Token(type="OP", string="}", start=(1, 6), end=(1, 7)),
Token(type="FSTRING_END", string='"', start=(1, 7), end=(1, 8)),
Token(type="NEWLINE", string="\n", start=(1, 8), end=(1, 9)),
Token(type="ENDMARKER", string="", start=(2, 0), end=(2, 0)),
@@ -99,10 +88,9 @@ def test_fstring() -> None:
[
Token(type="FSTRING_START", string='f"', start=(1, 0), end=(1, 2)),
Token(type="FSTRING_MIDDLE", string="x\\\n", start=(1, 2), end=(2, 0)),
Token(type="LBRACE", string="{", start=(2, 0), end=(2, 1)),
Token(type="OP", string="{", start=(2, 0), end=(2, 1)),
Token(type="NAME", string="a", start=(2, 1), end=(2, 2)),
Token(type="RBRACE", string="}", start=(2, 2), end=(2, 3)),
Token(type="FSTRING_MIDDLE", string="", start=(2, 3), end=(2, 3)),
Token(type="OP", string="}", start=(2, 2), end=(2, 3)),
Token(type="FSTRING_END", string='"', start=(2, 3), end=(2, 4)),
Token(type="NEWLINE", string="\n", start=(2, 4), end=(2, 5)),
Token(type="ENDMARKER", string="", start=(3, 0), end=(3, 0)),