Replace the blib2to3 tokenizer with pytokens (#4536)

Tushar Sadhwani 2025-03-16 06:11:19 +05:30 committed by GitHub
parent 9f38928414
commit 5342d2eeda
9 changed files with 204 additions and 1109 deletions


@@ -51,6 +51,7 @@ repos:
 - click >= 8.1.0, != 8.1.4, != 8.1.5
 - packaging >= 22.0
 - platformdirs >= 2.1.0
+- pytokens >= 0.1.10
 - pytest
 - hypothesis
 - aiohttp >= 3.7.4


@@ -29,6 +29,7 @@
 <!-- Changes to the parser or to version autodetection -->
+- Rewrite tokenizer to improve performance and compliance (#4536)
 - Fix bug where certain unusual expressions (e.g., lambdas) were not accepted
   in type parameter bounds and defaults. (#4602)


@@ -69,6 +69,7 @@ dependencies = [
     "packaging>=22.0",
     "pathspec>=0.9.0",
     "platformdirs>=2",
+    "pytokens>=0.1.10",
     "tomli>=1.1.0; python_version < '3.11'",
     "typing_extensions>=4.0.1; python_version < '3.11'",
 ]


@@ -28,7 +28,7 @@
 from typing import IO, Any, Optional, Union, cast

 from blib2to3.pgen2.grammar import Grammar
-from blib2to3.pgen2.tokenize import GoodTokenInfo
+from blib2to3.pgen2.tokenize import TokenInfo
 from blib2to3.pytree import NL

 # Pgen imports
@@ -112,7 +112,7 @@ def __init__(self, grammar: Grammar, logger: Optional[Logger] = None) -> None:
             logger = logging.getLogger(__name__)
         self.logger = logger

-    def parse_tokens(self, tokens: Iterable[GoodTokenInfo], debug: bool = False) -> NL:
+    def parse_tokens(self, tokens: Iterable[TokenInfo], debug: bool = False) -> NL:
         """Parse a series of tokens and return the syntax tree."""
         # XXX Move the prefix computation into a wrapper around tokenize.
         proxy = TokenProxy(tokens)
@@ -180,27 +180,17 @@ def parse_tokens(self, tokens: Iterable[GoodTokenInfo], debug: bool = False) ->
         assert p.rootnode is not None
         return p.rootnode

-    def parse_stream_raw(self, stream: IO[str], debug: bool = False) -> NL:
-        """Parse a stream and return the syntax tree."""
-        tokens = tokenize.generate_tokens(stream.readline, grammar=self.grammar)
-        return self.parse_tokens(tokens, debug)
-
-    def parse_stream(self, stream: IO[str], debug: bool = False) -> NL:
-        """Parse a stream and return the syntax tree."""
-        return self.parse_stream_raw(stream, debug)
-
     def parse_file(
         self, filename: Path, encoding: Optional[str] = None, debug: bool = False
     ) -> NL:
         """Parse a file and return the syntax tree."""
         with open(filename, encoding=encoding) as stream:
-            return self.parse_stream(stream, debug)
+            text = stream.read()
+        return self.parse_string(text, debug)

     def parse_string(self, text: str, debug: bool = False) -> NL:
         """Parse a string and return the syntax tree."""
-        tokens = tokenize.generate_tokens(
-            io.StringIO(text).readline, grammar=self.grammar
-        )
+        tokens = tokenize.tokenize(text, grammar=self.grammar)
         return self.parse_tokens(tokens, debug)

     def _partially_consume_prefix(self, prefix: str, column: int) -> tuple[str, str]:
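
With parse_stream and parse_stream_raw gone, the driver is handed whole source strings rather than a readline callable. A minimal usage sketch (not part of this diff) of the surviving path, going through Black's own wrapper, which ends up in Driver.parse_string and the new tokenize.tokenize(text, grammar=...):

    import black  # black.lib2to3_parse wraps Driver.parse_string internally

    source = "print('hello')\n"
    tree = black.lib2to3_parse(source, set())  # empty collection: no explicit target versions
    print(type(tree))  # a blib2to3 Node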


@@ -6,7 +6,7 @@
 from typing import IO, Any, NoReturn, Optional, Union

 from blib2to3.pgen2 import grammar, token, tokenize
-from blib2to3.pgen2.tokenize import GoodTokenInfo
+from blib2to3.pgen2.tokenize import TokenInfo

 Path = Union[str, "os.PathLike[str]"]
@@ -18,7 +18,7 @@ class PgenGrammar(grammar.Grammar):
 class ParserGenerator:
     filename: Path
     stream: IO[str]
-    generator: Iterator[GoodTokenInfo]
+    generator: Iterator[TokenInfo]
     first: dict[str, Optional[dict[str, int]]]

     def __init__(self, filename: Path, stream: Optional[IO[str]] = None) -> None:
@@ -27,8 +27,7 @@ def __init__(self, filename: Path, stream: Optional[IO[str]] = None) -> None:
             stream = open(filename, encoding="utf-8")
             close_stream = stream.close
         self.filename = filename
-        self.stream = stream
-        self.generator = tokenize.generate_tokens(stream.readline)
+        self.generator = tokenize.tokenize(stream.read())
         self.gettoken()  # Initialize lookahead
         self.dfas, self.startsymbol = self.parse()
         if close_stream is not None:
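
The parser generator now consumes the same TokenInfo 5-tuples as the driver, produced by a single tokenize.tokenize(text) call rather than a readline-driven generate_tokens(). A rough sketch of that iteration, using only names that appear in this diff (the printed output is illustrative, not asserted):

    from blib2to3.pgen2 import token, tokenize

    # tokenize.tokenize() takes the full source text; the grammar argument is
    # optional (pgen omits it above, while the driver passes its own grammar).
    for tok_type, string, start, end, _line in tokenize.tokenize("x = 1\n"):
        print(token.tok_name[tok_type], repr(string), start, end)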

File diff suppressed because it is too large


@@ -232,8 +232,6 @@ file_input
 fstring
 FSTRING_START
 "f'"
-FSTRING_MIDDLE
-''
 fstring_replacement_field
 LBRACE
 '{'
@@ -242,8 +240,6 @@ file_input
 RBRACE
 '}'
 /fstring_replacement_field
-FSTRING_MIDDLE
-''
 fstring_replacement_field
 LBRACE
 '{'
@@ -252,8 +248,6 @@ file_input
 RBRACE
 '}'
 /fstring_replacement_field
-FSTRING_MIDDLE
-''
 FSTRING_END
 "'"
 /fstring
@@ -399,8 +393,6 @@ file_input
 fstring
 FSTRING_START
 "f'"
-FSTRING_MIDDLE
-''
 fstring_replacement_field
 LBRACE
 '{'
@@ -419,8 +411,6 @@ file_input
 RBRACE
 '}'
 /fstring_replacement_field
-FSTRING_MIDDLE
-''
 FSTRING_END
 "'"
 /fstring
@@ -549,8 +539,6 @@ file_input
 fstring
 FSTRING_START
 "f'"
-FSTRING_MIDDLE
-''
 fstring_replacement_field
 LBRACE
 '{'
@@ -559,8 +547,6 @@ file_input
 RBRACE
 '}'
 /fstring_replacement_field
-FSTRING_MIDDLE
-''
 fstring_replacement_field
 LBRACE
 '{'
@@ -569,8 +555,6 @@ file_input
 RBRACE
 '}'
 /fstring_replacement_field
-FSTRING_MIDDLE
-''
 FSTRING_END
 "'"
 /fstring
@@ -660,8 +644,6 @@ file_input
 RBRACE
 '}'
 /fstring_replacement_field
-FSTRING_MIDDLE
-''
 FSTRING_END
 "'"
 /fstring
@@ -744,8 +726,6 @@ file_input
 RBRACE
 '}'
 /fstring_replacement_field
-FSTRING_MIDDLE
-''
 FSTRING_END
 "'"
 /fstring
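
This debug-tree fixture shrinks because the rewritten tokenizer no longer emits empty FSTRING_MIDDLE tokens around f-string replacement fields, so no empty leaves end up in the parse tree. A quick way to inspect the remaining leaves, sketched with Black's parse helper (an illustration only; the fixture above is the authoritative expected output):

    import black

    tree = black.lib2to3_parse("x = f'{x}'\n", set())
    for leaf in tree.leaves():  # no empty FSTRING_MIDDLE leaves appear
        print(leaf.type, repr(leaf.value))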


@@ -463,17 +463,6 @@ def test_tab_comment_indentation(self) -> None:
         self.assertFormatEqual(contents_spc, fs(contents_spc))
         self.assertFormatEqual(contents_spc, fs(contents_tab))
-
-        # mixed tabs and spaces (valid Python 2 code)
-        contents_tab = "if 1:\n if 2:\n\t\tpass\n\t# comment\n pass\n"
-        contents_spc = "if 1:\n if 2:\n pass\n # comment\n pass\n"
-        self.assertFormatEqual(contents_spc, fs(contents_spc))
-        self.assertFormatEqual(contents_spc, fs(contents_tab))
-
-        contents_tab = "if 1:\n if 2:\n\t\tpass\n\t\t# comment\n pass\n"
-        contents_spc = "if 1:\n if 2:\n pass\n # comment\n pass\n"
-        self.assertFormatEqual(contents_spc, fs(contents_spc))
-        self.assertFormatEqual(contents_spc, fs(contents_tab))

     def test_false_positive_symlink_output_issue_3384(self) -> None:
         # Emulate the behavior when using the CLI (`black ./child --verbose`), which
         # involves patching some `pathlib.Path` methods. In particular, `is_dir` is
@@ -1980,7 +1969,7 @@ def test_for_handled_unexpected_eof_error(self) -> None:
         with pytest.raises(black.parsing.InvalidInput) as exc_info:
             black.lib2to3_parse("print(", {})
-        exc_info.match("Cannot parse: 2:0: EOF in multi-line statement")
+        exc_info.match("Cannot parse: 1:6: Unexpected EOF in multi-line statement")

     def test_line_ranges_with_code_option(self) -> None:
         code = textwrap.dedent("""\
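
The updated assertion reflects the new tokenizer's error reporting: the unterminated call is now flagged at 1:6 with an "Unexpected EOF" message rather than at 2:0. A small reproduction sketch mirroring the test above:

    import black
    import black.parsing

    try:
        black.lib2to3_parse("print(", {})
    except black.parsing.InvalidInput as exc:
        print(exc)  # e.g. "Cannot parse: 1:6: Unexpected EOF in multi-line statement"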


@@ -1,6 +1,5 @@
 """Tests for the blib2to3 tokenizer."""

-import io
 import sys
 import textwrap
 from dataclasses import dataclass
@@ -19,16 +18,10 @@ class Token:

 def get_tokens(text: str) -> list[Token]:
     """Return the tokens produced by the tokenizer."""
-    readline = io.StringIO(text).readline
-    tokens: list[Token] = []
-
-    def tokeneater(
-        type: int, string: str, start: tokenize.Coord, end: tokenize.Coord, line: str
-    ) -> None:
-        tokens.append(Token(token.tok_name[type], string, start, end))
-
-    tokenize.tokenize(readline, tokeneater)
-    return tokens
+    return [
+        Token(token.tok_name[tok_type], string, start, end)
+        for tok_type, string, start, end, _ in tokenize.tokenize(text)
+    ]


 def assert_tokenizes(text: str, tokens: list[Token]) -> None:
@@ -69,11 +62,9 @@ def test_fstring() -> None:
         'f"{x}"',
         [
             Token("FSTRING_START", 'f"', (1, 0), (1, 2)),
-            Token("FSTRING_MIDDLE", "", (1, 2), (1, 2)),
-            Token("LBRACE", "{", (1, 2), (1, 3)),
+            Token("OP", "{", (1, 2), (1, 3)),
             Token("NAME", "x", (1, 3), (1, 4)),
-            Token("RBRACE", "}", (1, 4), (1, 5)),
-            Token("FSTRING_MIDDLE", "", (1, 5), (1, 5)),
+            Token("OP", "}", (1, 4), (1, 5)),
             Token("FSTRING_END", '"', (1, 5), (1, 6)),
             Token("ENDMARKER", "", (2, 0), (2, 0)),
         ],
@@ -82,13 +73,11 @@ def test_fstring() -> None:
         'f"{x:y}"\n',
         [
             Token(type="FSTRING_START", string='f"', start=(1, 0), end=(1, 2)),
-            Token(type="FSTRING_MIDDLE", string="", start=(1, 2), end=(1, 2)),
-            Token(type="LBRACE", string="{", start=(1, 2), end=(1, 3)),
+            Token(type="OP", string="{", start=(1, 2), end=(1, 3)),
             Token(type="NAME", string="x", start=(1, 3), end=(1, 4)),
             Token(type="OP", string=":", start=(1, 4), end=(1, 5)),
             Token(type="FSTRING_MIDDLE", string="y", start=(1, 5), end=(1, 6)),
-            Token(type="RBRACE", string="}", start=(1, 6), end=(1, 7)),
-            Token(type="FSTRING_MIDDLE", string="", start=(1, 7), end=(1, 7)),
+            Token(type="OP", string="}", start=(1, 6), end=(1, 7)),
             Token(type="FSTRING_END", string='"', start=(1, 7), end=(1, 8)),
             Token(type="NEWLINE", string="\n", start=(1, 8), end=(1, 9)),
             Token(type="ENDMARKER", string="", start=(2, 0), end=(2, 0)),
@@ -99,10 +88,9 @@ def test_fstring() -> None:
         [
             Token(type="FSTRING_START", string='f"', start=(1, 0), end=(1, 2)),
             Token(type="FSTRING_MIDDLE", string="x\\\n", start=(1, 2), end=(2, 0)),
-            Token(type="LBRACE", string="{", start=(2, 0), end=(2, 1)),
+            Token(type="OP", string="{", start=(2, 0), end=(2, 1)),
             Token(type="NAME", string="a", start=(2, 1), end=(2, 2)),
-            Token(type="RBRACE", string="}", start=(2, 2), end=(2, 3)),
-            Token(type="FSTRING_MIDDLE", string="", start=(2, 3), end=(2, 3)),
+            Token(type="OP", string="}", start=(2, 2), end=(2, 3)),
             Token(type="FSTRING_END", string='"', start=(2, 3), end=(2, 4)),
             Token(type="NEWLINE", string="\n", start=(2, 4), end=(2, 5)),
             Token(type="ENDMARKER", string="", start=(3, 0), end=(3, 0)),