Replace the blib2to3 tokenizer with pytokens (#4536)

Authored by Tushar Sadhwani on 2025-03-16 06:11:19 +05:30; committed by GitHub
parent 9f38928414
commit 5342d2eeda
9 changed files with 204 additions and 1109 deletions


@@ -51,6 +51,7 @@ repos:
- click >= 8.1.0, != 8.1.4, != 8.1.5
- packaging >= 22.0
- platformdirs >= 2.1.0
- pytokens >= 0.1.10
- pytest
- hypothesis
- aiohttp >= 3.7.4


@@ -29,6 +29,7 @@
<!-- Changes to the parser or to version autodetection -->
- Rewrite tokenizer to improve performance and compliance (#4536)
- Fix bug where certain unusual expressions (e.g., lambdas) were not accepted
in type parameter bounds and defaults. (#4602)


@@ -69,6 +69,7 @@ dependencies = [
"packaging>=22.0",
"pathspec>=0.9.0",
"platformdirs>=2",
"pytokens>=0.1.10",
"tomli>=1.1.0; python_version < '3.11'",
"typing_extensions>=4.0.1; python_version < '3.11'",
]
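
The runtime requirement added here matches the pin in the pre-commit mypy hook above. As a purely hypothetical sanity check (not part of this commit), the installed version can be inspected with importlib.metadata:

# Hypothetical check, not part of this diff: confirm the installed pytokens
# satisfies the lower bound pinned above.
from importlib.metadata import version

print(version("pytokens"))  # expected to be >= 0.1.10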


@@ -28,7 +28,7 @@
from typing import IO, Any, Optional, Union, cast
from blib2to3.pgen2.grammar import Grammar
from blib2to3.pgen2.tokenize import GoodTokenInfo
from blib2to3.pgen2.tokenize import TokenInfo
from blib2to3.pytree import NL
# Pgen imports
@@ -112,7 +112,7 @@ def __init__(self, grammar: Grammar, logger: Optional[Logger] = None) -> None:
logger = logging.getLogger(__name__)
self.logger = logger
def parse_tokens(self, tokens: Iterable[GoodTokenInfo], debug: bool = False) -> NL:
def parse_tokens(self, tokens: Iterable[TokenInfo], debug: bool = False) -> NL:
"""Parse a series of tokens and return the syntax tree."""
# XXX Move the prefix computation into a wrapper around tokenize.
proxy = TokenProxy(tokens)
@@ -180,27 +180,17 @@ def parse_tokens(self, tokens: Iterable[GoodTokenInfo], debug: bool = False) ->
assert p.rootnode is not None
return p.rootnode
def parse_stream_raw(self, stream: IO[str], debug: bool = False) -> NL:
"""Parse a stream and return the syntax tree."""
tokens = tokenize.generate_tokens(stream.readline, grammar=self.grammar)
return self.parse_tokens(tokens, debug)
def parse_stream(self, stream: IO[str], debug: bool = False) -> NL:
"""Parse a stream and return the syntax tree."""
return self.parse_stream_raw(stream, debug)
def parse_file(
self, filename: Path, encoding: Optional[str] = None, debug: bool = False
) -> NL:
"""Parse a file and return the syntax tree."""
with open(filename, encoding=encoding) as stream:
return self.parse_stream(stream, debug)
text = stream.read()
return self.parse_string(text, debug)
def parse_string(self, text: str, debug: bool = False) -> NL:
"""Parse a string and return the syntax tree."""
tokens = tokenize.generate_tokens(
io.StringIO(text).readline, grammar=self.grammar
)
tokens = tokenize.tokenize(text, grammar=self.grammar)
return self.parse_tokens(tokens, debug)
def _partially_consume_prefix(self, prefix: str, column: int) -> tuple[str, str]:
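
The Driver keeps its public entry points; only how tokens are produced changes. A minimal sketch of the new path from Black's side, assuming the usual call chain in which black.lib2to3_parse constructs a Driver and calls parse_string:

# Sketch: parse_string now tokenizes the whole source in one call via
# tokenize.tokenize(text, grammar=...) instead of feeding a readline callback
# to the removed generate_tokens/parse_stream path.
import black

tree = black.lib2to3_parse("x = 1\n")
print(type(tree))  # a blib2to3 pytree node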


@@ -6,7 +6,7 @@
from typing import IO, Any, NoReturn, Optional, Union
from blib2to3.pgen2 import grammar, token, tokenize
from blib2to3.pgen2.tokenize import GoodTokenInfo
from blib2to3.pgen2.tokenize import TokenInfo
Path = Union[str, "os.PathLike[str]"]
@@ -18,7 +18,7 @@ class PgenGrammar(grammar.Grammar):
class ParserGenerator:
filename: Path
stream: IO[str]
generator: Iterator[GoodTokenInfo]
generator: Iterator[TokenInfo]
first: dict[str, Optional[dict[str, int]]]
def __init__(self, filename: Path, stream: Optional[IO[str]] = None) -> None:
@@ -27,8 +27,7 @@ def __init__(self, filename: Path, stream: Optional[IO[str]] = None) -> None:
stream = open(filename, encoding="utf-8")
close_stream = stream.close
self.filename = filename
self.stream = stream
self.generator = tokenize.generate_tokens(stream.readline)
self.generator = tokenize.tokenize(stream.read())
self.gettoken() # Initialize lookahead
self.dfas, self.startsymbol = self.parse()
if close_stream is not None:

File diff suppressed because it is too large.
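
The rewritten blib2to3.pgen2.tokenize module (now backed by pytokens) is too large to show here, but its public surface, as exercised by the driver above and the tests below, is a single full-text entry point. A minimal sketch based on that usage:

# tokenize.tokenize takes the complete source string (no readline callback) and
# yields TokenInfo 5-tuples: (type, string, start (row, col), end (row, col), line).
from blib2to3.pgen2 import token, tokenize

for tok_type, string, start, end, _line in tokenize.tokenize("x = 1\n"):
    print(token.tok_name[tok_type], repr(string), start, end)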


@@ -232,8 +232,6 @@ file_input
fstring
FSTRING_START
"f'"
FSTRING_MIDDLE
''
fstring_replacement_field
LBRACE
'{'
@@ -242,8 +240,6 @@ file_input
RBRACE
'}'
/fstring_replacement_field
FSTRING_MIDDLE
''
fstring_replacement_field
LBRACE
'{'
@@ -252,8 +248,6 @@ file_input
RBRACE
'}'
/fstring_replacement_field
FSTRING_MIDDLE
''
FSTRING_END
"'"
/fstring
@@ -399,8 +393,6 @@ file_input
fstring
FSTRING_START
"f'"
FSTRING_MIDDLE
''
fstring_replacement_field
LBRACE
'{'
@@ -419,8 +411,6 @@ file_input
RBRACE
'}'
/fstring_replacement_field
FSTRING_MIDDLE
''
FSTRING_END
"'"
/fstring
@@ -549,8 +539,6 @@ file_input
fstring
FSTRING_START
"f'"
FSTRING_MIDDLE
''
fstring_replacement_field
LBRACE
'{'
@@ -559,8 +547,6 @@ file_input
RBRACE
'}'
/fstring_replacement_field
FSTRING_MIDDLE
''
fstring_replacement_field
LBRACE
'{'
@@ -569,8 +555,6 @@ file_input
RBRACE
'}'
/fstring_replacement_field
FSTRING_MIDDLE
''
FSTRING_END
"'"
/fstring
@@ -660,8 +644,6 @@ file_input
RBRACE
'}'
/fstring_replacement_field
FSTRING_MIDDLE
''
FSTRING_END
"'"
/fstring
@@ -744,8 +726,6 @@ file_input
RBRACE
'}'
/fstring_replacement_field
FSTRING_MIDDLE
''
FSTRING_END
"'"
/fstring
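
These debug-tree expectations shrink because the new tokenizer no longer emits empty FSTRING_MIDDLE tokens around replacement fields, and the braces arrive as plain OP tokens (see the updated tokenizer tests below). A small check illustrating the new stream, using the same input as those tests:

# The token stream for f"{x}" contains no empty FSTRING_MIDDLE entries, and the
# '{' / '}' of the replacement field are reported as OP rather than LBRACE/RBRACE.
from blib2to3.pgen2 import token, tokenize

names = [token.tok_name[tok_type] for tok_type, *_ in tokenize.tokenize('f"{x}"')]
assert names == ["FSTRING_START", "OP", "NAME", "OP", "FSTRING_END", "ENDMARKER"]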


@@ -463,17 +463,6 @@ def test_tab_comment_indentation(self) -> None:
self.assertFormatEqual(contents_spc, fs(contents_spc))
self.assertFormatEqual(contents_spc, fs(contents_tab))
# mixed tabs and spaces (valid Python 2 code)
contents_tab = "if 1:\n if 2:\n\t\tpass\n\t# comment\n pass\n"
contents_spc = "if 1:\n if 2:\n pass\n # comment\n pass\n"
self.assertFormatEqual(contents_spc, fs(contents_spc))
self.assertFormatEqual(contents_spc, fs(contents_tab))
contents_tab = "if 1:\n if 2:\n\t\tpass\n\t\t# comment\n pass\n"
contents_spc = "if 1:\n if 2:\n pass\n # comment\n pass\n"
self.assertFormatEqual(contents_spc, fs(contents_spc))
self.assertFormatEqual(contents_spc, fs(contents_tab))
def test_false_positive_symlink_output_issue_3384(self) -> None:
# Emulate the behavior when using the CLI (`black ./child --verbose`), which
# involves patching some `pathlib.Path` methods. In particular, `is_dir` is
@@ -1980,7 +1969,7 @@ def test_for_handled_unexpected_eof_error(self) -> None:
with pytest.raises(black.parsing.InvalidInput) as exc_info:
black.lib2to3_parse("print(", {})
exc_info.match("Cannot parse: 2:0: EOF in multi-line statement")
exc_info.match("Cannot parse: 1:6: Unexpected EOF in multi-line statement")
def test_line_ranges_with_code_option(self) -> None:
code = textwrap.dedent("""\


@@ -1,6 +1,5 @@
"""Tests for the blib2to3 tokenizer."""
import io
import sys
import textwrap
from dataclasses import dataclass
@@ -19,16 +18,10 @@ class Token:
def get_tokens(text: str) -> list[Token]:
"""Return the tokens produced by the tokenizer."""
readline = io.StringIO(text).readline
tokens: list[Token] = []
def tokeneater(
type: int, string: str, start: tokenize.Coord, end: tokenize.Coord, line: str
) -> None:
tokens.append(Token(token.tok_name[type], string, start, end))
tokenize.tokenize(readline, tokeneater)
return tokens
return [
Token(token.tok_name[tok_type], string, start, end)
for tok_type, string, start, end, _ in tokenize.tokenize(text)
]
def assert_tokenizes(text: str, tokens: list[Token]) -> None:
@@ -69,11 +62,9 @@ def test_fstring() -> None:
'f"{x}"',
[
Token("FSTRING_START", 'f"', (1, 0), (1, 2)),
Token("FSTRING_MIDDLE", "", (1, 2), (1, 2)),
Token("LBRACE", "{", (1, 2), (1, 3)),
Token("OP", "{", (1, 2), (1, 3)),
Token("NAME", "x", (1, 3), (1, 4)),
Token("RBRACE", "}", (1, 4), (1, 5)),
Token("FSTRING_MIDDLE", "", (1, 5), (1, 5)),
Token("OP", "}", (1, 4), (1, 5)),
Token("FSTRING_END", '"', (1, 5), (1, 6)),
Token("ENDMARKER", "", (2, 0), (2, 0)),
],
@@ -82,13 +73,11 @@ def test_fstring() -> None:
'f"{x:y}"\n',
[
Token(type="FSTRING_START", string='f"', start=(1, 0), end=(1, 2)),
Token(type="FSTRING_MIDDLE", string="", start=(1, 2), end=(1, 2)),
Token(type="LBRACE", string="{", start=(1, 2), end=(1, 3)),
Token(type="OP", string="{", start=(1, 2), end=(1, 3)),
Token(type="NAME", string="x", start=(1, 3), end=(1, 4)),
Token(type="OP", string=":", start=(1, 4), end=(1, 5)),
Token(type="FSTRING_MIDDLE", string="y", start=(1, 5), end=(1, 6)),
Token(type="RBRACE", string="}", start=(1, 6), end=(1, 7)),
Token(type="FSTRING_MIDDLE", string="", start=(1, 7), end=(1, 7)),
Token(type="OP", string="}", start=(1, 6), end=(1, 7)),
Token(type="FSTRING_END", string='"', start=(1, 7), end=(1, 8)),
Token(type="NEWLINE", string="\n", start=(1, 8), end=(1, 9)),
Token(type="ENDMARKER", string="", start=(2, 0), end=(2, 0)),
@@ -99,10 +88,9 @@ def test_fstring() -> None:
[
Token(type="FSTRING_START", string='f"', start=(1, 0), end=(1, 2)),
Token(type="FSTRING_MIDDLE", string="x\\\n", start=(1, 2), end=(2, 0)),
Token(type="LBRACE", string="{", start=(2, 0), end=(2, 1)),
Token(type="OP", string="{", start=(2, 0), end=(2, 1)),
Token(type="NAME", string="a", start=(2, 1), end=(2, 2)),
Token(type="RBRACE", string="}", start=(2, 2), end=(2, 3)),
Token(type="FSTRING_MIDDLE", string="", start=(2, 3), end=(2, 3)),
Token(type="OP", string="}", start=(2, 2), end=(2, 3)),
Token(type="FSTRING_END", string='"', start=(2, 3), end=(2, 4)),
Token(type="NEWLINE", string="\n", start=(2, 4), end=(2, 5)),
Token(type="ENDMARKER", string="", start=(3, 0), end=(3, 0)),