Replace the blib2to3 tokenizer with pytokens (#4536)

parent 9f38928414
commit 5342d2eeda
@@ -51,6 +51,7 @@ repos:
 - click >= 8.1.0, != 8.1.4, != 8.1.5
 - packaging >= 22.0
 - platformdirs >= 2.1.0
+- pytokens >= 0.1.10
 - pytest
 - hypothesis
 - aiohttp >= 3.7.4
@@ -29,6 +29,7 @@
 
 <!-- Changes to the parser or to version autodetection -->
 
+- Rewrite tokenizer to improve performance and compliance (#4536)
 - Fix bug where certain unusual expressions (e.g., lambdas) were not accepted
   in type parameter bounds and defaults. (#4602)
 
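The changelog entry above summarizes the whole commit: blib2to3's hand-written tokenizer is replaced by the pytokens library, exposed through a `tokenize.tokenize(text)` entry point that takes the full source string instead of a `readline` callable. A minimal sketch of the interface shift, inferred from the hunks below (the sample source string is illustrative):

    # Sketch of the old vs. new tokenizer entry points (sample input is illustrative).
    from blib2to3.pgen2 import token, tokenize

    src = "x = 1\n"

    # Before: a readline-based generator, mirroring stdlib tokenize.generate_tokens:
    #     tokens = tokenize.generate_tokens(io.StringIO(src).readline)
    # After: the pytokens-backed tokenizer consumes the whole source string and
    # yields 5-tuples of (type, string, start, end, line):
    for tok_type, string, start, end, _line in tokenize.tokenize(src):
        print(token.tok_name[tok_type], repr(string), start, end)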
@@ -69,6 +69,7 @@ dependencies = [
     "packaging>=22.0",
     "pathspec>=0.9.0",
     "platformdirs>=2",
+    "pytokens>=0.1.10",
     "tomli>=1.1.0; python_version < '3.11'",
     "typing_extensions>=4.0.1; python_version < '3.11'",
 ]
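pytokens becomes a hard runtime dependency, matching the pre-commit hook update above. A quick stdlib-only way to confirm the installed version satisfies the new constraint:

    # Sanity check for the new runtime dependency (stdlib only).
    from importlib.metadata import version

    print(version("pytokens"))  # the constraint above requires >= 0.1.10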
@@ -28,7 +28,7 @@
 from typing import IO, Any, Optional, Union, cast
 
 from blib2to3.pgen2.grammar import Grammar
-from blib2to3.pgen2.tokenize import GoodTokenInfo
+from blib2to3.pgen2.tokenize import TokenInfo
 from blib2to3.pytree import NL
 
 # Pgen imports
@@ -112,7 +112,7 @@ def __init__(self, grammar: Grammar, logger: Optional[Logger] = None) -> None:
             logger = logging.getLogger(__name__)
         self.logger = logger
 
-    def parse_tokens(self, tokens: Iterable[GoodTokenInfo], debug: bool = False) -> NL:
+    def parse_tokens(self, tokens: Iterable[TokenInfo], debug: bool = False) -> NL:
         """Parse a series of tokens and return the syntax tree."""
         # XXX Move the prefix computation into a wrapper around tokenize.
         proxy = TokenProxy(tokens)
@@ -180,27 +180,17 @@ def parse_tokens(self, tokens: Iterable[GoodTokenInfo], debug: bool = False) ->
         assert p.rootnode is not None
         return p.rootnode
 
-    def parse_stream_raw(self, stream: IO[str], debug: bool = False) -> NL:
-        """Parse a stream and return the syntax tree."""
-        tokens = tokenize.generate_tokens(stream.readline, grammar=self.grammar)
-        return self.parse_tokens(tokens, debug)
-
-    def parse_stream(self, stream: IO[str], debug: bool = False) -> NL:
-        """Parse a stream and return the syntax tree."""
-        return self.parse_stream_raw(stream, debug)
-
     def parse_file(
         self, filename: Path, encoding: Optional[str] = None, debug: bool = False
     ) -> NL:
         """Parse a file and return the syntax tree."""
         with open(filename, encoding=encoding) as stream:
-            return self.parse_stream(stream, debug)
+            text = stream.read()
+        return self.parse_string(text, debug)
 
     def parse_string(self, text: str, debug: bool = False) -> NL:
         """Parse a string and return the syntax tree."""
-        tokens = tokenize.generate_tokens(
-            io.StringIO(text).readline, grammar=self.grammar
-        )
+        tokens = tokenize.tokenize(text, grammar=self.grammar)
        return self.parse_tokens(tokens, debug)
 
     def _partially_consume_prefix(self, prefix: str, column: int) -> tuple[str, str]:
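With the readline-based `parse_stream_raw`/`parse_stream` helpers gone, every Driver entry point funnels into `parse_string`, which hands the whole text to the new tokenizer. A sketch of the behavior through Black's own wrapper, which the tests below also use:

    # Sketch: Black's wrapper is unchanged externally; internally the Driver
    # now tokenizes the whole string via pytokens and parses via parse_string().
    import black

    tree = black.lib2to3_parse("x = 1\n", {})
    print(tree)  # a blib2to3 syntax tree node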
@@ -6,7 +6,7 @@
 from typing import IO, Any, NoReturn, Optional, Union
 
 from blib2to3.pgen2 import grammar, token, tokenize
-from blib2to3.pgen2.tokenize import GoodTokenInfo
+from blib2to3.pgen2.tokenize import TokenInfo
 
 Path = Union[str, "os.PathLike[str]"]
 
@@ -18,7 +18,7 @@ class PgenGrammar(grammar.Grammar):
 class ParserGenerator:
     filename: Path
     stream: IO[str]
-    generator: Iterator[GoodTokenInfo]
+    generator: Iterator[TokenInfo]
     first: dict[str, Optional[dict[str, int]]]
 
     def __init__(self, filename: Path, stream: Optional[IO[str]] = None) -> None:
@@ -27,8 +27,7 @@ def __init__(self, filename: Path, stream: Optional[IO[str]] = None) -> None:
             stream = open(filename, encoding="utf-8")
             close_stream = stream.close
         self.filename = filename
         self.stream = stream
-        self.generator = tokenize.generate_tokens(stream.readline)
+        self.generator = tokenize.tokenize(stream.read())
         self.gettoken()  # Initialize lookahead
         self.dfas, self.startsymbol = self.parse()
         if close_stream is not None:
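The grammar bootstrap in pgen follows the same pattern as the driver: read the whole grammar file once, then tokenize it as a single string. A sketch of driving it directly; the file name and the `make_grammar()` call follow lib2to3's conventional pgen API and are assumptions here, not shown in this diff:

    # Sketch: pgen now tokenizes the grammar text in one pass.
    # "Grammar.txt" and make_grammar() are lib2to3 conventions (assumptions).
    from blib2to3.pgen2.pgen import ParserGenerator

    pg = ParserGenerator("Grammar.txt")  # __init__ tokenizes stream.read()
    grammar = pg.make_grammar()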
(File diff suppressed because it is too large)
@@ -232,8 +232,6 @@ file_input
 fstring
 FSTRING_START
 "f'"
-FSTRING_MIDDLE
-''
 fstring_replacement_field
 LBRACE
 '{'
@@ -242,8 +240,6 @@ file_input
 RBRACE
 '}'
 /fstring_replacement_field
-FSTRING_MIDDLE
-''
 fstring_replacement_field
 LBRACE
 '{'
@@ -252,8 +248,6 @@ file_input
 RBRACE
 '}'
 /fstring_replacement_field
-FSTRING_MIDDLE
-''
 FSTRING_END
 "'"
 /fstring
@@ -399,8 +393,6 @@ file_input
 fstring
 FSTRING_START
 "f'"
-FSTRING_MIDDLE
-''
 fstring_replacement_field
 LBRACE
 '{'
@@ -419,8 +411,6 @@ file_input
 RBRACE
 '}'
 /fstring_replacement_field
-FSTRING_MIDDLE
-''
 FSTRING_END
 "'"
 /fstring
@@ -549,8 +539,6 @@ file_input
 fstring
 FSTRING_START
 "f'"
-FSTRING_MIDDLE
-''
 fstring_replacement_field
 LBRACE
 '{'
@@ -559,8 +547,6 @@ file_input
 RBRACE
 '}'
 /fstring_replacement_field
-FSTRING_MIDDLE
-''
 fstring_replacement_field
 LBRACE
 '{'
@@ -569,8 +555,6 @@ file_input
 RBRACE
 '}'
 /fstring_replacement_field
-FSTRING_MIDDLE
-''
 FSTRING_END
 "'"
 /fstring
@@ -660,8 +644,6 @@ file_input
 RBRACE
 '}'
 /fstring_replacement_field
-FSTRING_MIDDLE
-''
 FSTRING_END
 "'"
 /fstring
@@ -744,8 +726,6 @@ file_input
 RBRACE
 '}'
 /fstring_replacement_field
-FSTRING_MIDDLE
-''
 FSTRING_END
 "'"
 /fstring
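These expected-tree updates all remove the same thing: zero-width FSTRING_MIDDLE tokens that the old tokenizer emitted between an f-string's delimiters and its replacement fields. A sketch of the observable difference, with the expected token names inferred from the updated tests below:

    # Sketch: the pytokens-backed tokenizer no longer emits empty FSTRING_MIDDLE
    # tokens around replacement fields.
    from blib2to3.pgen2 import token, tokenize

    for tok_type, string, _start, _end, _line in tokenize.tokenize('f"{x}"\n'):
        print(token.tok_name[tok_type], repr(string))
    # FSTRING_START 'f"', OP '{', NAME 'x', OP '}', FSTRING_END '"',
    # NEWLINE '\n', ENDMARKER '' -- no empty middles.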
@@ -463,17 +463,6 @@ def test_tab_comment_indentation(self) -> None:
         self.assertFormatEqual(contents_spc, fs(contents_spc))
         self.assertFormatEqual(contents_spc, fs(contents_tab))
 
-        # mixed tabs and spaces (valid Python 2 code)
-        contents_tab = "if 1:\n if 2:\n\t\tpass\n\t# comment\n pass\n"
-        contents_spc = "if 1:\n if 2:\n pass\n # comment\n pass\n"
-        self.assertFormatEqual(contents_spc, fs(contents_spc))
-        self.assertFormatEqual(contents_spc, fs(contents_tab))
-
-        contents_tab = "if 1:\n if 2:\n\t\tpass\n\t\t# comment\n pass\n"
-        contents_spc = "if 1:\n if 2:\n pass\n # comment\n pass\n"
-        self.assertFormatEqual(contents_spc, fs(contents_spc))
-        self.assertFormatEqual(contents_spc, fs(contents_tab))
-
     def test_false_positive_symlink_output_issue_3384(self) -> None:
         # Emulate the behavior when using the CLI (`black ./child --verbose`), which
         # involves patching some `pathlib.Path` methods. In particular, `is_dir` is
@@ -1980,7 +1969,7 @@ def test_for_handled_unexpected_eof_error(self) -> None:
         with pytest.raises(black.parsing.InvalidInput) as exc_info:
             black.lib2to3_parse("print(", {})
 
-        exc_info.match("Cannot parse: 2:0: EOF in multi-line statement")
+        exc_info.match("Cannot parse: 1:6: Unexpected EOF in multi-line statement")
 
     def test_line_ranges_with_code_option(self) -> None:
         code = textwrap.dedent("""\
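The EOF diagnostic now points at the exact position of the unterminated construct (line 1, column 6, right after the open parenthesis) instead of the start of the following line. A quick reproduction sketch based on the updated test:

    # Reproduction sketch for the improved EOF diagnostic.
    import black

    try:
        black.lib2to3_parse("print(", {})
    except black.parsing.InvalidInput as exc:
        print(exc)  # Cannot parse: 1:6: Unexpected EOF in multi-line statement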
@@ -1,6 +1,5 @@
 """Tests for the blib2to3 tokenizer."""
 
-import io
 import sys
 import textwrap
 from dataclasses import dataclass
@@ -19,16 +18,10 @@ class Token:
 
 def get_tokens(text: str) -> list[Token]:
     """Return the tokens produced by the tokenizer."""
-    readline = io.StringIO(text).readline
-    tokens: list[Token] = []
-
-    def tokeneater(
-        type: int, string: str, start: tokenize.Coord, end: tokenize.Coord, line: str
-    ) -> None:
-        tokens.append(Token(token.tok_name[type], string, start, end))
-
-    tokenize.tokenize(readline, tokeneater)
-    return tokens
+    return [
+        Token(token.tok_name[tok_type], string, start, end)
+        for tok_type, string, start, end, _ in tokenize.tokenize(text)
+    ]
 
 
 def assert_tokenizes(text: str, tokens: list[Token]) -> None:
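The helper shrinks to a list comprehension because `tokenize.tokenize` is now a plain iterable of 5-tuples rather than a callback-driven tokeneater API. Illustrative use of the rewritten helper; the exact token sequence is an assumption based on the assertions below:

    # Illustrative call to the rewritten helper; output uses the Token
    # dataclass defined in this test module.
    for tok in get_tokens("x = 1\n"):
        print(tok)
    # e.g. Token("NAME", "x", (1, 0), (1, 1)), then OP "=", NUMBER "1",
    # NEWLINE, ENDMARKER.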
@@ -69,11 +62,9 @@ def test_fstring() -> None:
         'f"{x}"',
         [
             Token("FSTRING_START", 'f"', (1, 0), (1, 2)),
-            Token("FSTRING_MIDDLE", "", (1, 2), (1, 2)),
-            Token("LBRACE", "{", (1, 2), (1, 3)),
+            Token("OP", "{", (1, 2), (1, 3)),
             Token("NAME", "x", (1, 3), (1, 4)),
-            Token("RBRACE", "}", (1, 4), (1, 5)),
-            Token("FSTRING_MIDDLE", "", (1, 5), (1, 5)),
+            Token("OP", "}", (1, 4), (1, 5)),
             Token("FSTRING_END", '"', (1, 5), (1, 6)),
             Token("ENDMARKER", "", (2, 0), (2, 0)),
         ],
@@ -82,13 +73,11 @@ def test_fstring() -> None:
         'f"{x:y}"\n',
         [
             Token(type="FSTRING_START", string='f"', start=(1, 0), end=(1, 2)),
-            Token(type="FSTRING_MIDDLE", string="", start=(1, 2), end=(1, 2)),
-            Token(type="LBRACE", string="{", start=(1, 2), end=(1, 3)),
+            Token(type="OP", string="{", start=(1, 2), end=(1, 3)),
             Token(type="NAME", string="x", start=(1, 3), end=(1, 4)),
             Token(type="OP", string=":", start=(1, 4), end=(1, 5)),
             Token(type="FSTRING_MIDDLE", string="y", start=(1, 5), end=(1, 6)),
-            Token(type="RBRACE", string="}", start=(1, 6), end=(1, 7)),
-            Token(type="FSTRING_MIDDLE", string="", start=(1, 7), end=(1, 7)),
+            Token(type="OP", string="}", start=(1, 6), end=(1, 7)),
             Token(type="FSTRING_END", string='"', start=(1, 7), end=(1, 8)),
             Token(type="NEWLINE", string="\n", start=(1, 8), end=(1, 9)),
             Token(type="ENDMARKER", string="", start=(2, 0), end=(2, 0)),
@@ -99,10 +88,9 @@ def test_fstring() -> None:
         [
             Token(type="FSTRING_START", string='f"', start=(1, 0), end=(1, 2)),
             Token(type="FSTRING_MIDDLE", string="x\\\n", start=(1, 2), end=(2, 0)),
-            Token(type="LBRACE", string="{", start=(2, 0), end=(2, 1)),
+            Token(type="OP", string="{", start=(2, 0), end=(2, 1)),
             Token(type="NAME", string="a", start=(2, 1), end=(2, 2)),
-            Token(type="RBRACE", string="}", start=(2, 2), end=(2, 3)),
-            Token(type="FSTRING_MIDDLE", string="", start=(2, 3), end=(2, 3)),
+            Token(type="OP", string="}", start=(2, 2), end=(2, 3)),
             Token(type="FSTRING_END", string='"', start=(2, 3), end=(2, 4)),
             Token(type="NEWLINE", string="\n", start=(2, 4), end=(2, 5)),
             Token(type="ENDMARKER", string="", start=(3, 0), end=(3, 0)),