Move tokenizer config onto grammar, rename flag

Based on the feedback in
https://github.com/python/black/pull/845#issuecomment-490622711

- Remove TokenizerConfig and add an `async_keywords` field to Grammar instead.
- Pass the Grammar to the tokenizer.
- Rename `ASYNC_IS_RESERVED_KEYWORD` to `ASYNC_KEYWORDS` and
  `ASYNC_IS_VALID_IDENTIFIER` to `ASYNC_IDENTIFIERS`.
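
A rough sketch of the resulting API, for illustration only (the snippet is not
part of this change; it assumes pygram.initialize() may be called without a
cache dir to build the module-level grammars):

    from blib2to3 import pygram, pytree
    from blib2to3.pgen2 import driver

    pygram.initialize()
    # The async-keywords behaviour now lives on the Grammar object itself.
    grammar = pygram.python_grammar_no_print_statement_no_exec_statement_async_keywords
    assert grammar.async_keywords  # Python 3.7+: treat `async`/`await` as keywords

    # Driver no longer takes a tokenizer_config; it hands its grammar to the tokenizer.
    drv = driver.Driver(grammar, convert=pytree.convert)
    tree = drv.parse_string("async def f():\n    await g()\n", debug=False)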
Authored by Benjamin Woodruff on 2019-05-08 16:38:38 -04:00; committed by Łukasz Langa
parent f8617f975d
commit 448885b256
10 changed files with 59 additions and 80 deletions

View File

@@ -48,7 +48,6 @@
 from blib2to3.pgen2 import driver, token
 from blib2to3.pgen2.grammar import Grammar
 from blib2to3.pgen2.parse import ParseError
-from blib2to3.pgen2.tokenize import TokenizerConfig

 __version__ = "19.3b0"
@@ -139,18 +138,18 @@ class Feature(Enum):
     TRAILING_COMMA_IN_DEF = 5
     # The following two feature-flags are mutually exclusive, and exactly one should be
     # set for every version of python.
-    ASYNC_IS_VALID_IDENTIFIER = 6
-    ASYNC_IS_RESERVED_KEYWORD = 7
+    ASYNC_IDENTIFIERS = 6
+    ASYNC_KEYWORDS = 7

 VERSION_TO_FEATURES: Dict[TargetVersion, Set[Feature]] = {
-    TargetVersion.PY27: {Feature.ASYNC_IS_VALID_IDENTIFIER},
-    TargetVersion.PY33: {Feature.UNICODE_LITERALS, Feature.ASYNC_IS_VALID_IDENTIFIER},
-    TargetVersion.PY34: {Feature.UNICODE_LITERALS, Feature.ASYNC_IS_VALID_IDENTIFIER},
+    TargetVersion.PY27: {Feature.ASYNC_IDENTIFIERS},
+    TargetVersion.PY33: {Feature.UNICODE_LITERALS, Feature.ASYNC_IDENTIFIERS},
+    TargetVersion.PY34: {Feature.UNICODE_LITERALS, Feature.ASYNC_IDENTIFIERS},
     TargetVersion.PY35: {
         Feature.UNICODE_LITERALS,
         Feature.TRAILING_COMMA_IN_CALL,
-        Feature.ASYNC_IS_VALID_IDENTIFIER,
+        Feature.ASYNC_IDENTIFIERS,
     },
     TargetVersion.PY36: {
         Feature.UNICODE_LITERALS,
@@ -158,7 +157,7 @@ class Feature(Enum):
         Feature.NUMERIC_UNDERSCORES,
         Feature.TRAILING_COMMA_IN_CALL,
         Feature.TRAILING_COMMA_IN_DEF,
-        Feature.ASYNC_IS_VALID_IDENTIFIER,
+        Feature.ASYNC_IDENTIFIERS,
     },
     TargetVersion.PY37: {
         Feature.UNICODE_LITERALS,
@@ -166,7 +165,7 @@ class Feature(Enum):
         Feature.NUMERIC_UNDERSCORES,
         Feature.TRAILING_COMMA_IN_CALL,
         Feature.TRAILING_COMMA_IN_DEF,
-        Feature.ASYNC_IS_RESERVED_KEYWORD,
+        Feature.ASYNC_KEYWORDS,
     },
     TargetVersion.PY38: {
         Feature.UNICODE_LITERALS,
@@ -174,7 +173,7 @@ class Feature(Enum):
         Feature.NUMERIC_UNDERSCORES,
         Feature.TRAILING_COMMA_IN_CALL,
         Feature.TRAILING_COMMA_IN_DEF,
-        Feature.ASYNC_IS_RESERVED_KEYWORD,
+        Feature.ASYNC_KEYWORDS,
     },
 }
@@ -760,62 +759,42 @@ def decode_bytes(src: bytes) -> Tuple[FileContent, Encoding, NewLine]:
     return tiow.read(), encoding, newline

-@dataclass(frozen=True)
-class ParserConfig:
-    grammar: Grammar
-    tokenizer_config: TokenizerConfig = TokenizerConfig()
-
-
-def get_parser_configs(target_versions: Set[TargetVersion]) -> List[ParserConfig]:
+def get_grammars(target_versions: Set[TargetVersion]) -> List[Grammar]:
     if not target_versions:
         # No target_version specified, so try all grammars.
         return [
             # Python 3.7+
-            ParserConfig(
-                pygram.python_grammar_no_print_statement_no_exec_statement,
-                TokenizerConfig(async_is_reserved_keyword=True),
-            ),
+            pygram.python_grammar_no_print_statement_no_exec_statement_async_keywords,
             # Python 3.0-3.6
-            ParserConfig(
-                pygram.python_grammar_no_print_statement_no_exec_statement,
-                TokenizerConfig(async_is_reserved_keyword=False),
-            ),
+            pygram.python_grammar_no_print_statement_no_exec_statement,
             # Python 2.7 with future print_function import
-            ParserConfig(pygram.python_grammar_no_print_statement),
+            pygram.python_grammar_no_print_statement,
             # Python 2.7
-            ParserConfig(pygram.python_grammar),
+            pygram.python_grammar,
         ]
     elif all(version.is_python2() for version in target_versions):
         # Python 2-only code, so try Python 2 grammars.
         return [
             # Python 2.7 with future print_function import
-            ParserConfig(pygram.python_grammar_no_print_statement),
+            pygram.python_grammar_no_print_statement,
             # Python 2.7
-            ParserConfig(pygram.python_grammar),
+            pygram.python_grammar,
         ]
     else:
         # Python 3-compatible code, so only try Python 3 grammar.
-        configs = []
+        grammars = []
         # If we have to parse both, try to parse async as a keyword first
-        if not supports_feature(target_versions, Feature.ASYNC_IS_VALID_IDENTIFIER):
+        if not supports_feature(target_versions, Feature.ASYNC_IDENTIFIERS):
             # Python 3.7+
-            configs.append(
-                ParserConfig(
-                    pygram.python_grammar_no_print_statement_no_exec_statement,
-                    TokenizerConfig(async_is_reserved_keyword=True),
-                )
-            )
+            grammars.append(
+                pygram.python_grammar_no_print_statement_no_exec_statement_async_keywords  # noqa: B950
+            )
-        if not supports_feature(target_versions, Feature.ASYNC_IS_RESERVED_KEYWORD):
+        if not supports_feature(target_versions, Feature.ASYNC_KEYWORDS):
             # Python 3.0-3.6
-            configs.append(
-                ParserConfig(
-                    pygram.python_grammar_no_print_statement_no_exec_statement,
-                    TokenizerConfig(async_is_reserved_keyword=False),
-                )
-            )
+            grammars.append(pygram.python_grammar_no_print_statement_no_exec_statement)
         # At least one of the above branches must have been taken, because every Python
-        # version has exactly one of the two 'ASYNC_IS_*' flags
-        return configs
+        # version has exactly one of the two 'ASYNC_*' flags
+        return grammars

 def lib2to3_parse(src_txt: str, target_versions: Iterable[TargetVersion] = ()) -> Node:
@@ -823,12 +802,8 @@ def lib2to3_parse(src_txt: str, target_versions: Iterable[TargetVersion] = ()) -> Node:
     if src_txt[-1:] != "\n":
         src_txt += "\n"
-    for parser_config in get_parser_configs(set(target_versions)):
-        drv = driver.Driver(
-            parser_config.grammar,
-            pytree.convert,
-            tokenizer_config=parser_config.tokenizer_config,
-        )
+    for grammar in get_grammars(set(target_versions)):
+        drv = driver.Driver(grammar, pytree.convert)
         try:
             result = drv.parse_string(src_txt, True)
             break

View File

@@ -34,14 +34,12 @@ def __init__(
         grammar,
         convert=None,
         logger=None,
-        tokenizer_config=tokenize.TokenizerConfig(),
     ):
         self.grammar = grammar
         if logger is None:
             logger = logging.getLogger(__name__)
         self.logger = logger
         self.convert = convert
-        self.tokenizer_config = tokenizer_config

     def parse_tokens(self, tokens, debug=False):
         """Parse a series of tokens and return the syntax tree."""
@@ -104,7 +102,7 @@ def parse_tokens(self, tokens, debug=False):
     def parse_stream_raw(self, stream, debug=False):
         """Parse a stream and return the syntax tree."""
-        tokens = tokenize.generate_tokens(stream.readline, config=self.tokenizer_config)
+        tokens = tokenize.generate_tokens(stream.readline, grammar=self.grammar)
         return self.parse_tokens(tokens, debug)

     def parse_stream(self, stream, debug=False):
@@ -120,7 +118,7 @@ def parse_string(self, text, debug=False):
         """Parse a string and return the syntax tree."""
         tokens = tokenize.generate_tokens(
             io.StringIO(text).readline,
-            config=self.tokenizer_config,
+            grammar=self.grammar
         )
         return self.parse_tokens(tokens, debug)

View File

@@ -8,20 +8,13 @@ from logging import Logger
 from blib2to3.pytree import _Convert, _NL
 from blib2to3.pgen2 import _Path
 from blib2to3.pgen2.grammar import Grammar
-from blib2to3.pgen2.tokenize import TokenizerConfig

 class Driver:
     grammar: Grammar
     logger: Logger
     convert: _Convert
-    def __init__(
-        self,
-        grammar: Grammar,
-        convert: Optional[_Convert] = ...,
-        logger: Optional[Logger] = ...,
-        tokenizer_config: TokenizerConfig = ...
-    ) -> None: ...
+    def __init__(self, grammar: Grammar, convert: Optional[_Convert] = ..., logger: Optional[Logger] = ...) -> None: ...
     def parse_tokens(self, tokens: Iterable[Any], debug: bool = ...) -> _NL: ...
     def parse_stream_raw(self, stream: IO[Text], debug: bool = ...) -> _NL: ...
     def parse_stream(self, stream: IO[Text], debug: bool = ...) -> _NL: ...

View File

@@ -85,6 +85,8 @@ def __init__(self):
         self.tokens = {}
         self.symbol2label = {}
         self.start = 256
+        # Python 3.7+ parses async as a keyword, not an identifier
+        self.async_keywords = False

     def dump(self, filename):
         """Dump the grammar tables to a pickle file."""
@@ -113,6 +115,7 @@ def copy(self):
         new.labels = self.labels[:]
         new.states = self.states[:]
         new.start = self.start
+        new.async_keywords = self.async_keywords
         return new

     def report(self):

View File

@@ -19,6 +19,7 @@ class Grammar:
     tokens: Dict[int, int]
     symbol2label: Dict[Text, int]
     start: int
+    async_keywords: bool
     def __init__(self) -> None: ...
     def dump(self, filename: _Path) -> None: ...
     def load(self, filename: _Path) -> None: ...

View File

@@ -31,7 +31,6 @@
 import re
 from codecs import BOM_UTF8, lookup
-from attr import dataclass

 from blib2to3.pgen2.token import *

 from . import token
@@ -138,10 +137,6 @@ def _combinations(*l):
 tabsize = 8

-@dataclass(frozen=True)
-class TokenizerConfig:
-    async_is_reserved_keyword: bool = False
-
 class TokenError(Exception): pass

 class StopTokenizing(Exception): pass
@@ -339,7 +334,7 @@ def untokenize(iterable):
     ut = Untokenizer()
     return ut.untokenize(iterable)

-def generate_tokens(readline, config: TokenizerConfig = TokenizerConfig()):
+def generate_tokens(readline, grammar=None):
     """
     The generate_tokens() generator requires one argument, readline, which
     must be a callable object which provides the same interface as the
@@ -363,7 +358,7 @@ def generate_tokens(readline, config: TokenizerConfig = TokenizerConfig()):
     # If we know we're parsing 3.7+, we can unconditionally parse `async` and
     # `await` as keywords.
-    async_is_reserved_keyword = config.async_is_reserved_keyword
+    async_keywords = False if grammar is None else grammar.async_keywords
     # 'stashed' and 'async_*' are used for async/await parsing
     stashed = None
     async_def = False
@@ -514,7 +509,7 @@ def generate_tokens(readline, config: TokenizerConfig = TokenizerConfig()):
                     yield (STRING, token, spos, epos, line)
                 elif initial.isidentifier():               # ordinary name
                     if token in ('async', 'await'):
-                        if async_is_reserved_keyword or async_def:
+                        if async_keywords or async_def:
                             yield (ASYNC if token == 'async' else AWAIT,
                                    token, spos, epos, line)
                             continue
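
A hedged usage sketch of the new generate_tokens() signature (hypothetical
example, not part of the diff; it assumes pygram.initialize() has been called
so the module-level grammars exist):

    import io
    from blib2to3 import pygram
    from blib2to3.pgen2 import tokenize
    from blib2to3.pgen2.token import AWAIT

    pygram.initialize()
    py37_grammar = pygram.python_grammar_no_print_statement_no_exec_statement_async_keywords

    # With grammar.async_keywords set, `await` is tokenized as AWAIT even outside
    # an `async def`; with grammar=None it would come back as a plain NAME token.
    kinds = [
        tok[0]
        for tok in tokenize.generate_tokens(io.StringIO("await x\n").readline, grammar=py37_grammar)
    ]
    assert AWAIT in kinds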

View File

@@ -1,18 +1,15 @@
 # Stubs for lib2to3.pgen2.tokenize (Python 3.6)
 # NOTE: Only elements from __all__ are present.

-from typing import Callable, Iterable, Iterator, List, Text, Tuple
+from typing import Callable, Iterable, Iterator, List, Optional, Text, Tuple
-from attr import dataclass
 from blib2to3.pgen2.token import *  # noqa
+from blib2to3.pygram import Grammar

 _Coord = Tuple[int, int]
 _TokenEater = Callable[[int, Text, _Coord, _Coord, Text], None]
 _TokenInfo = Tuple[int, Text, _Coord, _Coord, Text]

-@dataclass(frozen=True)
-class TokenizerConfig:
-    async_is_reserved_keyword: bool = False
-
 class TokenError(Exception): ...
 class StopTokenizing(Exception): ...
@@ -30,5 +27,6 @@ class Untokenizer:
 def untokenize(iterable: Iterable[_TokenInfo]) -> Text: ...
 def generate_tokens(
-    readline: Callable[[], Text]
+    readline: Callable[[], Text],
+    grammar: Optional[Grammar] = ...
 ) -> Iterator[_TokenInfo]: ...

View File

@@ -33,6 +33,7 @@ def initialize(cache_dir=None):
     global python_grammar
     global python_grammar_no_print_statement
    global python_grammar_no_print_statement_no_exec_statement
+    global python_grammar_no_print_statement_no_exec_statement_async_keywords
     global python_symbols
     global pattern_grammar
     global pattern_symbols
@@ -47,11 +48,17 @@ def initialize(cache_dir=None):
     python_grammar_no_print_statement = python_grammar.copy()
     del python_grammar_no_print_statement.keywords["print"]

-    # Python 3
+    # Python 3.0-3.6
     python_grammar_no_print_statement_no_exec_statement = python_grammar.copy()
     del python_grammar_no_print_statement_no_exec_statement.keywords["print"]
     del python_grammar_no_print_statement_no_exec_statement.keywords["exec"]

+    # Python 3.7+
+    python_grammar_no_print_statement_no_exec_statement_async_keywords = (
+        python_grammar_no_print_statement_no_exec_statement.copy()
+    )
+    python_grammar_no_print_statement_no_exec_statement_async_keywords.async_keywords = True
+
     pattern_grammar = driver.load_packaged_grammar("blib2to3", _PATTERN_GRAMMAR_FILE,
                                                    cache_dir)
     pattern_symbols = Symbols(pattern_grammar)

View File

@@ -118,6 +118,7 @@ class pattern_symbols(Symbols):
 python_grammar: Grammar
 python_grammar_no_print_statement: Grammar
 python_grammar_no_print_statement_no_exec_statement: Grammar
+python_grammar_no_print_statement_no_exec_statement_async_keywords: Grammar
 python_grammar_no_exec_statement: Grammar
 pattern_grammar: Grammar

View File

@@ -1,10 +1,16 @@
 #!/usr/bin/env python3.7

 def f():
-    return (i*2 async for i in arange(42))
+    return (i * 2 async for i in arange(42))

 def g():
-    return (something_long * something_long async for something_long in async_generator(with_an_argument))
+    return (
+        something_long * something_long
+        async for something_long in async_generator(with_an_argument)
+    )

 async def func():
     if test:
@@ -15,9 +21,11 @@ async def func():
             )
         ]

 def awaited_generator_value(n):
     return (await awaitable for awaitable in awaitable_list)

 def make_arange(n):
     return (i * 2 for i in range(n) if await wrap(i))