Fix incorrect f-string tokenization (#4332)
This commit is contained in:
parent
ba88fc372e
commit
e4aaa8a994
@ -26,6 +26,8 @@
|
|||||||
|
|
||||||
<!-- Changes to the parser or to version autodetection -->
|
<!-- Changes to the parser or to version autodetection -->
|
||||||
|
|
||||||
|
- Fix regression where certain complex f-strings failed to parse (#4332)
|
||||||
|
|
||||||
### Performance
|
### Performance
|
||||||
|
|
||||||
<!-- Changes that improve Black's performance. -->
|
<!-- Changes that improve Black's performance. -->
|
||||||
|
@ -480,6 +480,88 @@ def _split_fstring_start_and_middle(token: str) -> Tuple[str, str]:
|
|||||||
raise ValueError(f"Token {token!r} is not a valid f-string start")
|
raise ValueError(f"Token {token!r} is not a valid f-string start")
|
||||||
|
|
||||||
|
|
||||||
|
# Tokenizer f-string states. The tokenizer's behavior (what counts as string
# text vs. Python code) depends on which of these regions it is currently in.
STATE_NOT_FSTRING: Final = 0  # not in an f-string
STATE_MIDDLE: Final = 1  # in the string portion of an f-string (outside braces)
STATE_IN_BRACES: Final = 2  # between braces in an f-string
# in the format specifier (between the colon and the closing brace)
STATE_IN_COLON: Final = 3


class FStringState:
    """Keeps track of state around f-strings.

    The tokenizer should call the appropriate method on this class when
    it transitions to a different part of an f-string. This is needed
    because the tokenization depends on knowing where exactly we are in
    the f-string.

    For example, consider the following f-string:

        f"a{1:b{2}c}d"

    The following is the tokenization of this string and the states
    tracked by this class:

        1,0-1,2:            FSTRING_START  'f"'  # [STATE_NOT_FSTRING, STATE_MIDDLE]
        1,2-1,3:            FSTRING_MIDDLE 'a'
        1,3-1,4:            LBRACE         '{'   # [STATE_NOT_FSTRING, STATE_IN_BRACES]
        1,4-1,5:            NUMBER         '1'
        1,5-1,6:            OP             ':'   # [STATE_NOT_FSTRING, STATE_IN_COLON]
        1,6-1,7:            FSTRING_MIDDLE 'b'
        1,7-1,8:            LBRACE         '{'   # [STATE_NOT_FSTRING, STATE_IN_COLON, STATE_IN_BRACES]
        1,8-1,9:            NUMBER         '2'
        1,9-1,10:           RBRACE         '}'   # [STATE_NOT_FSTRING, STATE_IN_COLON]
        1,10-1,11:          FSTRING_MIDDLE 'c'
        1,11-1,12:          RBRACE         '}'   # [STATE_NOT_FSTRING, STATE_MIDDLE]
        1,12-1,13:          FSTRING_MIDDLE 'd'
        1,13-1,14:          FSTRING_END    '"'   # [STATE_NOT_FSTRING]
        1,14-1,15:          NEWLINE        '\n'
        2,0-2,0:            ENDMARKER      ''

    Notice that the nested braces in the format specifier are represented
    by adding a STATE_IN_BRACES entry to the state stack. The stack is
    also used if there are nested f-strings.
    """

    def __init__(self) -> None:
        # Stack of STATE_* values; the last entry is the innermost context.
        # Starts outside any f-string.
        self.stack: List[int] = [STATE_NOT_FSTRING]

    def is_in_fstring_expression(self) -> bool:
        """Return True if we are inside a replacement field (braces/colon),
        i.e. anywhere that regular Python tokens should be produced."""
        return self.stack[-1] not in (STATE_MIDDLE, STATE_NOT_FSTRING)

    def current(self) -> int:
        """Return the innermost (current) state."""
        return self.stack[-1]

    def enter_fstring(self) -> None:
        """Record that an FSTRING_START token was consumed."""
        self.stack.append(STATE_MIDDLE)

    def leave_fstring(self) -> None:
        """Record that an FSTRING_END token was consumed."""
        state = self.stack.pop()
        assert state == STATE_MIDDLE

    def consume_lbrace(self) -> None:
        """Record an opening brace starting a replacement field.

        A brace inside a format spec (STATE_IN_COLON) opens a *nested*
        replacement field, so it pushes a new entry rather than replacing
        the current one.
        """
        current_state = self.stack[-1]
        if current_state == STATE_MIDDLE:
            self.stack[-1] = STATE_IN_BRACES
        elif current_state == STATE_IN_COLON:
            self.stack.append(STATE_IN_BRACES)
        else:
            assert False, current_state

    def consume_rbrace(self) -> None:
        """Record a closing brace ending a replacement field.

        If the field was nested inside a format spec, pop back to the
        enclosing STATE_IN_COLON; otherwise we return to the string
        portion of the f-string.
        """
        current_state = self.stack[-1]
        assert current_state in (STATE_IN_BRACES, STATE_IN_COLON)
        if len(self.stack) > 1 and self.stack[-2] == STATE_IN_COLON:
            self.stack.pop()
        else:
            self.stack[-1] = STATE_MIDDLE

    def consume_colon(self) -> None:
        """Record the colon introducing a format specifier."""
        assert self.stack[-1] == STATE_IN_BRACES, self.stack
        self.stack[-1] = STATE_IN_COLON
||||||
def generate_tokens(
|
def generate_tokens(
|
||||||
readline: Callable[[], str], grammar: Optional[Grammar] = None
|
readline: Callable[[], str], grammar: Optional[Grammar] = None
|
||||||
) -> Iterator[GoodTokenInfo]:
|
) -> Iterator[GoodTokenInfo]:
|
||||||
@ -498,12 +580,10 @@ def generate_tokens(
|
|||||||
and the line on which the token was found. The line passed is the
|
and the line on which the token was found. The line passed is the
|
||||||
logical line; continuation lines are included.
|
logical line; continuation lines are included.
|
||||||
"""
|
"""
|
||||||
lnum = parenlev = fstring_level = continued = 0
|
lnum = parenlev = continued = 0
|
||||||
parenlev_stack: List[int] = []
|
parenlev_stack: List[int] = []
|
||||||
inside_fstring_braces = False
|
fstring_state = FStringState()
|
||||||
inside_fstring_colon = False
|
|
||||||
formatspec = ""
|
formatspec = ""
|
||||||
bracelev = 0
|
|
||||||
numchars: Final[str] = "0123456789"
|
numchars: Final[str] = "0123456789"
|
||||||
contstr, needcont = "", 0
|
contstr, needcont = "", 0
|
||||||
contline: Optional[str] = None
|
contline: Optional[str] = None
|
||||||
@ -542,13 +622,15 @@ def generate_tokens(
|
|||||||
spos = strstart
|
spos = strstart
|
||||||
epos = (lnum, end)
|
epos = (lnum, end)
|
||||||
tokenline = contline + line
|
tokenline = contline + line
|
||||||
if fstring_level == 0 and not is_fstring_start(token):
|
if (
|
||||||
|
fstring_state.current() == STATE_NOT_FSTRING
|
||||||
|
and not is_fstring_start(token)
|
||||||
|
):
|
||||||
yield (STRING, token, spos, epos, tokenline)
|
yield (STRING, token, spos, epos, tokenline)
|
||||||
endprog_stack.pop()
|
endprog_stack.pop()
|
||||||
parenlev = parenlev_stack.pop()
|
parenlev = parenlev_stack.pop()
|
||||||
else:
|
else:
|
||||||
if is_fstring_start(token):
|
if is_fstring_start(token):
|
||||||
fstring_level += 1
|
|
||||||
fstring_start, token = _split_fstring_start_and_middle(token)
|
fstring_start, token = _split_fstring_start_and_middle(token)
|
||||||
fstring_start_epos = (lnum, spos[1] + len(fstring_start))
|
fstring_start_epos = (lnum, spos[1] + len(fstring_start))
|
||||||
yield (
|
yield (
|
||||||
@ -558,6 +640,7 @@ def generate_tokens(
|
|||||||
fstring_start_epos,
|
fstring_start_epos,
|
||||||
tokenline,
|
tokenline,
|
||||||
)
|
)
|
||||||
|
fstring_state.enter_fstring()
|
||||||
# increase spos to the end of the fstring start
|
# increase spos to the end of the fstring start
|
||||||
spos = fstring_start_epos
|
spos = fstring_start_epos
|
||||||
|
|
||||||
@ -572,7 +655,7 @@ def generate_tokens(
|
|||||||
line,
|
line,
|
||||||
)
|
)
|
||||||
yield (LBRACE, lbrace, lbrace_spos, epos, line)
|
yield (LBRACE, lbrace, lbrace_spos, epos, line)
|
||||||
inside_fstring_braces = True
|
fstring_state.consume_lbrace()
|
||||||
else:
|
else:
|
||||||
if token.endswith(('"""', "'''")):
|
if token.endswith(('"""', "'''")):
|
||||||
fstring_middle, fstring_end = token[:-3], token[-3:]
|
fstring_middle, fstring_end = token[:-3], token[-3:]
|
||||||
@ -594,11 +677,9 @@ def generate_tokens(
|
|||||||
epos,
|
epos,
|
||||||
line,
|
line,
|
||||||
)
|
)
|
||||||
fstring_level -= 1
|
fstring_state.leave_fstring()
|
||||||
endprog_stack.pop()
|
endprog_stack.pop()
|
||||||
parenlev = parenlev_stack.pop()
|
parenlev = parenlev_stack.pop()
|
||||||
if fstring_level > 0:
|
|
||||||
inside_fstring_braces = True
|
|
||||||
pos = end
|
pos = end
|
||||||
contstr, needcont = "", 0
|
contstr, needcont = "", 0
|
||||||
contline = None
|
contline = None
|
||||||
@ -619,7 +700,11 @@ def generate_tokens(
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
# new statement
|
# new statement
|
||||||
elif parenlev == 0 and not continued and not inside_fstring_braces:
|
elif (
|
||||||
|
parenlev == 0
|
||||||
|
and not continued
|
||||||
|
and not fstring_state.is_in_fstring_expression()
|
||||||
|
):
|
||||||
if not line:
|
if not line:
|
||||||
break
|
break
|
||||||
column = 0
|
column = 0
|
||||||
@ -687,7 +772,7 @@ def generate_tokens(
|
|||||||
continued = 0
|
continued = 0
|
||||||
|
|
||||||
while pos < max:
|
while pos < max:
|
||||||
if fstring_level > 0 and not inside_fstring_braces:
|
if fstring_state.current() == STATE_MIDDLE:
|
||||||
endprog = endprog_stack[-1]
|
endprog = endprog_stack[-1]
|
||||||
endmatch = endprog.match(line, pos)
|
endmatch = endprog.match(line, pos)
|
||||||
if endmatch: # all on one line
|
if endmatch: # all on one line
|
||||||
@ -718,14 +803,12 @@ def generate_tokens(
|
|||||||
(lnum, end),
|
(lnum, end),
|
||||||
line,
|
line,
|
||||||
)
|
)
|
||||||
fstring_level -= 1
|
fstring_state.leave_fstring()
|
||||||
endprog_stack.pop()
|
endprog_stack.pop()
|
||||||
parenlev = parenlev_stack.pop()
|
parenlev = parenlev_stack.pop()
|
||||||
if fstring_level > 0:
|
|
||||||
inside_fstring_braces = True
|
|
||||||
else:
|
else:
|
||||||
yield (LBRACE, "{", (lnum, end - 1), (lnum, end), line)
|
yield (LBRACE, "{", (lnum, end - 1), (lnum, end), line)
|
||||||
inside_fstring_braces = True
|
fstring_state.consume_lbrace()
|
||||||
pos = end
|
pos = end
|
||||||
continue
|
continue
|
||||||
else: # multiple lines
|
else: # multiple lines
|
||||||
@ -734,7 +817,7 @@ def generate_tokens(
|
|||||||
contline = line
|
contline = line
|
||||||
break
|
break
|
||||||
|
|
||||||
if inside_fstring_colon:
|
if fstring_state.current() == STATE_IN_COLON:
|
||||||
match = fstring_middle_after_colon.match(line, pos)
|
match = fstring_middle_after_colon.match(line, pos)
|
||||||
if match is None:
|
if match is None:
|
||||||
formatspec += line[pos:]
|
formatspec += line[pos:]
|
||||||
@ -754,15 +837,19 @@ def generate_tokens(
|
|||||||
formatspec = ""
|
formatspec = ""
|
||||||
|
|
||||||
if brace_or_nl == "{":
|
if brace_or_nl == "{":
|
||||||
yield (OP, "{", (lnum, brace_start), (lnum, brace_end), line)
|
yield (LBRACE, "{", (lnum, brace_start), (lnum, brace_end), line)
|
||||||
bracelev += 1
|
fstring_state.consume_lbrace()
|
||||||
end = brace_end
|
end = brace_end
|
||||||
|
elif brace_or_nl == "}":
|
||||||
|
yield (RBRACE, "}", (lnum, brace_start), (lnum, brace_end), line)
|
||||||
|
fstring_state.consume_rbrace()
|
||||||
|
end = brace_end
|
||||||
|
formatspec_start = (lnum, brace_end)
|
||||||
|
|
||||||
inside_fstring_colon = False
|
|
||||||
pos = end
|
pos = end
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if inside_fstring_braces and parenlev == 0:
|
if fstring_state.current() == STATE_IN_BRACES and parenlev == 0:
|
||||||
match = bang.match(line, pos)
|
match = bang.match(line, pos)
|
||||||
if match:
|
if match:
|
||||||
start, end = match.span(1)
|
start, end = match.span(1)
|
||||||
@ -774,7 +861,7 @@ def generate_tokens(
|
|||||||
if match:
|
if match:
|
||||||
start, end = match.span(1)
|
start, end = match.span(1)
|
||||||
yield (OP, ":", (lnum, start), (lnum, end), line)
|
yield (OP, ":", (lnum, start), (lnum, end), line)
|
||||||
inside_fstring_colon = True
|
fstring_state.consume_colon()
|
||||||
formatspec_start = (lnum, end)
|
formatspec_start = (lnum, end)
|
||||||
pos = end
|
pos = end
|
||||||
continue
|
continue
|
||||||
@ -791,7 +878,7 @@ def generate_tokens(
|
|||||||
yield (NUMBER, token, spos, epos, line)
|
yield (NUMBER, token, spos, epos, line)
|
||||||
elif initial in "\r\n":
|
elif initial in "\r\n":
|
||||||
newline = NEWLINE
|
newline = NEWLINE
|
||||||
if parenlev > 0 or inside_fstring_braces:
|
if parenlev > 0 or fstring_state.is_in_fstring_expression():
|
||||||
newline = NL
|
newline = NL
|
||||||
elif async_def:
|
elif async_def:
|
||||||
async_def_nl = True
|
async_def_nl = True
|
||||||
@ -813,7 +900,7 @@ def generate_tokens(
|
|||||||
parenlev = 0
|
parenlev = 0
|
||||||
if is_fstring_start(token):
|
if is_fstring_start(token):
|
||||||
yield (FSTRING_START, token, spos, epos, line)
|
yield (FSTRING_START, token, spos, epos, line)
|
||||||
fstring_level += 1
|
fstring_state.enter_fstring()
|
||||||
|
|
||||||
endmatch = endprog.match(line, pos)
|
endmatch = endprog.match(line, pos)
|
||||||
if endmatch: # all on one line
|
if endmatch: # all on one line
|
||||||
@ -848,11 +935,9 @@ def generate_tokens(
|
|||||||
epos,
|
epos,
|
||||||
line,
|
line,
|
||||||
)
|
)
|
||||||
fstring_level -= 1
|
fstring_state.leave_fstring()
|
||||||
endprog_stack.pop()
|
endprog_stack.pop()
|
||||||
parenlev = parenlev_stack.pop()
|
parenlev = parenlev_stack.pop()
|
||||||
if fstring_level > 0:
|
|
||||||
inside_fstring_braces = True
|
|
||||||
else:
|
else:
|
||||||
fstring_middle, lbrace = token[:-1], token[-1]
|
fstring_middle, lbrace = token[:-1], token[-1]
|
||||||
fstring_middle_epos = lbrace_spos = (lnum, end - 1)
|
fstring_middle_epos = lbrace_spos = (lnum, end - 1)
|
||||||
@ -864,7 +949,7 @@ def generate_tokens(
|
|||||||
line,
|
line,
|
||||||
)
|
)
|
||||||
yield (LBRACE, lbrace, lbrace_spos, epos, line)
|
yield (LBRACE, lbrace, lbrace_spos, epos, line)
|
||||||
inside_fstring_braces = True
|
fstring_state.consume_lbrace()
|
||||||
pos = end
|
pos = end
|
||||||
else:
|
else:
|
||||||
# multiple lines
|
# multiple lines
|
||||||
@ -919,7 +1004,7 @@ def generate_tokens(
|
|||||||
|
|
||||||
start_epos = (lnum, start + offset)
|
start_epos = (lnum, start + offset)
|
||||||
yield (FSTRING_START, fstring_start, spos, start_epos, line)
|
yield (FSTRING_START, fstring_start, spos, start_epos, line)
|
||||||
fstring_level += 1
|
fstring_state.enter_fstring()
|
||||||
endprog = endprogs[fstring_start]
|
endprog = endprogs[fstring_start]
|
||||||
endprog_stack.append(endprog)
|
endprog_stack.append(endprog)
|
||||||
parenlev_stack.append(parenlev)
|
parenlev_stack.append(parenlev)
|
||||||
@ -940,16 +1025,14 @@ def generate_tokens(
|
|||||||
end_spos = (lnum, end_offset)
|
end_spos = (lnum, end_offset)
|
||||||
end_epos = (lnum, end_offset + 1)
|
end_epos = (lnum, end_offset + 1)
|
||||||
yield (FSTRING_END, token[-1], end_spos, end_epos, line)
|
yield (FSTRING_END, token[-1], end_spos, end_epos, line)
|
||||||
fstring_level -= 1
|
fstring_state.leave_fstring()
|
||||||
endprog_stack.pop()
|
endprog_stack.pop()
|
||||||
parenlev = parenlev_stack.pop()
|
parenlev = parenlev_stack.pop()
|
||||||
if fstring_level > 0:
|
|
||||||
inside_fstring_braces = True
|
|
||||||
else:
|
else:
|
||||||
end_spos = (lnum, end_offset)
|
end_spos = (lnum, end_offset)
|
||||||
end_epos = (lnum, end_offset + 1)
|
end_epos = (lnum, end_offset + 1)
|
||||||
yield (LBRACE, "{", end_spos, end_epos, line)
|
yield (LBRACE, "{", end_spos, end_epos, line)
|
||||||
inside_fstring_braces = True
|
fstring_state.consume_lbrace()
|
||||||
|
|
||||||
elif initial.isidentifier(): # ordinary name
|
elif initial.isidentifier(): # ordinary name
|
||||||
if token in ("async", "await"):
|
if token in ("async", "await"):
|
||||||
@ -998,19 +1081,13 @@ def generate_tokens(
|
|||||||
elif (
|
elif (
|
||||||
initial == "}"
|
initial == "}"
|
||||||
and parenlev == 0
|
and parenlev == 0
|
||||||
and bracelev == 0
|
and fstring_state.is_in_fstring_expression()
|
||||||
and fstring_level > 0
|
|
||||||
):
|
):
|
||||||
yield (RBRACE, token, spos, epos, line)
|
yield (RBRACE, token, spos, epos, line)
|
||||||
inside_fstring_braces = False
|
fstring_state.consume_rbrace()
|
||||||
|
formatspec_start = epos
|
||||||
else:
|
else:
|
||||||
if parenlev == 0 and bracelev > 0 and initial == "}":
|
if initial in "([{":
|
||||||
bracelev -= 1
|
|
||||||
# if we're still inside fstrings, we're still part of the format spec
|
|
||||||
if inside_fstring_braces:
|
|
||||||
inside_fstring_colon = True
|
|
||||||
formatspec_start = (lnum, pos)
|
|
||||||
elif initial in "([{":
|
|
||||||
parenlev += 1
|
parenlev += 1
|
||||||
elif initial in ")]}":
|
elif initial in ")]}":
|
||||||
parenlev -= 1
|
parenlev -= 1
|
||||||
|
@ -119,6 +119,10 @@
|
|||||||
level=0,
|
level=0,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
f"{1:{f'{2}'}}"
|
||||||
|
f'{1:{f'{2}'}}'
|
||||||
|
f'{1:{2}d}'
|
||||||
|
|
||||||
f'{{\\"kind\\":\\"ConfigMap\\",\\"metadata\\":{{\\"annotations\\":{{}},\\"name\\":\\"cluster-info\\",\\"namespace\\":\\"amazon-cloudwatch\\"}}}}'
|
f'{{\\"kind\\":\\"ConfigMap\\",\\"metadata\\":{{\\"annotations\\":{{}},\\"name\\":\\"cluster-info\\",\\"namespace\\":\\"amazon-cloudwatch\\"}}}}'
|
||||||
|
|
||||||
# output
|
# output
|
||||||
@ -243,4 +247,8 @@
|
|||||||
level=0,
|
level=0,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
f"{1:{f'{2}'}}"
|
||||||
|
f"{1:{f'{2}'}}"
|
||||||
|
f"{1:{2}d}"
|
||||||
|
|
||||||
f'{{\\"kind\\":\\"ConfigMap\\",\\"metadata\\":{{\\"annotations\\":{{}},\\"name\\":\\"cluster-info\\",\\"namespace\\":\\"amazon-cloudwatch\\"}}}}'
|
f'{{\\"kind\\":\\"ConfigMap\\",\\"metadata\\":{{\\"annotations\\":{{}},\\"name\\":\\"cluster-info\\",\\"namespace\\":\\"amazon-cloudwatch\\"}}}}'
|
||||||
|
Loading…
Reference in New Issue
Block a user