Format hex code in unicode escape sequences in string literals (#2916)
Co-authored-by: Jelle Zijlstra <jelle.zijlstra@gmail.com>
This commit is contained in:
parent
1557f7d3a3
commit
eabff673b3
@ -16,6 +16,7 @@
|
||||
|
||||
<!-- Changes that affect Black's preview style -->
|
||||
|
||||
- Format hex codes in Unicode escape sequences in string literals (#2916)
|
||||
- Add parentheses around `if`-`else` expressions (#2278)
|
||||
- Improve the performance on large expressions that contain many strings (#3467)
|
||||
- Fix a crash in preview style with assert + parenthesized string (#3415)
|
||||
|
@ -59,6 +59,7 @@
|
||||
get_string_prefix,
|
||||
normalize_string_prefix,
|
||||
normalize_string_quotes,
|
||||
normalize_unicode_escape_sequences,
|
||||
)
|
||||
from black.trans import (
|
||||
CannotTransform,
|
||||
@ -368,6 +369,9 @@ def visit_factor(self, node: Node) -> Iterator[Line]:
|
||||
yield from self.visit_default(node)
|
||||
|
||||
def visit_STRING(self, leaf: Leaf) -> Iterator[Line]:
|
||||
if Preview.hex_codes_in_unicode_sequences in self.mode:
|
||||
normalize_unicode_escape_sequences(leaf)
|
||||
|
||||
if is_docstring(leaf) and "\\\n" not in leaf.value:
|
||||
# We're ignoring docstrings with backslash newline escapes because changing
|
||||
# indentation of those changes the AST representation of the code.
|
||||
|
@ -153,6 +153,7 @@ def supports_feature(target_versions: Set[TargetVersion], feature: Feature) -> b
|
||||
class Preview(Enum):
|
||||
"""Individual preview style features."""
|
||||
|
||||
hex_codes_in_unicode_sequences = auto()
|
||||
annotation_parens = auto()
|
||||
empty_lines_before_class_or_def_with_leading_comments = auto()
|
||||
handle_trailing_commas_in_head = auto()
|
||||
|
@ -5,7 +5,9 @@
|
||||
import re
|
||||
import sys
|
||||
from functools import lru_cache
|
||||
from typing import List, Pattern
|
||||
from typing import List, Match, Pattern
|
||||
|
||||
from blib2to3.pytree import Leaf
|
||||
|
||||
if sys.version_info < (3, 8):
|
||||
from typing_extensions import Final
|
||||
@ -18,6 +20,15 @@
|
||||
r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
|
||||
)
|
||||
FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
|
||||
# Matches a run of one or more backslashes followed by the body of a \uXXXX,
# \UXXXXXXXX, \xHH or \N{NAME} sequence. The named groups expose the pieces:
# "backslashes" (the whole run), "body" (everything after the run) and one of
# "u" / "U" / "x" / "N" holding just the hex digits or the character name.
# The pattern itself does not look at how many backslashes precede the body.
UNICODE_ESCAPE_RE: Final = re.compile(
|
||||
r"(?P<backslashes>\\+)(?P<body>"
|
||||
r"(u(?P<u>[a-fA-F0-9]{4}))" # \uXXXX: character with 16-bit hex value XXXX
|
||||
r"|(U(?P<U>[a-fA-F0-9]{8}))" # \UXXXXXXXX: character with 32-bit hex value XXXXXXXX
|
||||
r"|(x(?P<x>[a-fA-F0-9]{2}))" # \xHH: character with hex value HH
|
||||
r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})" # \N{NAME}: character named NAME in the Unicode database
|
||||
r")",
|
||||
# NOTE: the only whitespace in the pattern is the space inside the \N{...}
# character class, and re.VERBOSE keeps whitespace inside character classes.
re.VERBOSE,
|
||||
)
|
||||
|
||||
|
||||
def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
|
||||
@ -236,3 +247,34 @@ def normalize_string_quotes(s: str) -> str:
|
||||
return s # Prefer double quotes
|
||||
|
||||
return f"{prefix}{new_quote}{new_body}{new_quote}"
|
||||
|
||||
|
||||
def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
    r"""Normalize the letter case used inside escape sequences of a string leaf.

    Hex digits in ``\xHH``, ``\uXXXX`` and ``\UXXXXXXXX`` escapes are lowercased
    and the character name in ``\N{NAME}`` escapes is uppercased. The result is
    written back to ``leaf.value``; nothing is returned.

    Literals with an ``r`` prefix are left untouched. In bytes literals only
    ``\xHH`` is rewritten: ``\u``, ``\U`` and ``\N{...}`` are not escape
    sequences there, so changing their case would change the bytes value.
    """
    text = leaf.value
    prefix = get_string_prefix(text).lower()
    if "r" in prefix:
        # Raw literals have no escape sequences to normalize.
        return

    is_bytes = "b" in prefix

    def replace(m: Match[str]) -> str:
        groups = m.groupdict()
        back_slashes = groups["backslashes"]

        if len(back_slashes) % 2 == 0:
            # An even run means every backslash is itself escaped, so what
            # follows is plain text rather than an escape sequence.
            return back_slashes + groups["body"]

        if groups["x"]:
            # \x
            return back_slashes + "x" + groups["x"].lower()

        if is_bytes:
            # \u, \U and \N{} are literal characters in bytes; keep them as is.
            return m.group(0)

        if groups["u"]:
            # \u
            return back_slashes + "u" + groups["u"].lower()
        elif groups["U"]:
            # \U
            return back_slashes + "U" + groups["U"].lower()
        else:
            assert groups["N"], f"Unexpected match: {m}"
            # \N{}
            return back_slashes + "N{" + groups["N"].upper() + "}"

    leaf.value = UNICODE_ESCAPE_RE.sub(replace, text)
|
||||
|
33
tests/data/preview/format_unicode_escape_seq.py
Normal file
33
tests/data/preview/format_unicode_escape_seq.py
Normal file
@ -0,0 +1,33 @@
|
||||
x = "\x1F"
|
||||
x = "\\x1B"
|
||||
x = "\\\x1B"
|
||||
x = "\U0001F60E"
|
||||
x = "\u0001F60E"
|
||||
x = r"\u0001F60E"
|
||||
x = "don't format me"
|
||||
x = "\xA3"
|
||||
x = "\u2717"
|
||||
x = "\uFaCe"
|
||||
x = "\N{ox}\N{OX}"
|
||||
x = "\N{lAtIn smaLL letteR x}"
|
||||
x = "\N{CYRILLIC small LETTER BYELORUSSIAN-UKRAINIAN I}"
|
||||
x = b"\x1Fdon't byte"
|
||||
x = rb"\x1Fdon't format"
|
||||
|
||||
# output
|
||||
|
||||
x = "\x1f"
|
||||
x = "\\x1B"
|
||||
x = "\\\x1b"
|
||||
x = "\U0001f60e"
|
||||
x = "\u0001F60E"
|
||||
x = r"\u0001F60E"
|
||||
x = "don't format me"
|
||||
x = "\xa3"
|
||||
x = "\u2717"
|
||||
x = "\uface"
|
||||
x = "\N{OX}\N{OX}"
|
||||
x = "\N{LATIN SMALL LETTER X}"
|
||||
x = "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}"
|
||||
x = b"\x1fdon't byte"
|
||||
x = rb"\x1Fdon't format"
|
Loading…
Reference in New Issue
Block a user