Improve multiline string handling (#1879)

Co-authored-by: Olivia Hong <ohong@lyft.com> Co-authored-by: Olivia Hong <24500729+olivia-hong@users.noreply.github.com>
2023-03-07 14:52:19 -05:00 · 2023-03-07 14:52:19 -05:00 · 4a063a9f8d
commit 4a063a9f8d
parent 25d886f52c
6 changed files with 514 additions and 33 deletions
--- a/CHANGES.md
+++ b/CHANGES.md
@ -133,6 +133,7 @@ versions separately.
  code. Implicitly concatenated f-strings with different quotes can now be merged or
  quote-normalized by changing the quotes used in expressions. (#3509)
 - Fix crash on `await (yield)` when Black is compiled with mypyc (#3533)
+- Improve handling of multiline strings by changing line split behavior (#1879)

 ### Configuration

--- a/docs/the_black_code_style/future_style.md
+++ b/docs/the_black_code_style/future_style.md
@ -111,3 +111,51 @@ my_dict = {
    "another key": short_value,
 }
 ```
+
+### Improved multiline string handling
+
+_Black_ is smarter when formatting multiline strings, especially in function arguments,
+to avoid introducing extra line breaks. Previously, it would always consider multiline
+strings as not fitting on a single line. With this new feature, _Black_ looks at the
+context around the multiline string to decide if it should be inlined or split to a
+separate line. In particular, when a multiline string is passed to a function, _Black_
+will only move it onto its own line if a line is too long or if multiple arguments
+are being passed.
+
+For example, _Black_ will reformat
+
+```python
+textwrap.dedent(
+    """\
+    This is a
+    multiline string
+"""
+)
+```
+
+to:
+
+```python
+textwrap.dedent("""\
+    This is a
+    multiline string
+""")
+```
+
+Similarly, it will reformat
+
+```python
+MULTILINE = """
+foobar
+""".replace(
+    "\n", ""
+)
+```
+
+to:
+
+```python
+MULTILINE = """
+foobar
+""".replace("\n", "")
+```
--- a/src/black/linegen.py
+++ b/src/black/linegen.py
@ -2,7 +2,7 @@
 Generating lines of code.
 """
 import sys
-from dataclasses import dataclass
+from dataclasses import dataclass, replace
 from enum import Enum, auto
 from functools import partial, wraps
 from typing import Collection, Iterator, List, Optional, Set, Union, cast
@ -505,7 +505,7 @@ def transform_line(
        and not line.should_split_rhs
        and not line.magic_trailing_comma
        and (
-            is_line_short_enough(line, line_length=mode.line_length, line_str=line_str)
+            is_line_short_enough(line, mode=mode, line_str=line_str)
            or line.contains_unsplittable_type_ignore()
        )
        and not (line.inside_brackets and line.contains_standalone_comments())
@ -529,14 +529,12 @@ def _rhs(
            bracket pair instead.
            """
            for omit in generate_trailers_to_omit(line, mode.line_length):
-                lines = list(
-                    right_hand_split(line, mode.line_length, features, omit=omit)
-                )
+                lines = list(right_hand_split(line, mode, features, omit=omit))
                # Note: this check is only able to figure out if the first line of the
                # *current* transformation fits in the line length.  This is true only
                # for simple cases.  All others require running more transforms via
                # `transform_line()`.  This check doesn't know if those would succeed.
-                if is_line_short_enough(lines[0], line_length=mode.line_length):
+                if is_line_short_enough(lines[0], mode=mode):
                    yield from lines
                    return

@ -544,9 +542,7 @@ def _rhs(
            # This mostly happens to multiline strings that are by definition
            # reported as not fitting a single line, as well as lines that contain
            # trailing commas (those have to be exploded).
-            yield from right_hand_split(
-                line, line_length=mode.line_length, features=features
-            )
+            yield from right_hand_split(line, mode, features=features)

        # HACK: nested functions (like _rhs) compiled by mypyc don't retain their
        # __name__ attribute which is needed in `run_transformer` further down.
@ -664,7 +660,7 @@ class _RHSResult:

 def right_hand_split(
    line: Line,
-    line_length: int,
+    mode: Mode,
    features: Collection[Feature] = (),
    omit: Collection[LeafID] = (),
 ) -> Iterator[Line]:
@ -678,7 +674,7 @@ def right_hand_split(
    """
    rhs_result = _first_right_hand_split(line, omit=omit)
    yield from _maybe_split_omitting_optional_parens(
-        rhs_result, line, line_length, features=features, omit=omit
+        rhs_result, line, mode, features=features, omit=omit
    )


@ -733,7 +729,7 @@ def _first_right_hand_split(
 def _maybe_split_omitting_optional_parens(
    rhs: _RHSResult,
    line: Line,
-    line_length: int,
+    mode: Mode,
    features: Collection[Feature] = (),
    omit: Collection[LeafID] = (),
 ) -> Iterator[Line]:
@ -751,7 +747,7 @@ def _maybe_split_omitting_optional_parens(
        # there are no standalone comments in the body
        and not rhs.body.contains_standalone_comments(0)
        # and we can actually remove the parens
-        and can_omit_invisible_parens(rhs.body, line_length)
+        and can_omit_invisible_parens(rhs.body, mode.line_length)
    ):
        omit = {id(rhs.closing_bracket), *omit}
        try:
@ -766,23 +762,24 @@ def _maybe_split_omitting_optional_parens(
                and any(leaf.type in BRACKETS for leaf in rhs.head.leaves[:-1])
                # the left side of assignment is short enough (the -1 is for the ending
                # optional paren)
-                and is_line_short_enough(rhs.head, line_length=line_length - 1)
+                and is_line_short_enough(
+                    rhs.head, mode=replace(mode, line_length=mode.line_length - 1)
+                )
                # the left side of assignment won't explode further because of magic
                # trailing comma
                and rhs.head.magic_trailing_comma is None
                # the split by omitting optional parens isn't preferred by some other
                # reason
-                and not _prefer_split_rhs_oop(rhs_oop, line_length=line_length)
+                and not _prefer_split_rhs_oop(rhs_oop, mode)
            ):
                yield from _maybe_split_omitting_optional_parens(
-                    rhs_oop, line, line_length, features=features, omit=omit
+                    rhs_oop, line, mode, features=features, omit=omit
                )
                return

        except CannotSplit as e:
            if not (
-                can_be_split(rhs.body)
-                or is_line_short_enough(rhs.body, line_length=line_length)
+                can_be_split(rhs.body) or is_line_short_enough(rhs.body, mode=mode)
            ):
                raise CannotSplit(
                    "Splitting failed, body is still too long and can't be split."
@ -806,7 +803,7 @@ def _maybe_split_omitting_optional_parens(
            yield result


-def _prefer_split_rhs_oop(rhs_oop: _RHSResult, line_length: int) -> bool:
+def _prefer_split_rhs_oop(rhs_oop: _RHSResult, mode: Mode) -> bool:
    """
    Returns whether we should prefer the result from a split omitting optional parens.
    """
@ -826,7 +823,7 @@ def _prefer_split_rhs_oop(rhs_oop: _RHSResult, line_length: int) -> bool:
            # the first line still contains the `=`)
            any(leaf.type == token.EQUAL for leaf in rhs_oop.head.leaves)
            # the first line is short enough
-            and is_line_short_enough(rhs_oop.head, line_length=line_length)
+            and is_line_short_enough(rhs_oop.head, mode=mode)
        )
        # contains unsplittable type ignore
        or rhs_oop.head.contains_unsplittable_type_ignore()
@ -1525,7 +1522,7 @@ def run_transformer(
        or line.contains_multiline_strings()
        or result[0].contains_uncollapsable_type_comments()
        or result[0].contains_unsplittable_type_ignore()
-        or is_line_short_enough(result[0], line_length=mode.line_length)
+        or is_line_short_enough(result[0], mode=mode)
        # If any leaves have no parents (which _can_ occur since
        # `transform(line)` potentially destroys the line's underlying node
        # structure), then we can't proceed. Doing so would cause the below
@ -1540,8 +1537,6 @@ def run_transformer(
    second_opinion = run_transformer(
        line_copy, transform, mode, features_fop, line_str=line_str
    )
-    if all(
-        is_line_short_enough(ln, line_length=mode.line_length) for ln in second_opinion
-    ):
+    if all(is_line_short_enough(ln, mode=mode) for ln in second_opinion):
        result = second_opinion
    return result
--- a/src/black/lines.py
+++ b/src/black/lines.py
@ -1,4 +1,5 @@
 import itertools
+import math
 import sys
 from dataclasses import dataclass, field
 from typing import (
@ -10,11 +11,12 @@
    Sequence,
    Tuple,
    TypeVar,
+    Union,
    cast,
 )

 from black.brackets import DOT_PRIORITY, BracketTracker
-from black.mode import Mode
+from black.mode import Mode, Preview
 from black.nodes import (
    BRACKETS,
    CLOSING_BRACKETS,
@ -37,6 +39,7 @@
 T = TypeVar("T")
 Index = int
 LeafID = int
+LN = Union[Leaf, Node]


@dataclass
@ -701,18 +704,93 @@ def append_leaves(
            new_line.append(comment_leaf, preformatted=True)


-def is_line_short_enough(line: Line, *, line_length: int, line_str: str = "") -> bool:
-    """Return True if `line` is no longer than `line_length`.
-
+def is_line_short_enough(  # noqa: C901
+    line: Line, *, mode: Mode, line_str: str = ""
+) -> bool:
+    """Return True if `line` can stay on one line in its current rendering.
+
+    A rendering without embedded newlines fits when it is no longer than
+    `mode.line_length`.  A rendering with embedded newlines (multiline strings)
+    never fits unless `Preview.multiline_string_handling` is enabled in `mode`;
+    in that case the context around the multiline string is examined to
+    determine if it should be inlined or split up.  Lines containing standalone
+    comments never fit.
+
    Uses the provided `line_str` rendering, if any, otherwise computes a new one.
    """
    if not line_str:
        line_str = line_to_string(line)
-    return (
-        len(line_str) <= line_length
-        and "\n" not in line_str  # multiline strings
-        and not line.contains_standalone_comments()
-    )
+
+    # Preview feature off: same check as before this change, so any embedded
+    # newline (i.e. a multiline string) means the line does not fit.
+    if Preview.multiline_string_handling not in mode:
+        return (
+            len(line_str) <= mode.line_length
+            and "\n" not in line_str  # multiline strings
+            and not line.contains_standalone_comments()
+        )
+
+    if line.contains_standalone_comments():
+        return False
+    if "\n" not in line_str:
+        # No multiline strings (MLS) present
+        return len(line_str) <= mode.line_length
+
+    # Only the first and last physical lines of the rendering are measured;
+    # the lines in between are not checked against the limit.
+    first, *_, last = line_str.split("\n")
+    if len(first) > mode.line_length or len(last) > mode.line_length:
+        return False
+
+    # Traverse the AST to examine the context of the multiline string (MLS),
+    # tracking aspects such as depth and comma existence,
+    # to determine whether to split the MLS or keep it together.
+    # Depth (which is based on the existing bracket_depth concept)
+    # is needed to determine nesting level of the MLS.
+    # Includes special case for trailing commas.
+    commas: List[int] = []  # tracks number of commas per depth level
+    multiline_string: Optional[Leaf] = None
+    # the MLS leaf plus each enclosing node whose text is found in line_str
+    multiline_string_contexts: List[LN] = []
+
+    # Stays at math.inf until the bracket level holding the MLS is exited;
+    # from then on it is the shallowest bracket depth seen since, and only
+    # commas at that depth or shallower are still counted.
+    max_level_to_update = math.inf  # track the depth of the MLS
+    for i, leaf in enumerate(line.leaves):
+        if max_level_to_update == math.inf:
+            # Keep `commas` in step with the bracket depth: open a counter when
+            # the depth grows, and pop the counter of the level just left when
+            # it shrinks (at most one push or pop per leaf).
+            had_comma: Optional[int] = None
+            if leaf.bracket_depth + 1 > len(commas):
+                commas.append(0)
+            elif leaf.bracket_depth + 1 < len(commas):
+                had_comma = commas.pop()
+            if (
+                had_comma is not None
+                and multiline_string is not None
+                and multiline_string.bracket_depth == leaf.bracket_depth + 1
+            ):
+                # Have left the level with the MLS, stop tracking commas
+                max_level_to_update = leaf.bracket_depth
+                if had_comma > 0:
+                    # MLS was in parens with at least one comma - force split
+                    return False
+
+        if leaf.bracket_depth <= max_level_to_update and leaf.type == token.COMMA:
+            # Ignore non-nested trailing comma
+            # directly after MLS/MLS-containing expression
+            # (None is listed too, so a final comma without a previous sibling
+            # is also skipped)
+            ignore_ctxs: List[Optional[LN]] = [None]
+            ignore_ctxs += multiline_string_contexts
+            if not (leaf.prev_sibling in ignore_ctxs and i == len(line.leaves) - 1):
+                commas[leaf.bracket_depth] += 1
+        if max_level_to_update != math.inf:
+            max_level_to_update = min(max_level_to_update, leaf.bracket_depth)
+
+        if is_multiline_string(leaf):
+            if len(multiline_string_contexts) > 0:
+                # >1 multiline string cannot fit on a single line - force split
+                return False
+            multiline_string = leaf
+            ctx: LN = leaf
+            # walk up from the MLS leaf through its parents, collecting every
+            # node whose string form still occurs in line_str
+            while str(ctx) in line_str:
+                multiline_string_contexts.append(ctx)
+                if ctx.parent is None:
+                    break
+                ctx = ctx.parent
+
+    # May not have a triple-quoted multiline string at all,
+    # in case of a regular string with embedded newlines and line continuations
+    if len(multiline_string_contexts) == 0:
+        return True
+
+    # Inline the MLS only if none of the remaining per-depth counters saw a comma
+    return all(val == 0 for val in commas)


 def can_be_split(line: Line) -> bool:
--- a/src/black/mode.py
+++ b/src/black/mode.py
@ -155,6 +155,7 @@ class Preview(Enum):

    add_trailing_comma_consistently = auto()
    hex_codes_in_unicode_sequences = auto()
+    multiline_string_handling = auto()
    prefer_splitting_right_hand_side_of_assignments = auto()
    # NOTE: string_processing requires wrap_long_dict_values_in_parens
    # for https://github.com/psf/black/issues/3117 to be fixed.
--- a/tests/data/preview/multiline_strings.py
+++ b/tests/data/preview/multiline_strings.py
@ -0,0 +1,358 @@
+"""cow
+say""",
+call(3, "dogsay", textwrap.dedent("""dove
+    coo""" % "cowabunga"))
+call(3, "dogsay", textwrap.dedent("""dove
+coo""" % "cowabunga"))
+call(3, textwrap.dedent("""cow
+    moo""" % "cowabunga"), "dogsay")
+call(3, "dogsay", textwrap.dedent("""crow
+    caw""" % "cowabunga"),)
+call(3, textwrap.dedent("""cat
+    meow""" % "cowabunga"), {"dog", "say"})
+call(3, {"dog", "say"}, textwrap.dedent("""horse
+    neigh""" % "cowabunga"))
+call(3, {"dog", "say"}, textwrap.dedent("""pig
+    oink""" % "cowabunga"),)
+textwrap.dedent("""A one-line triple-quoted string.""")
+textwrap.dedent("""A two-line triple-quoted string
+since it goes to the next line.""")
+textwrap.dedent("""A three-line triple-quoted string
+that not only goes to the next line
+but also goes one line beyond.""")
+textwrap.dedent("""\
+    A triple-quoted string
+    actually leveraging the textwrap.dedent functionality
+    that ends in a trailing newline,
+    representing e.g. file contents.
+""")
+path.write_text(textwrap.dedent("""\
+    A triple-quoted string
+    actually leveraging the textwrap.dedent functionality
+    that ends in a trailing newline,
+    representing e.g. file contents.
+"""))
+path.write_text(textwrap.dedent("""\
+    A triple-quoted string
+    actually leveraging the textwrap.dedent functionality
+    that ends in a trailing newline,
+    representing e.g. {config_filename} file contents.
+""".format("config_filename", config_filename)))
+# Another use case
+data = yaml.load("""\
+a: 1
+b: 2
+""")
+data = yaml.load("""\
+a: 1
+b: 2
+""",)
+data = yaml.load(
+    """\
+    a: 1
+    b: 2
+"""
+)
+
+MULTILINE = """
+foo
+""".replace("\n", "")
+generated_readme = lambda project_name: """
+{}
+
+<Add content here!>
+""".strip().format(project_name)
+parser.usage += """
+Custom extra help summary.
+
+Extra test:
+- with
+- bullets
+"""
+
+
+def get_stuff(cr, value):
+    # original
+    cr.execute("""
+        SELECT whatever
+          FROM some_table t
+         WHERE id = %s
+    """, [value])
+    return cr.fetchone()
+
+
+def get_stuff(cr, value):
+    # preferred
+    cr.execute(
+        """
+        SELECT whatever
+          FROM some_table t
+         WHERE id = %s
+        """,
+        [value],
+    )
+    return cr.fetchone()
+
+
+call(arg1, arg2, """
+short
+""", arg3=True)
+test_vectors = [
+    "one-liner\n",
+    "two\nliner\n",
+    """expressed
+as a three line
+mulitline string""",
+]
+
+_wat = re.compile(
+    r"""
+    regex
+    """,
+    re.MULTILINE | re.VERBOSE,
+)
+dis_c_instance_method = """\
+%3d           0 LOAD_FAST                1 (x)
+              2 LOAD_CONST               1 (1)
+              4 COMPARE_OP               2 (==)
+              6 LOAD_FAST                0 (self)
+              8 STORE_ATTR               0 (x)
+             10 LOAD_CONST               0 (None)
+             12 RETURN_VALUE
+""" % (_C.__init__.__code__.co_firstlineno + 1,)
+path.write_text(textwrap.dedent("""\
+    A triple-quoted string
+    actually {verb} the textwrap.dedent functionality
+    that ends in a trailing newline,
+    representing e.g. {file_type} file contents.
+""".format(verb="using", file_type="test")))
+{"""cow
+moos"""}
+["""cow
+moos"""]
+["""cow
+moos""", """dog
+woofs
+and
+barks"""]
+def dastardly_default_value(
+    cow: String = json.loads("""this
+is
+quite
+the
+dastadardly
+value!"""),
+    **kwargs,
+):
+    pass
+
+print(f"""
+    This {animal}
+    moos and barks
+{animal} say
+""")
+msg = f"""The arguments {bad_arguments} were passed in.
+Please use `--build-option` instead,
+`--global-option` is reserved to flags like `--verbose` or `--quiet`.
+"""
+
+# output
+"""cow
+say""",
+call(
+    3,
+    "dogsay",
+    textwrap.dedent("""dove
+    coo""" % "cowabunga"),
+)
+call(
+    3,
+    "dogsay",
+    textwrap.dedent("""dove
+coo""" % "cowabunga"),
+)
+call(
+    3,
+    textwrap.dedent("""cow
+    moo""" % "cowabunga"),
+    "dogsay",
+)
+call(
+    3,
+    "dogsay",
+    textwrap.dedent("""crow
+    caw""" % "cowabunga"),
+)
+call(
+    3,
+    textwrap.dedent("""cat
+    meow""" % "cowabunga"),
+    {"dog", "say"},
+)
+call(
+    3,
+    {"dog", "say"},
+    textwrap.dedent("""horse
+    neigh""" % "cowabunga"),
+)
+call(
+    3,
+    {"dog", "say"},
+    textwrap.dedent("""pig
+    oink""" % "cowabunga"),
+)
+textwrap.dedent("""A one-line triple-quoted string.""")
+textwrap.dedent("""A two-line triple-quoted string
+since it goes to the next line.""")
+textwrap.dedent("""A three-line triple-quoted string
+that not only goes to the next line
+but also goes one line beyond.""")
+textwrap.dedent("""\
+    A triple-quoted string
+    actually leveraging the textwrap.dedent functionality
+    that ends in a trailing newline,
+    representing e.g. file contents.
+""")
+path.write_text(textwrap.dedent("""\
+    A triple-quoted string
+    actually leveraging the textwrap.dedent functionality
+    that ends in a trailing newline,
+    representing e.g. file contents.
+"""))
+path.write_text(textwrap.dedent("""\
+    A triple-quoted string
+    actually leveraging the textwrap.dedent functionality
+    that ends in a trailing newline,
+    representing e.g. {config_filename} file contents.
+""".format("config_filename", config_filename)))
+# Another use case
+data = yaml.load("""\
+a: 1
+b: 2
+""")
+data = yaml.load(
+    """\
+a: 1
+b: 2
+""",
+)
+data = yaml.load("""\
+    a: 1
+    b: 2
+""")
+
+MULTILINE = """
+foo
+""".replace("\n", "")
+generated_readme = lambda project_name: """
+{}
+
+<Add content here!>
+""".strip().format(project_name)
+parser.usage += """
+Custom extra help summary.
+
+Extra test:
+- with
+- bullets
+"""
+
+
+def get_stuff(cr, value):
+    # original
+    cr.execute(
+        """
+        SELECT whatever
+          FROM some_table t
+         WHERE id = %s
+    """,
+        [value],
+    )
+    return cr.fetchone()
+
+
+def get_stuff(cr, value):
+    # preferred
+    cr.execute(
+        """
+        SELECT whatever
+          FROM some_table t
+         WHERE id = %s
+        """,
+        [value],
+    )
+    return cr.fetchone()
+
+
+call(
+    arg1,
+    arg2,
+    """
+short
+""",
+    arg3=True,
+)
+test_vectors = [
+    "one-liner\n",
+    "two\nliner\n",
+    """expressed
+as a three line
+mulitline string""",
+]
+
+_wat = re.compile(
+    r"""
+    regex
+    """,
+    re.MULTILINE | re.VERBOSE,
+)
+dis_c_instance_method = """\
+%3d           0 LOAD_FAST                1 (x)
+              2 LOAD_CONST               1 (1)
+              4 COMPARE_OP               2 (==)
+              6 LOAD_FAST                0 (self)
+              8 STORE_ATTR               0 (x)
+             10 LOAD_CONST               0 (None)
+             12 RETURN_VALUE
+""" % (_C.__init__.__code__.co_firstlineno + 1,)
+path.write_text(textwrap.dedent("""\
+    A triple-quoted string
+    actually {verb} the textwrap.dedent functionality
+    that ends in a trailing newline,
+    representing e.g. {file_type} file contents.
+""".format(verb="using", file_type="test")))
+{"""cow
+moos"""}
+["""cow
+moos"""]
+[
+    """cow
+moos""",
+    """dog
+woofs
+and
+barks""",
+]
+
+
+def dastardly_default_value(
+    cow: String = json.loads("""this
+is
+quite
+the
+dastadardly
+value!"""),
+    **kwargs,
+):
+    pass
+
+
+print(f"""
+    This {animal}
+    moos and barks
+{animal} say
+""")
+msg = f"""The arguments {bad_arguments} were passed in.
+Please use `--build-option` instead,
+`--global-option` is reserved to flags like `--verbose` or `--quiet`.
+"""