Fix ignoring input files for symlink reasons (#4222)

This relates to #4015, #4161 and the behaviour of os.getcwd()

Black is a big user of pathlib and as such loves doing `.resolve()`,
since for a long time it was the only good way of getting an absolute
path in pathlib. However, this has two problems:

The first minor problem is performance, e.g. in #3751 I (safely) got rid
of a bunch of `.resolve()` which made Black 40% faster on cached runs.

The second more important problem is that always resolving symlinks
results in unintuitive exclusion behaviour. For instance, a gitignored
symlink should never alter formatting of your actual code. This kind of
thing was reported by users a few times.

In #3846, I improved the exclusion rule logic for symlinks in
`gen_python_files` and everything was good.

But `gen_python_files` isn't enough, there's also `get_sources`, which
handles user specified paths directly (instead of files Black
discovers). So in #4015, I made a very similar change to #3846 for
`get_sources`, and this is where some problems began.

The core issue was the line:
```
root_relative_path = path.absolute().relative_to(root).as_posix()
```
The first issue is that despite root being computed from user inputs, we
call `.resolve()` while computing it (likely unecessarily). Which means
that `path` may not actually be relative to `root`. So I started off
this PR trying to fix that, when I ran into the second issue. Which is
that `os.getcwd()` (as called by `os.path.abspath` or `Path.absolute` or
`Path.cwd`) also often resolves symlinks!
```
>>> import os
>>> os.environ.get("PWD")
'/Users/shantanu/dev/black/symlink/bug'
>>> os.getcwd()
'/Users/shantanu/dev/black/actual/bug'
```
This also meant that the breakage often would not show up when input
relative paths.

This doesn't affect `gen_python_files` / #3846 because things are always
absolute and known to be relative to `root`.

Anyway, it looks like #4161 fixed the crash by just swallowing the error
and ignoring the file. Instead, we should just try to compute the actual
relative path. I think this PR should be quite safe, but we could also
consider reverting some of the previous changes; the associated issues
weren't too popular.

At the same time, I think there's still behaviour that can be improved
and I kind of want to make larger changes, but maybe I'll save that for
if we do something like #3952

Hopefully fixes #4205, fixes #4209, actual fix for #4077
This commit is contained in:
Shantanu 2024-02-12 00:04:09 -08:00 committed by GitHub
parent a20100395c
commit 23dfc5b2c3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 127 additions and 53 deletions

View File

@ -24,6 +24,7 @@
### Configuration ### Configuration
- Fix issue where _Black_ would ignore input files in the presence of symlinks (#4222)
- _Black_ now ignores `pyproject.toml` that is missing a `tool.black` section when - _Black_ now ignores `pyproject.toml` that is missing a `tool.black` section when
discovering project root and configuration. Since _Black_ continues to use version discovering project root and configuration. Since _Black_ continues to use version
control as an indicator of project root, this is expected to primarily change behavior control as an indicator of project root, this is expected to primarily change behavior

View File

@ -44,12 +44,12 @@
STDIN_PLACEHOLDER, STDIN_PLACEHOLDER,
) )
from black.files import ( from black.files import (
best_effort_relative_path,
find_project_root, find_project_root,
find_pyproject_toml, find_pyproject_toml,
find_user_pyproject_toml, find_user_pyproject_toml,
gen_python_files, gen_python_files,
get_gitignore, get_gitignore,
get_root_relative_path,
parse_pyproject_toml, parse_pyproject_toml,
path_is_excluded, path_is_excluded,
resolves_outside_root_or_cannot_stat, resolves_outside_root_or_cannot_stat,
@ -734,6 +734,7 @@ def get_sources(
"""Compute the set of files to be formatted.""" """Compute the set of files to be formatted."""
sources: Set[Path] = set() sources: Set[Path] = set()
assert root.is_absolute(), f"INTERNAL ERROR: `root` must be absolute but is {root}"
using_default_exclude = exclude is None using_default_exclude = exclude is None
exclude = re_compile_maybe_verbose(DEFAULT_EXCLUDES) if exclude is None else exclude exclude = re_compile_maybe_verbose(DEFAULT_EXCLUDES) if exclude is None else exclude
gitignore: Optional[Dict[Path, PathSpec]] = None gitignore: Optional[Dict[Path, PathSpec]] = None
@ -749,11 +750,12 @@ def get_sources(
# Compare the logic here to the logic in `gen_python_files`. # Compare the logic here to the logic in `gen_python_files`.
if is_stdin or path.is_file(): if is_stdin or path.is_file():
root_relative_path = get_root_relative_path(path, root, report) if resolves_outside_root_or_cannot_stat(path, root, report):
if verbose:
if root_relative_path is None: out(f'Skipping invalid source: "{path}"', fg="red")
continue continue
root_relative_path = best_effort_relative_path(path, root).as_posix()
root_relative_path = "/" + root_relative_path root_relative_path = "/" + root_relative_path
# Hard-exclude any files that matches the `--force-exclude` regex. # Hard-exclude any files that matches the `--force-exclude` regex.
@ -763,11 +765,6 @@ def get_sources(
) )
continue continue
if resolves_outside_root_or_cannot_stat(path, root, report):
if verbose:
out(f'Skipping invalid source: "{path}"', fg="red")
continue
if is_stdin: if is_stdin:
path = Path(f"{STDIN_PLACEHOLDER}{str(path)}") path = Path(f"{STDIN_PLACEHOLDER}{str(path)}")

View File

@ -48,6 +48,11 @@ def _load_toml(path: Union[Path, str]) -> Dict[str, Any]:
return tomllib.load(f) return tomllib.load(f)
@lru_cache
def _cached_resolve(path: Path) -> Path:
return path.resolve()
@lru_cache @lru_cache
def find_project_root( def find_project_root(
srcs: Sequence[str], stdin_filename: Optional[str] = None srcs: Sequence[str], stdin_filename: Optional[str] = None
@ -67,9 +72,9 @@ def find_project_root(
if stdin_filename is not None: if stdin_filename is not None:
srcs = tuple(stdin_filename if s == "-" else s for s in srcs) srcs = tuple(stdin_filename if s == "-" else s for s in srcs)
if not srcs: if not srcs:
srcs = [str(Path.cwd().resolve())] srcs = [str(_cached_resolve(Path.cwd()))]
path_srcs = [Path(Path.cwd(), src).resolve() for src in srcs] path_srcs = [_cached_resolve(Path(Path.cwd(), src)) for src in srcs]
# A list of lists of parents for each 'src'. 'src' is included as a # A list of lists of parents for each 'src'. 'src' is included as a
# "parent" of itself if it is a directory # "parent" of itself if it is a directory
@ -236,7 +241,7 @@ def find_user_pyproject_toml() -> Path:
else: else:
config_root = os.environ.get("XDG_CONFIG_HOME", "~/.config") config_root = os.environ.get("XDG_CONFIG_HOME", "~/.config")
user_config_path = Path(config_root).expanduser() / "black" user_config_path = Path(config_root).expanduser() / "black"
return user_config_path.resolve() return _cached_resolve(user_config_path)
@lru_cache @lru_cache
@ -266,27 +271,31 @@ def resolves_outside_root_or_cannot_stat(
try: try:
if sys.version_info < (3, 8, 6): if sys.version_info < (3, 8, 6):
path = path.absolute() # https://bugs.python.org/issue33660 path = path.absolute() # https://bugs.python.org/issue33660
resolved_path = path.resolve() resolved_path = _cached_resolve(path)
return get_root_relative_path(resolved_path, root, report) is None
except OSError as e: except OSError as e:
if report: if report:
report.path_ignored(path, f"cannot be read because {e}") report.path_ignored(path, f"cannot be read because {e}")
return True return True
def get_root_relative_path(
path: Path,
root: Path,
report: Optional[Report] = None,
) -> Optional[str]:
"""Returns the file path relative to the 'root' directory"""
try: try:
root_relative_path = path.absolute().relative_to(root).as_posix() resolved_path.relative_to(root)
except ValueError: except ValueError:
if report: if report:
report.path_ignored(path, f"is a symbolic link that points outside {root}") report.path_ignored(path, f"is a symbolic link that points outside {root}")
return None return True
return root_relative_path return False
def best_effort_relative_path(path: Path, root: Path) -> Path:
# Precondition: resolves_outside_root_or_cannot_stat(path, root) is False
try:
return path.absolute().relative_to(root)
except ValueError:
pass
root_parent = next((p for p in path.parents if _cached_resolve(p) == root), None)
if root_parent is not None:
return path.relative_to(root_parent)
# something adversarial, fallback to path guaranteed by precondition
return _cached_resolve(path).relative_to(root)
def _path_is_ignored( def _path_is_ignored(
@ -339,7 +348,8 @@ def gen_python_files(
assert root.is_absolute(), f"INTERNAL ERROR: `root` must be absolute but is {root}" assert root.is_absolute(), f"INTERNAL ERROR: `root` must be absolute but is {root}"
for child in paths: for child in paths:
root_relative_path = child.absolute().relative_to(root).as_posix() assert child.is_absolute()
root_relative_path = child.relative_to(root).as_posix()
# First ignore files matching .gitignore, if passed # First ignore files matching .gitignore, if passed
if gitignore_dict and _path_is_ignored( if gitignore_dict and _path_is_ignored(

View File

@ -2567,32 +2567,32 @@ def test_symlinks(self) -> None:
gitignore = PathSpec.from_lines("gitwildmatch", []) gitignore = PathSpec.from_lines("gitwildmatch", [])
regular = MagicMock() regular = MagicMock()
regular.absolute.return_value = root / "regular.py" regular.relative_to.return_value = Path("regular.py")
regular.resolve.return_value = root / "regular.py" regular.resolve.return_value = root / "regular.py"
regular.is_dir.return_value = False regular.is_dir.return_value = False
regular.is_file.return_value = True regular.is_file.return_value = True
outside_root_symlink = MagicMock() outside_root_symlink = MagicMock()
outside_root_symlink.absolute.return_value = root / "symlink.py" outside_root_symlink.relative_to.return_value = Path("symlink.py")
outside_root_symlink.resolve.return_value = Path("/nowhere") outside_root_symlink.resolve.return_value = Path("/nowhere")
outside_root_symlink.is_dir.return_value = False outside_root_symlink.is_dir.return_value = False
outside_root_symlink.is_file.return_value = True outside_root_symlink.is_file.return_value = True
ignored_symlink = MagicMock() ignored_symlink = MagicMock()
ignored_symlink.absolute.return_value = root / ".mypy_cache" / "symlink.py" ignored_symlink.relative_to.return_value = Path(".mypy_cache") / "symlink.py"
ignored_symlink.is_dir.return_value = False ignored_symlink.is_dir.return_value = False
ignored_symlink.is_file.return_value = True ignored_symlink.is_file.return_value = True
# A symlink that has an excluded name, but points to an included name # A symlink that has an excluded name, but points to an included name
symlink_excluded_name = MagicMock() symlink_excluded_name = MagicMock()
symlink_excluded_name.absolute.return_value = root / "excluded_name" symlink_excluded_name.relative_to.return_value = Path("excluded_name")
symlink_excluded_name.resolve.return_value = root / "included_name.py" symlink_excluded_name.resolve.return_value = root / "included_name.py"
symlink_excluded_name.is_dir.return_value = False symlink_excluded_name.is_dir.return_value = False
symlink_excluded_name.is_file.return_value = True symlink_excluded_name.is_file.return_value = True
# A symlink that has an included name, but points to an excluded name # A symlink that has an included name, but points to an excluded name
symlink_included_name = MagicMock() symlink_included_name = MagicMock()
symlink_included_name.absolute.return_value = root / "included_name.py" symlink_included_name.relative_to.return_value = Path("included_name.py")
symlink_included_name.resolve.return_value = root / "excluded_name" symlink_included_name.resolve.return_value = root / "excluded_name"
symlink_included_name.is_dir.return_value = False symlink_included_name.is_dir.return_value = False
symlink_included_name.is_file.return_value = True symlink_included_name.is_file.return_value = True
@ -2626,39 +2626,100 @@ def test_symlinks(self) -> None:
outside_root_symlink.resolve.assert_called_once() outside_root_symlink.resolve.assert_called_once()
ignored_symlink.resolve.assert_not_called() ignored_symlink.resolve.assert_not_called()
def test_get_sources_symlink_and_force_exclude(self) -> None:
with TemporaryDirectory() as tempdir:
tmp = Path(tempdir).resolve()
actual = tmp / "actual"
actual.mkdir()
symlink = tmp / "symlink"
symlink.symlink_to(actual)
actual_proj = actual / "project"
actual_proj.mkdir()
(actual_proj / "module.py").write_text("print('hello')", encoding="utf-8")
symlink_proj = symlink / "project"
with change_directory(symlink_proj):
assert_collected_sources(
src=["module.py"],
root=symlink_proj.resolve(),
expected=["module.py"],
)
absolute_module = symlink_proj / "module.py"
assert_collected_sources(
src=[absolute_module],
root=symlink_proj.resolve(),
expected=[absolute_module],
)
# a few tricky tests for force_exclude
flat_symlink = symlink_proj / "symlink_module.py"
flat_symlink.symlink_to(actual_proj / "module.py")
assert_collected_sources(
src=[flat_symlink],
root=symlink_proj.resolve(),
force_exclude=r"/symlink_module.py",
expected=[],
)
target = actual_proj / "target"
target.mkdir()
(target / "another.py").write_text("print('hello')", encoding="utf-8")
(symlink_proj / "nested").symlink_to(target)
assert_collected_sources(
src=[symlink_proj / "nested" / "another.py"],
root=symlink_proj.resolve(),
force_exclude=r"nested",
expected=[],
)
assert_collected_sources(
src=[symlink_proj / "nested" / "another.py"],
root=symlink_proj.resolve(),
force_exclude=r"target",
expected=[symlink_proj / "nested" / "another.py"],
)
def test_get_sources_with_stdin_symlink_outside_root( def test_get_sources_with_stdin_symlink_outside_root(
self, self,
) -> None: ) -> None:
path = THIS_DIR / "data" / "include_exclude_tests" path = THIS_DIR / "data" / "include_exclude_tests"
stdin_filename = str(path / "b/exclude/a.py") stdin_filename = str(path / "b/exclude/a.py")
outside_root_symlink = Path("/target_directory/a.py") outside_root_symlink = Path("/target_directory/a.py")
root = Path("target_dir/").resolve().absolute()
with patch("pathlib.Path.resolve", return_value=outside_root_symlink): with patch("pathlib.Path.resolve", return_value=outside_root_symlink):
assert_collected_sources( assert_collected_sources(
root=Path("target_directory/"), root=root,
src=["-"], src=["-"],
expected=[], expected=[],
stdin_filename=stdin_filename, stdin_filename=stdin_filename,
) )
@patch("black.find_project_root", lambda *args: (THIS_DIR.resolve(), None))
def test_get_sources_with_stdin(self) -> None: def test_get_sources_with_stdin(self) -> None:
src = ["-"] src = ["-"]
expected = ["-"] expected = ["-"]
assert_collected_sources(src, expected, include="", exclude=r"/exclude/|a\.py") assert_collected_sources(
src,
root=THIS_DIR.resolve(),
expected=expected,
include="",
exclude=r"/exclude/|a\.py",
)
@patch("black.find_project_root", lambda *args: (THIS_DIR.resolve(), None))
def test_get_sources_with_stdin_filename(self) -> None: def test_get_sources_with_stdin_filename(self) -> None:
src = ["-"] src = ["-"]
stdin_filename = str(THIS_DIR / "data/collections.py") stdin_filename = str(THIS_DIR / "data/collections.py")
expected = [f"__BLACK_STDIN_FILENAME__{stdin_filename}"] expected = [f"__BLACK_STDIN_FILENAME__{stdin_filename}"]
assert_collected_sources( assert_collected_sources(
src, src,
expected, root=THIS_DIR.resolve(),
expected=expected,
exclude=r"/exclude/a\.py", exclude=r"/exclude/a\.py",
stdin_filename=stdin_filename, stdin_filename=stdin_filename,
) )
@patch("black.find_project_root", lambda *args: (THIS_DIR.resolve(), None))
def test_get_sources_with_stdin_filename_and_exclude(self) -> None: def test_get_sources_with_stdin_filename_and_exclude(self) -> None:
# Exclude shouldn't exclude stdin_filename since it is mimicking the # Exclude shouldn't exclude stdin_filename since it is mimicking the
# file being passed directly. This is the same as # file being passed directly. This is the same as
@ -2669,12 +2730,12 @@ def test_get_sources_with_stdin_filename_and_exclude(self) -> None:
expected = [f"__BLACK_STDIN_FILENAME__{stdin_filename}"] expected = [f"__BLACK_STDIN_FILENAME__{stdin_filename}"]
assert_collected_sources( assert_collected_sources(
src, src,
expected, root=THIS_DIR.resolve(),
expected=expected,
exclude=r"/exclude/|a\.py", exclude=r"/exclude/|a\.py",
stdin_filename=stdin_filename, stdin_filename=stdin_filename,
) )
@patch("black.find_project_root", lambda *args: (THIS_DIR.resolve(), None))
def test_get_sources_with_stdin_filename_and_extend_exclude(self) -> None: def test_get_sources_with_stdin_filename_and_extend_exclude(self) -> None:
# Extend exclude shouldn't exclude stdin_filename since it is mimicking the # Extend exclude shouldn't exclude stdin_filename since it is mimicking the
# file being passed directly. This is the same as # file being passed directly. This is the same as
@ -2685,12 +2746,12 @@ def test_get_sources_with_stdin_filename_and_extend_exclude(self) -> None:
expected = [f"__BLACK_STDIN_FILENAME__{stdin_filename}"] expected = [f"__BLACK_STDIN_FILENAME__{stdin_filename}"]
assert_collected_sources( assert_collected_sources(
src, src,
expected, root=THIS_DIR.resolve(),
expected=expected,
extend_exclude=r"/exclude/|a\.py", extend_exclude=r"/exclude/|a\.py",
stdin_filename=stdin_filename, stdin_filename=stdin_filename,
) )
@patch("black.find_project_root", lambda *args: (THIS_DIR.resolve(), None))
def test_get_sources_with_stdin_filename_and_force_exclude(self) -> None: def test_get_sources_with_stdin_filename_and_force_exclude(self) -> None:
# Force exclude should exclude the file when passing it through # Force exclude should exclude the file when passing it through
# stdin_filename # stdin_filename
@ -2698,27 +2759,32 @@ def test_get_sources_with_stdin_filename_and_force_exclude(self) -> None:
stdin_filename = str(path / "b/exclude/a.py") stdin_filename = str(path / "b/exclude/a.py")
assert_collected_sources( assert_collected_sources(
src=["-"], src=["-"],
root=THIS_DIR.resolve(),
expected=[], expected=[],
force_exclude=r"/exclude/|a\.py", force_exclude=r"/exclude/|a\.py",
stdin_filename=stdin_filename, stdin_filename=stdin_filename,
) )
@patch("black.find_project_root", lambda *args: (THIS_DIR.resolve(), None))
def test_get_sources_with_stdin_filename_and_force_exclude_and_symlink( def test_get_sources_with_stdin_filename_and_force_exclude_and_symlink(
self, self,
) -> None: ) -> None:
# Force exclude should exclude a symlink based on the symlink, not its target # Force exclude should exclude a symlink based on the symlink, not its target
path = THIS_DIR / "data" / "include_exclude_tests" with TemporaryDirectory() as tempdir:
stdin_filename = str(path / "symlink.py") tmp = Path(tempdir).resolve()
expected = [f"__BLACK_STDIN_FILENAME__{stdin_filename}"] (tmp / "exclude").mkdir()
target = path / "b/exclude/a.py" (tmp / "exclude" / "a.py").write_text("print('hello')", encoding="utf-8")
with patch("pathlib.Path.resolve", return_value=target): (tmp / "symlink.py").symlink_to(tmp / "exclude" / "a.py")
assert_collected_sources(
src=["-"], stdin_filename = str(tmp / "symlink.py")
expected=expected, expected = [f"__BLACK_STDIN_FILENAME__{stdin_filename}"]
force_exclude=r"exclude/a\.py", with change_directory(tmp):
stdin_filename=stdin_filename, assert_collected_sources(
) src=["-"],
root=tmp,
expected=expected,
force_exclude=r"exclude/a\.py",
stdin_filename=stdin_filename,
)
class TestDeFactoAPI: class TestDeFactoAPI: