black/src/black_primer/lib.py

import asyncio
import errno
import json
import logging
import os
import stat
import sys
from functools import partial
from pathlib import Path
from platform import system
from shutil import rmtree, which
from subprocess import CalledProcessError
from sys import version_info
from tempfile import TemporaryDirectory
from typing import (
    Any,
    Callable,
    Dict,
    List,
    NamedTuple,
    Optional,
    Sequence,
    Tuple,
    Union,
)
from urllib.parse import urlparse

import click


TEN_MINUTES_SECONDS = 600
WINDOWS = system() == "Windows"
BLACK_BINARY = "black.exe" if WINDOWS else "black"
GIT_BINARY = "git.exe" if WINDOWS else "git"
LOG = logging.getLogger(__name__)


# Windows needs a ProactorEventLoop if you want to exec subprocesses
# Starting with 3.8 this is the default - can remove when Black >= 3.8
# mypy only respects sys.platform if directly in the evaluation
# https://mypy.readthedocs.io/en/latest/common_issues.html#python-version-and-system-platform-checks  # noqa: B950
if sys.platform == "win32":
    asyncio.set_event_loop(asyncio.ProactorEventLoop())


class Results(NamedTuple):
    stats: Dict[str, int] = {}
    failed_projects: Dict[str, CalledProcessError] = {}


async def _gen_check_output(
    cmd: Sequence[str],
    timeout: float = TEN_MINUTES_SECONDS,
    env: Optional[Dict[str, str]] = None,
    cwd: Optional[Path] = None,
    stdin: Optional[bytes] = None,
) -> Tuple[bytes, bytes]:
    process = await asyncio.create_subprocess_exec(
        *cmd,
        stdin=asyncio.subprocess.PIPE,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.STDOUT,
        env=env,
        cwd=cwd,
    )
    try:
        (stdout, stderr) = await asyncio.wait_for(process.communicate(stdin), timeout)
    except asyncio.TimeoutError:
        process.kill()
        await process.wait()
        raise

    # A non-optional timeout was supplied to asyncio.wait_for, guaranteeing
    # a timeout or completed process.  A terminated Python process will have a
    # non-empty returncode value.
    assert process.returncode is not None

    if process.returncode != 0:
        cmd_str = " ".join(cmd)
        raise CalledProcessError(
            process.returncode, cmd_str, output=stdout, stderr=stderr
        )

    return (stdout, stderr)


def analyze_results(project_count: int, results: Results) -> int:
    failed_pct = round(((results.stats["failed"] / project_count) * 100), 2)
    success_pct = round(((results.stats["success"] / project_count) * 100), 2)

    click.secho("-- primer results 📊 --\n", bold=True)
    click.secho(
        f"{results.stats['success']} / {project_count} succeeded ({success_pct}%) ✅",
        bold=True,
        fg="green",
    )
    click.secho(
        f"{results.stats['failed']} / {project_count} FAILED ({failed_pct}%) 💩",
        bold=bool(results.stats["failed"]),
        fg="red",
    )
    s = "" if results.stats["disabled"] == 1 else "s"
    click.echo(f" - {results.stats['disabled']} project{s} disabled by config")
    s = "" if results.stats["wrong_py_ver"] == 1 else "s"
    click.echo(
        f" - {results.stats['wrong_py_ver']} project{s} skipped due to Python version"
    )
    click.echo(
        f" - {results.stats['skipped_long_checkout']} skipped due to long checkout"
    )

    if results.failed_projects:
        click.secho("\nFailed projects:\n", bold=True)

    for project_name, project_cpe in results.failed_projects.items():
        print(f"## {project_name}:")
        print(f" - Returned {project_cpe.returncode}")
        if project_cpe.stderr:
            print(f" - stderr:\n{project_cpe.stderr.decode('utf8')}")
        if project_cpe.stdout:
            print(f" - stdout:\n{project_cpe.stdout.decode('utf8')}")
        print("")

    return results.stats["failed"]


def _flatten_cli_args(cli_args: List[Union[Sequence[str], str]]) -> List[str]:
    """Allow a user to put long arguments into a list of strs
    to make the JSON human readable"""
    flat_args = []
    for arg in cli_args:
        if isinstance(arg, str):
            flat_args.append(arg)
            continue

        args_as_str = "".join(arg)
        flat_args.append(args_as_str)

    return flat_args


async def black_run(
    project_name: str,
    repo_path: Optional[Path],
    project_config: Dict[str, Any],
    results: Results,
    no_diff: bool = False,
) -> None:
    """Run Black and record failures"""
    if not repo_path:
        results.stats["failed"] += 1
        results.failed_projects[project_name] = CalledProcessError(
            69, [], f"{project_name} has no repo_path: {repo_path}".encode(), b""
        )
        return

    stdin_test = project_name.upper() == "STDIN"
    cmd = [str(which(BLACK_BINARY))]
    if "cli_arguments" in project_config and project_config["cli_arguments"]:
        cmd.extend(_flatten_cli_args(project_config["cli_arguments"]))
    cmd.append("--check")
    if not no_diff:
        cmd.append("--diff")

    # Workout if we should read in a python file or search from cwd
    stdin = None
    if stdin_test:
        cmd.append("-")
        stdin = repo_path.read_bytes()
    elif "base_path" in project_config:
        cmd.append(project_config["base_path"])
    else:
        cmd.append(".")

    timeout = (
        project_config["timeout_seconds"]
        if "timeout_seconds" in project_config
        else TEN_MINUTES_SECONDS
    )
    with TemporaryDirectory() as tmp_path:
        # Prevent reading top-level user configs by manipulating environment variables
        env = {
            **os.environ,
            "XDG_CONFIG_HOME": tmp_path,  # Unix-like
            "USERPROFILE": tmp_path,  # Windows (changes `Path.home()` output)
        }

        cwd_path = repo_path.parent if stdin_test else repo_path
        try:
            LOG.debug(f"Running black for {project_name}: {' '.join(cmd)}")
            _stdout, _stderr = await _gen_check_output(
                cmd, cwd=cwd_path, env=env, stdin=stdin, timeout=timeout
            )
        except asyncio.TimeoutError:
            results.stats["failed"] += 1
            LOG.error(f"Running black for {repo_path} timed out ({cmd})")
        except CalledProcessError as cpe:
            # TODO: Tune for smarter for higher signal
            # If any other return value than 1 we raise - can disable project in config
            if cpe.returncode == 1:
                if not project_config["expect_formatting_changes"]:
                    results.stats["failed"] += 1
                    results.failed_projects[repo_path.name] = cpe
                else:
                    results.stats["success"] += 1
                return
            elif cpe.returncode > 1:
                results.stats["failed"] += 1
                results.failed_projects[repo_path.name] = cpe
                return

            LOG.error(f"Unknown error with {repo_path}")
            raise

    # If we get here and expect formatting changes something is up
    if project_config["expect_formatting_changes"]:
        results.stats["failed"] += 1
        results.failed_projects[repo_path.name] = CalledProcessError(
            0, cmd, b"Expected formatting changes but didn't get any!", b""
        )
        return

    results.stats["success"] += 1


async def git_checkout_or_rebase(
    work_path: Path,
    project_config: Dict[str, Any],
    rebase: bool = False,
    *,
    depth: int = 1,
) -> Optional[Path]:
    """git Clone project or rebase"""
    git_bin = str(which(GIT_BINARY))
    if not git_bin:
        LOG.error("No git binary found")
        return None

    repo_url_parts = urlparse(project_config["git_clone_url"])
    path_parts = repo_url_parts.path[1:].split("/", maxsplit=1)

    repo_path: Path = work_path / path_parts[1].replace(".git", "")
    cmd = [git_bin, "clone", "--depth", str(depth), project_config["git_clone_url"]]
    cwd = work_path
    if repo_path.exists() and rebase:
        cmd = [git_bin, "pull", "--rebase"]
        cwd = repo_path
    elif repo_path.exists():
        return repo_path

    try:
        _stdout, _stderr = await _gen_check_output(cmd, cwd=cwd)
    except (asyncio.TimeoutError, CalledProcessError) as e:
        LOG.error(f"Unable to git clone / pull {project_config['git_clone_url']}: {e}")
        return None

    return repo_path


def handle_PermissionError(
    func: Callable[..., None], path: Path, exc: Tuple[Any, Any, Any]
) -> None:
    """
    Handle PermissionError during shutil.rmtree.

    This checks if the erroring function is either 'os.rmdir' or 'os.unlink', and that
    the error was EACCES (i.e. Permission denied). If true, the path is set writable,
    readable, and executable by everyone. Finally, it tries the error causing delete
    operation again.

    If the check is false, then the original error will be reraised as this function
    can't handle it.
    """
    excvalue = exc[1]
    LOG.debug(f"Handling {excvalue} from {func.__name__}... ")
    if func in (os.rmdir, os.unlink) and excvalue.errno == errno.EACCES:
        LOG.debug(f"Setting {path} writable, readable, and executable by everyone... ")
        os.chmod(path, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)  # chmod 0777
        func(path)  # Try the error causing delete operation again
    else:
        raise


async def load_projects_queue(
    config_path: Path,
    projects_to_run: List[str],
) -> Tuple[Dict[str, Any], asyncio.Queue]:
    """Load project config and fill queue with all the project names"""
    with config_path.open("r") as cfp:
        config = json.load(cfp)

    # TODO: Offer more options here
    # e.g. Run on X random packages etc.
    queue: asyncio.Queue = asyncio.Queue(maxsize=len(projects_to_run))
    for project in projects_to_run:
        await queue.put(project)

    return config, queue


async def project_runner(
    idx: int,
    config: Dict[str, Any],
    queue: asyncio.Queue,
    work_path: Path,
    results: Results,
    long_checkouts: bool = False,
    rebase: bool = False,
    keep: bool = False,
    no_diff: bool = False,
) -> None:
    """Check out project and run Black on it + record result"""
    loop = asyncio.get_event_loop()
    py_version = f"{version_info[0]}.{version_info[1]}"
    while True:
        try:
            project_name = queue.get_nowait()
        except asyncio.QueueEmpty:
            LOG.debug(f"project_runner {idx} exiting")
            return
        LOG.debug(f"worker {idx} working on {project_name}")

        project_config = config["projects"][project_name]

        # Check if disabled by config
        if "disabled" in project_config and project_config["disabled"]:
            results.stats["disabled"] += 1
            LOG.info(f"Skipping {project_name} as it's disabled via config")
            continue

        # Check if we should run on this version of Python
        if (
            "all" not in project_config["py_versions"]
            and py_version not in project_config["py_versions"]
        ):
            results.stats["wrong_py_ver"] += 1
            LOG.debug(f"Skipping {project_name} as it's not enabled for {py_version}")
            continue

        # Check if we're doing big projects / long checkouts
        if not long_checkouts and project_config["long_checkout"]:
            results.stats["skipped_long_checkout"] += 1
            LOG.debug(f"Skipping {project_name} as it's configured as a long checkout")
            continue

        repo_path: Optional[Path] = Path(__file__)
        stdin_project = project_name.upper() == "STDIN"
        if not stdin_project:
            repo_path = await git_checkout_or_rebase(work_path, project_config, rebase)
            if not repo_path:
                continue
        await black_run(project_name, repo_path, project_config, results, no_diff)

        if not keep and not stdin_project:
            LOG.debug(f"Removing {repo_path}")
            rmtree_partial = partial(
                rmtree, path=repo_path, onerror=handle_PermissionError
            )
            await loop.run_in_executor(None, rmtree_partial)

        LOG.info(f"Finished {project_name}")


async def process_queue(
    config_file: str,
    work_path: Path,
    workers: int,
    projects_to_run: List[str],
    keep: bool = False,
    long_checkouts: bool = False,
    rebase: bool = False,
    no_diff: bool = False,
) -> int:
    """
    Process the queue with X workers and evaluate results
    - Success is guaged via the config "expect_formatting_changes"

    Integer return equals the number of failed projects
    """
    results = Results()
    results.stats["disabled"] = 0
    results.stats["failed"] = 0
    results.stats["skipped_long_checkout"] = 0
    results.stats["success"] = 0
    results.stats["wrong_py_ver"] = 0

    config, queue = await load_projects_queue(Path(config_file), projects_to_run)
    project_count = queue.qsize()
    s = "" if project_count == 1 else "s"
    LOG.info(f"{project_count} project{s} to run Black over")
    if project_count < 1:
        return -1

    s = "" if workers == 1 else "s"
    LOG.debug(f"Using {workers} parallel worker{s} to run Black")
    # Wait until we finish running all the projects before analyzing
    await asyncio.gather(
        *[
            project_runner(
                i,
                config,
                queue,
                work_path,
                results,
                long_checkouts,
                rebase,
                keep,
                no_diff,
            )
            for i in range(workers)
        ]
    )

    LOG.info("Analyzing results")
    return analyze_results(project_count, results)


if __name__ == "__main__":  # pragma: nocover
    raise NotImplementedError("lib is a library, funnily enough.")