Lazily import parallelized format modules
`black.reformat_many` depends on a lot of slow-to-import modules. When formatting just a single file, the time paid to import those modules is totally wasted. So I moved `black.reformat_many` and its helpers to `black.concurrency`, which is now *only* imported if there's more than one file to reformat. This way, running Black over a single file is snappier. Here are the numbers before and after this patch running `python -m black --version`: - interpreted: 411 ms +- 9 ms -> 342 ms +- 7 ms: 1.20x faster - compiled: 365 ms +- 15 ms -> 304 ms +- 7 ms: 1.20x faster Co-authored-by: Fabio Zadrozny <fabiofz@gmail.com>
This commit is contained in:
parent
c47b91f513
commit
e269f44b25
@ -87,6 +87,8 @@
|
|||||||
|
|
||||||
<!-- Changes that improve Black's performance. -->
|
<!-- Changes that improve Black's performance. -->
|
||||||
|
|
||||||
|
- Reduce Black's startup time when formatting a single file by 15-30% (#3211)
|
||||||
|
|
||||||
## 22.6.0
|
## 22.6.0
|
||||||
|
|
||||||
### Style
|
### Style
|
||||||
|
@ -52,7 +52,7 @@ Formatting
|
|||||||
|
|
||||||
.. autofunction:: black.reformat_one
|
.. autofunction:: black.reformat_one
|
||||||
|
|
||||||
.. autofunction:: black.schedule_formatting
|
.. autofunction:: black.concurrency.schedule_formatting
|
||||||
|
|
||||||
File operations
|
File operations
|
||||||
---------------
|
---------------
|
||||||
@ -173,7 +173,7 @@ Utilities
|
|||||||
|
|
||||||
.. autofunction:: black.linegen.should_split_line
|
.. autofunction:: black.linegen.should_split_line
|
||||||
|
|
||||||
.. autofunction:: black.shutdown
|
.. autofunction:: black.concurrency.shutdown
|
||||||
|
|
||||||
.. autofunction:: black.strings.sub_twice
|
.. autofunction:: black.strings.sub_twice
|
||||||
|
|
||||||
|
@ -1,10 +1,8 @@
|
|||||||
import asyncio
|
|
||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import platform
|
import platform
|
||||||
import re
|
import re
|
||||||
import signal
|
|
||||||
import sys
|
import sys
|
||||||
import tokenize
|
import tokenize
|
||||||
import traceback
|
import traceback
|
||||||
@ -13,10 +11,8 @@
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from json.decoder import JSONDecodeError
|
from json.decoder import JSONDecodeError
|
||||||
from multiprocessing import Manager, freeze_support
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import (
|
from typing import (
|
||||||
TYPE_CHECKING,
|
|
||||||
Any,
|
Any,
|
||||||
Dict,
|
Dict,
|
||||||
Generator,
|
Generator,
|
||||||
@ -32,15 +28,19 @@
|
|||||||
Union,
|
Union,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if sys.version_info >= (3, 8):
|
||||||
|
from typing import Final
|
||||||
|
else:
|
||||||
|
from typing_extensions import Final
|
||||||
|
|
||||||
import click
|
import click
|
||||||
from click.core import ParameterSource
|
from click.core import ParameterSource
|
||||||
from mypy_extensions import mypyc_attr
|
from mypy_extensions import mypyc_attr
|
||||||
from pathspec.patterns.gitwildmatch import GitWildMatchPatternError
|
from pathspec.patterns.gitwildmatch import GitWildMatchPatternError
|
||||||
|
|
||||||
from _black_version import version as __version__
|
from _black_version import version as __version__
|
||||||
from black.cache import Cache, filter_cached, get_cache_info, read_cache, write_cache
|
from black.cache import Cache, get_cache_info, read_cache, write_cache
|
||||||
from black.comments import normalize_fmt_off
|
from black.comments import normalize_fmt_off
|
||||||
from black.concurrency import cancel, maybe_install_uvloop, shutdown
|
|
||||||
from black.const import (
|
from black.const import (
|
||||||
DEFAULT_EXCLUDES,
|
DEFAULT_EXCLUDES,
|
||||||
DEFAULT_INCLUDES,
|
DEFAULT_INCLUDES,
|
||||||
@ -91,10 +91,8 @@
|
|||||||
from blib2to3.pgen2 import token
|
from blib2to3.pgen2 import token
|
||||||
from blib2to3.pytree import Leaf, Node
|
from blib2to3.pytree import Leaf, Node
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from concurrent.futures import Executor
|
|
||||||
|
|
||||||
COMPILED = Path(__file__).suffix in (".pyd", ".so")
|
COMPILED = Path(__file__).suffix in (".pyd", ".so")
|
||||||
|
DEFAULT_WORKERS: Final = os.cpu_count()
|
||||||
|
|
||||||
# types
|
# types
|
||||||
FileContent = str
|
FileContent = str
|
||||||
@ -125,8 +123,6 @@ def from_configuration(
|
|||||||
# Legacy name, left for integrations.
|
# Legacy name, left for integrations.
|
||||||
FileMode = Mode
|
FileMode = Mode
|
||||||
|
|
||||||
DEFAULT_WORKERS = os.cpu_count()
|
|
||||||
|
|
||||||
|
|
||||||
def read_pyproject_toml(
|
def read_pyproject_toml(
|
||||||
ctx: click.Context, param: click.Parameter, value: Optional[str]
|
ctx: click.Context, param: click.Parameter, value: Optional[str]
|
||||||
@ -592,6 +588,8 @@ def main( # noqa: C901
|
|||||||
report=report,
|
report=report,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
from black.concurrency import reformat_many
|
||||||
|
|
||||||
reformat_many(
|
reformat_many(
|
||||||
sources=sources,
|
sources=sources,
|
||||||
fast=fast,
|
fast=fast,
|
||||||
@ -776,132 +774,6 @@ def reformat_one(
|
|||||||
report.failed(src, str(exc))
|
report.failed(src, str(exc))
|
||||||
|
|
||||||
|
|
||||||
# diff-shades depends on being able to monkeypatch this function to operate. I know it's
|
|
||||||
# not ideal, but this shouldn't cause any issues ... hopefully. ~ichard26
|
|
||||||
@mypyc_attr(patchable=True)
|
|
||||||
def reformat_many(
|
|
||||||
sources: Set[Path],
|
|
||||||
fast: bool,
|
|
||||||
write_back: WriteBack,
|
|
||||||
mode: Mode,
|
|
||||||
report: "Report",
|
|
||||||
workers: Optional[int],
|
|
||||||
) -> None:
|
|
||||||
"""Reformat multiple files using a ProcessPoolExecutor."""
|
|
||||||
from concurrent.futures import Executor, ProcessPoolExecutor, ThreadPoolExecutor
|
|
||||||
|
|
||||||
executor: Executor
|
|
||||||
worker_count = workers if workers is not None else DEFAULT_WORKERS
|
|
||||||
if sys.platform == "win32":
|
|
||||||
# Work around https://bugs.python.org/issue26903
|
|
||||||
assert worker_count is not None
|
|
||||||
worker_count = min(worker_count, 60)
|
|
||||||
try:
|
|
||||||
executor = ProcessPoolExecutor(max_workers=worker_count)
|
|
||||||
except (ImportError, NotImplementedError, OSError):
|
|
||||||
# we arrive here if the underlying system does not support multi-processing
|
|
||||||
# like in AWS Lambda or Termux, in which case we gracefully fallback to
|
|
||||||
# a ThreadPoolExecutor with just a single worker (more workers would not do us
|
|
||||||
# any good due to the Global Interpreter Lock)
|
|
||||||
executor = ThreadPoolExecutor(max_workers=1)
|
|
||||||
|
|
||||||
loop = asyncio.new_event_loop()
|
|
||||||
asyncio.set_event_loop(loop)
|
|
||||||
try:
|
|
||||||
loop.run_until_complete(
|
|
||||||
schedule_formatting(
|
|
||||||
sources=sources,
|
|
||||||
fast=fast,
|
|
||||||
write_back=write_back,
|
|
||||||
mode=mode,
|
|
||||||
report=report,
|
|
||||||
loop=loop,
|
|
||||||
executor=executor,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
finally:
|
|
||||||
try:
|
|
||||||
shutdown(loop)
|
|
||||||
finally:
|
|
||||||
asyncio.set_event_loop(None)
|
|
||||||
if executor is not None:
|
|
||||||
executor.shutdown()
|
|
||||||
|
|
||||||
|
|
||||||
async def schedule_formatting(
|
|
||||||
sources: Set[Path],
|
|
||||||
fast: bool,
|
|
||||||
write_back: WriteBack,
|
|
||||||
mode: Mode,
|
|
||||||
report: "Report",
|
|
||||||
loop: asyncio.AbstractEventLoop,
|
|
||||||
executor: "Executor",
|
|
||||||
) -> None:
|
|
||||||
"""Run formatting of `sources` in parallel using the provided `executor`.
|
|
||||||
|
|
||||||
(Use ProcessPoolExecutors for actual parallelism.)
|
|
||||||
|
|
||||||
`write_back`, `fast`, and `mode` options are passed to
|
|
||||||
:func:`format_file_in_place`.
|
|
||||||
"""
|
|
||||||
cache: Cache = {}
|
|
||||||
if write_back not in (WriteBack.DIFF, WriteBack.COLOR_DIFF):
|
|
||||||
cache = read_cache(mode)
|
|
||||||
sources, cached = filter_cached(cache, sources)
|
|
||||||
for src in sorted(cached):
|
|
||||||
report.done(src, Changed.CACHED)
|
|
||||||
if not sources:
|
|
||||||
return
|
|
||||||
|
|
||||||
cancelled = []
|
|
||||||
sources_to_cache = []
|
|
||||||
lock = None
|
|
||||||
if write_back in (WriteBack.DIFF, WriteBack.COLOR_DIFF):
|
|
||||||
# For diff output, we need locks to ensure we don't interleave output
|
|
||||||
# from different processes.
|
|
||||||
manager = Manager()
|
|
||||||
lock = manager.Lock()
|
|
||||||
tasks = {
|
|
||||||
asyncio.ensure_future(
|
|
||||||
loop.run_in_executor(
|
|
||||||
executor, format_file_in_place, src, fast, mode, write_back, lock
|
|
||||||
)
|
|
||||||
): src
|
|
||||||
for src in sorted(sources)
|
|
||||||
}
|
|
||||||
pending = tasks.keys()
|
|
||||||
try:
|
|
||||||
loop.add_signal_handler(signal.SIGINT, cancel, pending)
|
|
||||||
loop.add_signal_handler(signal.SIGTERM, cancel, pending)
|
|
||||||
except NotImplementedError:
|
|
||||||
# There are no good alternatives for these on Windows.
|
|
||||||
pass
|
|
||||||
while pending:
|
|
||||||
done, _ = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
|
|
||||||
for task in done:
|
|
||||||
src = tasks.pop(task)
|
|
||||||
if task.cancelled():
|
|
||||||
cancelled.append(task)
|
|
||||||
elif task.exception():
|
|
||||||
report.failed(src, str(task.exception()))
|
|
||||||
else:
|
|
||||||
changed = Changed.YES if task.result() else Changed.NO
|
|
||||||
# If the file was written back or was successfully checked as
|
|
||||||
# well-formatted, store this information in the cache.
|
|
||||||
if write_back is WriteBack.YES or (
|
|
||||||
write_back is WriteBack.CHECK and changed is Changed.NO
|
|
||||||
):
|
|
||||||
sources_to_cache.append(src)
|
|
||||||
report.done(src, changed)
|
|
||||||
if cancelled:
|
|
||||||
if sys.version_info >= (3, 7):
|
|
||||||
await asyncio.gather(*cancelled, return_exceptions=True)
|
|
||||||
else:
|
|
||||||
await asyncio.gather(*cancelled, loop=loop, return_exceptions=True)
|
|
||||||
if sources_to_cache:
|
|
||||||
write_cache(cache, sources_to_cache, mode)
|
|
||||||
|
|
||||||
|
|
||||||
def format_file_in_place(
|
def format_file_in_place(
|
||||||
src: Path,
|
src: Path,
|
||||||
fast: bool,
|
fast: bool,
|
||||||
@ -1506,8 +1378,11 @@ def patch_click() -> None:
|
|||||||
|
|
||||||
|
|
||||||
def patched_main() -> None:
|
def patched_main() -> None:
|
||||||
maybe_install_uvloop()
|
if sys.platform == "win32" and getattr(sys, "frozen", False):
|
||||||
freeze_support()
|
from multiprocessing import freeze_support
|
||||||
|
|
||||||
|
freeze_support()
|
||||||
|
|
||||||
patch_click()
|
patch_click()
|
||||||
main()
|
main()
|
||||||
|
|
||||||
|
@ -1,9 +1,25 @@
|
|||||||
|
"""
|
||||||
|
Formatting many files at once via multiprocessing. Contains entrypoint and utilities.
|
||||||
|
|
||||||
|
NOTE: this module is only imported if we need to format several files at once.
|
||||||
|
"""
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
|
import signal
|
||||||
import sys
|
import sys
|
||||||
from typing import Any, Iterable
|
from concurrent.futures import Executor, ProcessPoolExecutor, ThreadPoolExecutor
|
||||||
|
from multiprocessing import Manager
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Iterable, Optional, Set
|
||||||
|
|
||||||
|
from mypy_extensions import mypyc_attr
|
||||||
|
|
||||||
|
from black import DEFAULT_WORKERS, WriteBack, format_file_in_place
|
||||||
|
from black.cache import Cache, filter_cached, read_cache, write_cache
|
||||||
|
from black.mode import Mode
|
||||||
from black.output import err
|
from black.output import err
|
||||||
|
from black.report import Changed, Report
|
||||||
|
|
||||||
|
|
||||||
def maybe_install_uvloop() -> None:
|
def maybe_install_uvloop() -> None:
|
||||||
@ -11,7 +27,6 @@ def maybe_install_uvloop() -> None:
|
|||||||
|
|
||||||
This is called only from command-line entry points to avoid
|
This is called only from command-line entry points to avoid
|
||||||
interfering with the parent process if Black is used as a library.
|
interfering with the parent process if Black is used as a library.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
import uvloop
|
import uvloop
|
||||||
@ -55,3 +70,129 @@ def shutdown(loop: asyncio.AbstractEventLoop) -> None:
|
|||||||
cf_logger = logging.getLogger("concurrent.futures")
|
cf_logger = logging.getLogger("concurrent.futures")
|
||||||
cf_logger.setLevel(logging.CRITICAL)
|
cf_logger.setLevel(logging.CRITICAL)
|
||||||
loop.close()
|
loop.close()
|
||||||
|
|
||||||
|
|
||||||
|
# diff-shades depends on being able to monkeypatch this function to operate. I know it's
|
||||||
|
# not ideal, but this shouldn't cause any issues ... hopefully. ~ichard26
|
||||||
|
@mypyc_attr(patchable=True)
|
||||||
|
def reformat_many(
|
||||||
|
sources: Set[Path],
|
||||||
|
fast: bool,
|
||||||
|
write_back: WriteBack,
|
||||||
|
mode: Mode,
|
||||||
|
report: Report,
|
||||||
|
workers: Optional[int],
|
||||||
|
) -> None:
|
||||||
|
"""Reformat multiple files using a ProcessPoolExecutor."""
|
||||||
|
maybe_install_uvloop()
|
||||||
|
|
||||||
|
executor: Executor
|
||||||
|
worker_count = workers if workers is not None else DEFAULT_WORKERS
|
||||||
|
if sys.platform == "win32":
|
||||||
|
# Work around https://bugs.python.org/issue26903
|
||||||
|
assert worker_count is not None
|
||||||
|
worker_count = min(worker_count, 60)
|
||||||
|
try:
|
||||||
|
executor = ProcessPoolExecutor(max_workers=worker_count)
|
||||||
|
except (ImportError, NotImplementedError, OSError):
|
||||||
|
# we arrive here if the underlying system does not support multi-processing
|
||||||
|
# like in AWS Lambda or Termux, in which case we gracefully fallback to
|
||||||
|
# a ThreadPoolExecutor with just a single worker (more workers would not do us
|
||||||
|
# any good due to the Global Interpreter Lock)
|
||||||
|
executor = ThreadPoolExecutor(max_workers=1)
|
||||||
|
|
||||||
|
loop = asyncio.new_event_loop()
|
||||||
|
asyncio.set_event_loop(loop)
|
||||||
|
try:
|
||||||
|
loop.run_until_complete(
|
||||||
|
schedule_formatting(
|
||||||
|
sources=sources,
|
||||||
|
fast=fast,
|
||||||
|
write_back=write_back,
|
||||||
|
mode=mode,
|
||||||
|
report=report,
|
||||||
|
loop=loop,
|
||||||
|
executor=executor,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
try:
|
||||||
|
shutdown(loop)
|
||||||
|
finally:
|
||||||
|
asyncio.set_event_loop(None)
|
||||||
|
if executor is not None:
|
||||||
|
executor.shutdown()
|
||||||
|
|
||||||
|
|
||||||
|
async def schedule_formatting(
|
||||||
|
sources: Set[Path],
|
||||||
|
fast: bool,
|
||||||
|
write_back: WriteBack,
|
||||||
|
mode: Mode,
|
||||||
|
report: "Report",
|
||||||
|
loop: asyncio.AbstractEventLoop,
|
||||||
|
executor: "Executor",
|
||||||
|
) -> None:
|
||||||
|
"""Run formatting of `sources` in parallel using the provided `executor`.
|
||||||
|
|
||||||
|
(Use ProcessPoolExecutors for actual parallelism.)
|
||||||
|
|
||||||
|
`write_back`, `fast`, and `mode` options are passed to
|
||||||
|
:func:`format_file_in_place`.
|
||||||
|
"""
|
||||||
|
cache: Cache = {}
|
||||||
|
if write_back not in (WriteBack.DIFF, WriteBack.COLOR_DIFF):
|
||||||
|
cache = read_cache(mode)
|
||||||
|
sources, cached = filter_cached(cache, sources)
|
||||||
|
for src in sorted(cached):
|
||||||
|
report.done(src, Changed.CACHED)
|
||||||
|
if not sources:
|
||||||
|
return
|
||||||
|
|
||||||
|
cancelled = []
|
||||||
|
sources_to_cache = []
|
||||||
|
lock = None
|
||||||
|
if write_back in (WriteBack.DIFF, WriteBack.COLOR_DIFF):
|
||||||
|
# For diff output, we need locks to ensure we don't interleave output
|
||||||
|
# from different processes.
|
||||||
|
manager = Manager()
|
||||||
|
lock = manager.Lock()
|
||||||
|
tasks = {
|
||||||
|
asyncio.ensure_future(
|
||||||
|
loop.run_in_executor(
|
||||||
|
executor, format_file_in_place, src, fast, mode, write_back, lock
|
||||||
|
)
|
||||||
|
): src
|
||||||
|
for src in sorted(sources)
|
||||||
|
}
|
||||||
|
pending = tasks.keys()
|
||||||
|
try:
|
||||||
|
loop.add_signal_handler(signal.SIGINT, cancel, pending)
|
||||||
|
loop.add_signal_handler(signal.SIGTERM, cancel, pending)
|
||||||
|
except NotImplementedError:
|
||||||
|
# There are no good alternatives for these on Windows.
|
||||||
|
pass
|
||||||
|
while pending:
|
||||||
|
done, _ = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
|
||||||
|
for task in done:
|
||||||
|
src = tasks.pop(task)
|
||||||
|
if task.cancelled():
|
||||||
|
cancelled.append(task)
|
||||||
|
elif task.exception():
|
||||||
|
report.failed(src, str(task.exception()))
|
||||||
|
else:
|
||||||
|
changed = Changed.YES if task.result() else Changed.NO
|
||||||
|
# If the file was written back or was successfully checked as
|
||||||
|
# well-formatted, store this information in the cache.
|
||||||
|
if write_back is WriteBack.YES or (
|
||||||
|
write_back is WriteBack.CHECK and changed is Changed.NO
|
||||||
|
):
|
||||||
|
sources_to_cache.append(src)
|
||||||
|
report.done(src, changed)
|
||||||
|
if cancelled:
|
||||||
|
if sys.version_info >= (3, 7):
|
||||||
|
await asyncio.gather(*cancelled, return_exceptions=True)
|
||||||
|
else:
|
||||||
|
await asyncio.gather(*cancelled, loop=loop, return_exceptions=True)
|
||||||
|
if sources_to_cache:
|
||||||
|
write_cache(cache, sources_to_cache, mode)
|
||||||
|
@ -1763,7 +1763,9 @@ def test_output_locking_when_writeback_diff(self, color: bool) -> None:
|
|||||||
src = (workspace / f"test{tag}.py").resolve()
|
src = (workspace / f"test{tag}.py").resolve()
|
||||||
with src.open("w") as fobj:
|
with src.open("w") as fobj:
|
||||||
fobj.write("print('hello')")
|
fobj.write("print('hello')")
|
||||||
with patch("black.Manager", wraps=multiprocessing.Manager) as mgr:
|
with patch(
|
||||||
|
"black.concurrency.Manager", wraps=multiprocessing.Manager
|
||||||
|
) as mgr:
|
||||||
cmd = ["--diff", str(workspace)]
|
cmd = ["--diff", str(workspace)]
|
||||||
if color:
|
if color:
|
||||||
cmd.append("--color")
|
cmd.append("--color")
|
||||||
@ -1810,7 +1812,7 @@ def test_filter_cached(self) -> None:
|
|||||||
str(cached): black.get_cache_info(cached),
|
str(cached): black.get_cache_info(cached),
|
||||||
str(cached_but_changed): (0.0, 0),
|
str(cached_but_changed): (0.0, 0),
|
||||||
}
|
}
|
||||||
todo, done = black.filter_cached(
|
todo, done = black.cache.filter_cached(
|
||||||
cache, {uncached, cached, cached_but_changed}
|
cache, {uncached, cached, cached_but_changed}
|
||||||
)
|
)
|
||||||
assert todo == {uncached, cached_but_changed}
|
assert todo == {uncached, cached_but_changed}
|
||||||
|
Loading…
Reference in New Issue
Block a user