From 4ef85904b551a61bb3be906db9e076d9cccc1d6b Mon Sep 17 00:00:00 2001 From: MeGaGiGaGon <107241144+MeGaGiGaGon@users.noreply.github.com> Date: Mon, 6 Oct 2025 13:38:11 -0700 Subject: [PATCH 1/4] Update concurrency.py --- src/black/concurrency.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/black/concurrency.py b/src/black/concurrency.py index f6a2b8a93be..fd8db5f7097 100644 --- a/src/black/concurrency.py +++ b/src/black/concurrency.py @@ -164,7 +164,7 @@ async def schedule_formatting( executor, format_file_in_place, src, fast, mode, write_back, lock ) ): src - for src in sorted(sources) + for src in sorted(sources, key=lambda f: (-f.stat().st_size, f)) } pending = tasks.keys() try: From 71e7db905dc1e237fa844c2c9877ced5e29da371 Mon Sep 17 00:00:00 2001 From: GiGaGon <107241144+MeGaGiGaGon@users.noreply.github.com> Date: Tue, 14 Oct 2025 10:49:09 -0700 Subject: [PATCH 2/4] Add changelog entry --- CHANGES.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index efe3c22fe04..cf48aaf2cb5 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -34,6 +34,8 @@ +- Format files from largest to smallest to improve benefits from concurrency (#4784) + ### Output From fb45251d7e6da47f6a603e9f867373dce87306b5 Mon Sep 17 00:00:00 2001 From: cobalt <61329810+cobaltt7@users.noreply.github.com> Date: Sun, 26 Oct 2025 17:33:05 -0500 Subject: [PATCH 3/4] reuse stats from cache logic. these are very ugly types for the sake of not enumerating again Signed-off-by: cobalt <61329810+cobaltt7@users.noreply.github.com> --- src/black/__init__.py | 2 +- src/black/cache.py | 31 ++++++++++++++++++------------- src/black/concurrency.py | 29 ++++++++++++++++++++++++----- tests/test_black.py | 32 ++++++++++++++++---------------- 4 files changed, 59 insertions(+), 35 deletions(-) diff --git a/src/black/__init__.py b/src/black/__init__.py index 79541df2149..136344956da 100644 --- a/src/black/__init__.py +++ b/src/black/__init__.py @@ -904,7 +904,7 @@ def reformat_one( else: cache = Cache.read(mode) if write_back not in (WriteBack.DIFF, WriteBack.COLOR_DIFF): - if not cache.is_changed(src): + if not cache.is_changed(src)[0]: changed = Changed.CACHED if changed is not Changed.CACHED and format_file_in_place( src, fast=fast, write_back=write_back, mode=mode, lines=lines diff --git a/src/black/cache.py b/src/black/cache.py index ef9d99a7b90..7ee06e9b441 100644 --- a/src/black/cache.py +++ b/src/black/cache.py @@ -8,7 +8,7 @@ from collections.abc import Iterable from dataclasses import dataclass, field from pathlib import Path -from typing import NamedTuple +from typing import NamedTuple, Optional, Union from platformdirs import user_cache_dir @@ -99,33 +99,38 @@ def get_file_data(path: Path) -> FileData: hash = Cache.hash_digest(path) return FileData(stat.st_mtime, stat.st_size, hash) - def is_changed(self, source: Path) -> bool: - """Check if source has changed compared to cached version.""" + def is_changed(self, source: Path) -> tuple[bool, Optional[os.stat_result]]: + """Check if source has changed compared to cached version. + + Also returns the stat result that was used.""" res_src = source.resolve() old = self.file_data.get(str(res_src)) if old is None: - return True + return True, None st = res_src.stat() if st.st_size != old.st_size: - return True + return True, st if st.st_mtime != old.st_mtime: new_hash = Cache.hash_digest(res_src) if new_hash != old.hash: - return True - return False + return True, st + return False, st - def filtered_cached(self, sources: Iterable[Path]) -> tuple[set[Path], set[Path]]: + def filtered_cached( + self, sources: Iterable[Path] + ) -> tuple[set[tuple[Optional[os.stat_result], Path]], set[Path]]: """Split an iterable of paths in `sources` into two sets. - The first contains paths of files that modified on disk or are not in the - cache. The other contains paths to non-modified files. + The first contains paths and stat results of files that modified on disk + or are not in the cache. The other contains paths to non-modified files. """ - changed: set[Path] = set() + changed: set[tuple[Optional[os.stat_result], Path]] = set() done: set[Path] = set() for src in sources: - if self.is_changed(src): - changed.add(src) + is_changed, stat = self.is_changed(src) + if is_changed: + changed.add((stat, src)) else: done.add(src) return changed, done diff --git a/src/black/concurrency.py b/src/black/concurrency.py index fd8db5f7097..b786adc9342 100644 --- a/src/black/concurrency.py +++ b/src/black/concurrency.py @@ -16,7 +16,7 @@ from concurrent.futures import Executor, ProcessPoolExecutor, ThreadPoolExecutor from multiprocessing import Manager from pathlib import Path -from typing import Any, Optional +from typing import Any, Optional, Union from mypy_extensions import mypyc_attr @@ -143,11 +143,15 @@ async def schedule_formatting( :func:`format_file_in_place`. """ cache = Cache.read(mode) - if write_back not in (WriteBack.DIFF, WriteBack.COLOR_DIFF): - sources, cached = cache.filtered_cached(sources) + + if write_back in (WriteBack.DIFF, WriteBack.COLOR_DIFF): + sources_with_stats: Union[set[Path], set[tuple[Optional[os.stat_result], Path]]] = sources + else: + sources_with_stats, cached = cache.filtered_cached(sources) for src in sorted(cached): report.done(src, Changed.CACHED) - if not sources: + + if not sources_with_stats: return cancelled = [] @@ -164,7 +168,7 @@ async def schedule_formatting( executor, format_file_in_place, src, fast, mode, write_back, lock ) ): src - for src in sorted(sources, key=lambda f: (-f.stat().st_size, f)) + for _, src in sorted(sources_with_stats, key=_sources_sort_key) } pending = tasks.keys() try: @@ -196,3 +200,18 @@ async def schedule_formatting( await asyncio.gather(*cancelled, return_exceptions=True) if sources_to_cache: cache.write(sources_to_cache) + + +def _sources_sort_key( + source: Union[Path, tuple[Optional[os.stat_result], Path]], +) -> tuple[int, Path]: + if isinstance(source, tuple): + stat, src = source + else: + src = source + stat = None + + if not stat: + stat = src.stat() + + return -stat.st_size, src diff --git a/tests/test_black.py b/tests/test_black.py index 36ee7d9e1b9..22b40103806 100644 --- a/tests/test_black.py +++ b/tests/test_black.py @@ -1113,9 +1113,9 @@ def test_single_file_force_pyi(self) -> None: actual = path.read_text(encoding="utf-8") # verify cache with --pyi is separate pyi_cache = black.Cache.read(pyi_mode) - assert not pyi_cache.is_changed(path) + assert not pyi_cache.is_changed(path)[0] normal_cache = black.Cache.read(DEFAULT_MODE) - assert normal_cache.is_changed(path) + assert normal_cache.is_changed(path)[0] self.assertFormatEqual(expected, actual) black.assert_equivalent(contents, actual) black.assert_stable(contents, actual, pyi_mode) @@ -1140,8 +1140,8 @@ def test_multi_file_force_pyi(self) -> None: pyi_cache = black.Cache.read(pyi_mode) normal_cache = black.Cache.read(reg_mode) for path in paths: - assert not pyi_cache.is_changed(path) - assert normal_cache.is_changed(path) + assert not pyi_cache.is_changed(path)[0] + assert normal_cache.is_changed(path)[0] def test_pipe_force_pyi(self) -> None: source, expected = read_data("miscellaneous", "force_pyi") @@ -1163,9 +1163,9 @@ def test_single_file_force_py36(self) -> None: actual = path.read_text(encoding="utf-8") # verify cache with --target-version is separate py36_cache = black.Cache.read(py36_mode) - assert not py36_cache.is_changed(path) + assert not py36_cache.is_changed(path)[0] normal_cache = black.Cache.read(reg_mode) - assert normal_cache.is_changed(path) + assert normal_cache.is_changed(path)[0] self.assertEqual(actual, expected) @event_loop() @@ -1188,8 +1188,8 @@ def test_multi_file_force_py36(self) -> None: pyi_cache = black.Cache.read(py36_mode) normal_cache = black.Cache.read(reg_mode) for path in paths: - assert not pyi_cache.is_changed(path) - assert normal_cache.is_changed(path) + assert not pyi_cache.is_changed(path)[0] + assert normal_cache.is_changed(path)[0] def test_pipe_force_py36(self) -> None: source, expected = read_data("miscellaneous", "force_py36") @@ -2153,7 +2153,7 @@ def test_cache_broken_file(self) -> None: src.write_text("print('hello')", encoding="utf-8") invokeBlack([str(src)]) cache = black.Cache.read(mode) - assert not cache.is_changed(src) + assert not cache.is_changed(src)[0] def test_cache_single_file_already_cached(self) -> None: mode = DEFAULT_MODE @@ -2182,8 +2182,8 @@ def test_cache_multiple_files(self) -> None: assert one.read_text(encoding="utf-8") == "print('hello')" assert two.read_text(encoding="utf-8") == 'print("hello")\n' cache = black.Cache.read(mode) - assert not cache.is_changed(one) - assert not cache.is_changed(two) + assert not cache.is_changed(one)[0] + assert not cache.is_changed(two)[0] @pytest.mark.incompatible_with_mypyc @pytest.mark.parametrize("color", [False, True], ids=["no-color", "with-color"]) @@ -2246,7 +2246,7 @@ def test_write_cache_read_cache(self) -> None: write_cache = black.Cache.read(mode) write_cache.write([src]) read_cache = black.Cache.read(mode) - assert not read_cache.is_changed(src) + assert not read_cache.is_changed(src)[0] @pytest.mark.incompatible_with_mypyc def test_filter_cached(self) -> None: @@ -2334,8 +2334,8 @@ def test_failed_formatting_does_not_get_cached(self) -> None: clean.write_text('print("hello")\n', encoding="utf-8") invokeBlack([str(workspace)], exit_code=123) cache = black.Cache.read(mode) - assert cache.is_changed(failing) - assert not cache.is_changed(clean) + assert cache.is_changed(failing)[0] + assert not cache.is_changed(clean)[0] def test_write_cache_write_fail(self) -> None: mode = DEFAULT_MODE @@ -2354,9 +2354,9 @@ def test_read_cache_line_lengths(self) -> None: cache = black.Cache.read(mode) cache.write([path]) one = black.Cache.read(mode) - assert not one.is_changed(path) + assert not one.is_changed(path)[0] two = black.Cache.read(short_mode) - assert two.is_changed(path) + assert two.is_changed(path)[0] def test_cache_key(self) -> None: # Test that all members of the mode enum affect the cache key. From 2c67c009b4dda0f351cb15892a07dcca4c1cd57f Mon Sep 17 00:00:00 2001 From: cobalt <61329810+cobaltt7@users.noreply.github.com> Date: Sun, 26 Oct 2025 17:43:07 -0500 Subject: [PATCH 4/4] oops, ig we need another loop anyway Signed-off-by: cobalt <61329810+cobaltt7@users.noreply.github.com> --- src/black/concurrency.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/black/concurrency.py b/src/black/concurrency.py index 6cf3eef520b..70e250c0d5b 100644 --- a/src/black/concurrency.py +++ b/src/black/concurrency.py @@ -16,7 +16,7 @@ from concurrent.futures import Executor, ProcessPoolExecutor, ThreadPoolExecutor from multiprocessing import Manager from pathlib import Path -from typing import Any, Optional, Union +from typing import Any, Optional from mypy_extensions import mypyc_attr @@ -150,7 +150,9 @@ async def schedule_formatting( WriteBack.DIFF, WriteBack.COLOR_DIFF, ): - sources_with_stats: Union[set[Path], set[tuple[Optional[os.stat_result], Path]]] = sources + sources_with_stats: Iterable[tuple[Optional[os.stat_result], Path]] = ( + (None, src) for src in sources + ) else: sources_with_stats, cached = cache.filtered_cached(sources) for src in sorted(cached): @@ -208,13 +210,9 @@ async def schedule_formatting( def _sources_sort_key( - source: Union[Path, tuple[Optional[os.stat_result], Path]], + source: tuple[Optional[os.stat_result], Path], ) -> tuple[int, Path]: - if isinstance(source, tuple): - stat, src = source - else: - src = source - stat = None + stat, src = source if not stat: stat = src.stat()