Skip to content

Conversation

@gouzil
Copy link
Member

@gouzil gouzil commented Aug 17, 2025

PR Category

User Experience

PR Types

Not User Facing

Description

  • 从本 PR 开始, Python format 将变成“混动模式”(black 与 ruff format 并存)
  • 由于 exclude 中不能写注释, 所以无法用注释占位来防止冲突
  • 计划以 3 级目录作为 exclude 的最大深度逐步推进, 文件较多的文件夹则按首个差异字母拆分推进

可以使用下面的代码查看哪些文件夹可以由 ruff format 完全接管:

from __future__ import annotations

import argparse
import os
import subprocess
import sys
from collections import defaultdict
from pathlib import Path


def get_files_needing_format(paths=None) -> list[Path]:
    """Return the files that ``ruff format --check`` reports as unformatted.

    Args:
        paths: Optional list of path strings appended to the ruff command
            line. When empty or ``None`` no path argument is passed.

    Returns:
        One ``Path`` per ``Would reformat: `` line found in ruff's stdout,
        in the order ruff printed them.

    Raises:
        SystemExit: With code 1 when the ``ruff`` executable cannot be found.
    """
    cmd = ["ruff", "format", "--check"]
    if paths:
        cmd.extend(paths)
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            check=False,
        )
    except FileNotFoundError:
        print("找不到 `ruff` 命令,请确保已正确安装 Ruff。", file=sys.stderr)
        sys.exit(1)

    # Ruff exit codes: 0 = nothing to format, 1 = some files would be
    # reformatted. Anything else means ruff terminated abnormally, so the
    # list parsed below may be incomplete. Surface that instead of silently
    # treating it as "all clean", but keep going so partial output is usable.
    if result.returncode not in (0, 1):
        print(
            f"警告: ruff 以状态码 {result.returncode} 退出,结果可能不完整。",
            file=sys.stderr,
        )
        if result.stderr:
            print(result.stderr.rstrip(), file=sys.stderr)

    prefix = "Would reformat: "
    return [
        Path(line[len(prefix) :].strip())
        for line in result.stdout.splitlines()
        if line.startswith(prefix)
    ]


# Group files by directory depth
def split_files_by_directory(
    files: list[Path], level: int
) -> dict[Path, list[Path]]:
    """Bucket files by the first ``level`` components of their parent directory.

    Each file is resolved and, when it lives under the current working
    directory, made relative to it; files outside keep their absolute path.

    Args:
        files: Paths to group.
        level: Number of leading directory components that form the bucket
            key. Zero or a negative value puts every file in one ``Path(".")``
            bucket.

    Returns:
        A mapping from bucket key to the files in it, each list sorted by the
        string form of the path. An empty ``files`` yields an empty dict.

    Raises:
        ValueError: If ``files`` is non-empty and ``level`` is ``None``.
    """
    if not files:
        return {}
    if level is None:
        raise ValueError("level 不能为空")
    if level <= 0:
        # No layering requested: everything belongs to the root bucket.
        return {Path("."): [Path(item) for item in files]}

    cwd = Path.cwd().resolve()
    grouped: dict[Path, list[Path]] = defaultdict(list)

    for item in files:
        absolute = Path(item).resolve(strict=False)
        try:
            # Paths relative to the working directory give stable keys.
            relative = absolute.relative_to(cwd)
        except ValueError:
            # Outside the working directory: keep the resolved path as is.
            relative = absolute

        parent = relative.parent
        anchor = parent.anchor
        if parent.is_absolute() and anchor:
            # The anchor ('/' etc.) does not count as a level but is kept
            # at the front of the key.
            head = parent.parts[1:][:level]
            group = Path(anchor, *head) if head else Path(anchor)
        else:
            head = parent.parts[:level]
            group = Path(*head) if head else Path(".")

        grouped[group].append(relative)

    # Deterministic output: order every bucket by the path's string form.
    for members in grouped.values():
        members.sort(key=str)

    return grouped


# Directories skipped while scanning for Python files. diff_dir falls back to
# this list when it is given no exclusions, and main always passes it along
# (extended with any -x/--exclude values). _iter_python_files resolves each
# relative entry against the scan root before comparing.
DEFAULT_EXCLUDE_DIRS: list[str] = [
    ".git",
    "__pycache__",
    ".tox",
    ".mypy_cache",
    ".ruff_cache",
    ".pytest_cache",
    "venv",
    ".venv",
    "venvs",
    "build",
    "dist",
    ".vscode",
    ".idea",
    ".cache",
    "third_party",
]


from typing import TYPE_CHECKING

if TYPE_CHECKING:  # 仅用于类型检查,避免运行时不必要导入
    from collections.abc import Sequence


def _iter_python_files(
    root: Path, exclude_dirs: Sequence[str | Path] | None = None
):
    """Yield every ``.py`` / ``.pyi`` file below ``root`` as a resolved path.

    The tree is walked with ``os.walk`` and excluded directories are pruned
    in place, so the walk never descends into them.

    Args:
        root: Directory to scan.
        exclude_dirs: Directories to skip. Relative entries are interpreted
            relative to the resolved ``root``; absolute entries are used as
            given (after resolving).

    Yields:
        Resolved ``Path`` objects for each matching file.
    """
    walk_root = root.resolve()

    # Turn every exclusion into an absolute, resolved path for comparison.
    excluded: list[Path] = [
        (entry if entry.is_absolute() else walk_root / entry).resolve()
        for entry in map(Path, exclude_dirs or [])
    ]

    def _is_excluded(directory: Path) -> bool:
        # True when ``directory`` is an excluded path or lies inside one.
        for banned in excluded:
            if directory == banned or directory.is_relative_to(banned):
                return True
        return False

    for current, subdirs, names in os.walk(walk_root):
        here = Path(current)

        # Slice-assign so os.walk sees the pruned list and skips those dirs.
        subdirs[:] = [
            sub for sub in subdirs if not _is_excluded((here / sub).resolve())
        ]

        for name in names:
            if name.endswith((".py", ".pyi")):
                yield (here / name).resolve()


def diff_dir(
    need_fix_files: list[Path],
    paths: None | list[str] = None,
    exclude_dirs: Sequence[str | Path] | None = None,
) -> dict[Path, list[Path]]:
    """Map each directory to the Python files in it that need no reformatting.

    Every ``.py`` / ``.pyi`` file below the scan roots is collected via
    ``_iter_python_files``; files that are absent from ``need_fix_files`` are
    grouped under their parent directory.

    NOTE(review): a directory appears in the result as soon as it holds one
    clean file, even if it also holds files listed in ``need_fix_files``.

    Args:
        need_fix_files: Files that still need formatting. Relative entries
            are resolved against the current working directory.
        paths: Scan roots. All entries are scanned; when empty or ``None``
            the current working directory is used.
        exclude_dirs: Directories to skip while scanning. Falls back to
            ``DEFAULT_EXCLUDE_DIRS`` when empty or ``None``.

    Returns:
        A mapping from directory to the clean files directly inside it.
    """
    roots = [Path(p) for p in paths] if paths else [Path.cwd()]
    excludes = exclude_dirs or DEFAULT_EXCLUDE_DIRS

    # Resolve to absolute paths so membership tests match the resolved paths
    # produced by the scan.
    need_fix_set = {
        Path(p).resolve(strict=False) for p in (need_fix_files or [])
    }

    fixed_dirs: dict[Path, list[Path]] = defaultdict(list)
    seen: set[Path] = set()

    for root in roots:
        for f in _iter_python_files(root, excludes):
            # Overlapping roots would otherwise report the same file twice.
            if f in seen:
                continue
            seen.add(f)
            if f not in need_fix_set:
                # ``f`` is already resolved, so its parent is as well.
                fixed_dirs[f.parent].append(f)

    return fixed_dirs


def collapse_dirs(dirs: list[Path]) -> list[Path]:
    """Reduce a set of directories to its deepest members.

    When a directory and one of its descendants are both present, only the
    descendant is kept. For example ``/a``, ``/a/b`` and ``/a/b/c`` collapse
    to ``/a/b/c`` alone.

    Args:
        dirs: Directories to collapse; each is resolved to an absolute path
            and duplicates are dropped.

    Returns:
        The surviving directories, deepest first and ordered by string form
        within the same depth.
    """
    candidates = sorted(
        {Path(entry).resolve() for entry in dirs},
        key=lambda path: (-len(path.parts), str(path)),
    )

    kept: list[Path] = []
    for candidate in candidates:
        # Deeper paths were handled first, so any kept path that equals or
        # sits below ``candidate`` makes ``candidate`` redundant.
        for leaf in kept:
            if leaf == candidate or leaf.is_relative_to(candidate):
                break
        else:
            kept.append(candidate)
    return kept


def main():
    """Command-line entry point: list directories that hold clean files.

    Parses the optional scan paths and ``-x/--exclude`` options, asks ruff
    which files still need formatting, and prints the collapsed list of
    directories that contain files needing no reformatting.
    """
    parser = argparse.ArgumentParser(description="Ruff 格式化辅助工具")
    parser.add_argument(
        "paths",
        nargs="*",
        help="要扫描的起始路径(默认当前工作目录)",
    )
    parser.add_argument(
        "-x",
        "--exclude",
        action="append",
        default=[],
        help=(
            "需要排除的目录,可重复指定。例如 -x build -x venv。"
            "若未指定,将使用内置的常见忽略目录。"
        ),
    )
    args = parser.parse_args()

    # An empty positional list means "no explicit path".
    paths = args.paths or None
    files = get_files_needing_format(paths)

    # For a per-directory view of the files ruff wants to change, pass
    # ``files`` to split_files_by_directory(files, level) and print the
    # resulting buckets.

    # User-supplied exclusions extend the built-in ones.
    exclude_dirs = list(DEFAULT_EXCLUDE_DIRS)
    exclude_dirs.extend(args.exclude)

    fixed_dirs = diff_dir(files, paths, exclude_dirs)
    collapsed = collapse_dirs(list(fixed_dirs.keys()))

    print(f"候选可忽略目录数(折叠后): {len(collapsed)}")
    for dir_path in collapsed:
        print(f"  - {dir_path}")

# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

@gouzil gouzil requested a review from SigureMo as a code owner August 17, 2025 15:20
@paddle-bot
Copy link

paddle-bot bot commented Aug 17, 2025

你的PR提交成功,感谢你对开源项目的贡献!
请关注后续CI自动化测试结果,详情请参考Paddle-CI手册
Your PR has been submitted. Thanks for your contribution!
Please wait for the result of CI firstly. See Paddle CI Manual for details.

@paddle-bot paddle-bot bot added the contributor External developers label Aug 17, 2025
@SigureMo SigureMo added the HappyOpenSource 快乐开源活动issue与PR label Aug 17, 2025
- id: black
exclude: |
(?x)^(
third_party/.+|
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这个已经在顶层 exclude 掉了,这里就不重复写了

- id: ruff-check
args: [--fix, --exit-non-zero-on-fix, --no-cache]
- id: ruff-format
files: \.(py|pyi)$
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里写 files 是有必要的么?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

测了下好像没必要,先删了

@SigureMo SigureMo changed the title [CodeStyle] black -> ruff format migration, pre-commit monitoring - part 22 [CodeStyle] Move black to ruff format, initial pre-commit config setup for mix check mode - part 22 Aug 17, 2025
SigureMo
SigureMo previously approved these changes Aug 17, 2025
# | test/[m-z].+
# | tools/.+
)$
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

由于 exclude 不能写注释所以不能写注释防止冲突

我试了下应该可以,之后每个 PR 注释下面,解开上面就好了

特意空一行以免冲突

@SigureMo SigureMo merged commit d7133ee into PaddlePaddle:develop Aug 17, 2025
73 of 74 checks passed
@SigureMo SigureMo deleted the ruff/fmt/part-22 branch August 17, 2025 21:09
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

contributor External developers HappyOpenSource 快乐开源活动issue与PR skip-ci: api-benchmark

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants