From f0d8f8221689913e51e182664d89d21f0053b13f Mon Sep 17 00:00:00 2001 From: dylan Date: Tue, 23 Sep 2025 10:02:21 -0700 Subject: [PATCH 1/3] feat: Streaming files for lint --- marimo/_lint/__init__.py | 8 +- marimo/_lint/linter.py | 200 +++++++++++++++++++++++---------------- marimo/_utils/files.py | 67 ++++++++++++- 3 files changed, 186 insertions(+), 89 deletions(-) diff --git a/marimo/_lint/__init__.py b/marimo/_lint/__init__.py index 17560820751..1df953d7114 100644 --- a/marimo/_lint/__init__.py +++ b/marimo/_lint/__init__.py @@ -7,7 +7,7 @@ from marimo._lint.linter import FileStatus, Linter from marimo._lint.rule_engine import EarlyStoppingConfig, RuleEngine from marimo._lint.rules.base import LintRule -from marimo._utils.files import expand_file_patterns +from marimo._utils.files import async_expand_file_patterns if TYPE_CHECKING: from collections.abc import Callable @@ -43,8 +43,8 @@ def run_check( Returns: Linter with per-file status and diagnostics """ - # Expand patterns to actual files - files_to_check = expand_file_patterns(file_patterns) + # Get async generator for files + files_to_check = async_expand_file_patterns(file_patterns) linter = Linter( pipe=pipe, @@ -100,7 +100,7 @@ def message_pipe(msg: str) -> None: rules=filtered_rules, ) - files_to_check = expand_file_patterns(file_patterns) + files_to_check = async_expand_file_patterns(file_patterns) linter.run_streaming(files_to_check) return linter, "\n".join(messages) diff --git a/marimo/_lint/linter.py b/marimo/_lint/linter.py index 3ce31e4342e..db33b8bf5af 100644 --- a/marimo/_lint/linter.py +++ b/marimo/_lint/linter.py @@ -3,9 +3,10 @@ import asyncio import re +from collections.abc import AsyncIterator, Iterator from dataclasses import dataclass, field from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Union from marimo._ast.load import get_notebook_status from marimo._ast.parse import MarimoFileError @@ -17,7 +18,7 @@ from marimo._schemas.serialization import NotebookSerialization if TYPE_CHECKING: - from collections.abc import AsyncIterator, Callable + from collections.abc import AsyncIterator, Callable, Iterator from marimo._lint.rules.base import LintRule @@ -40,6 +41,20 @@ def _contents_differ_excluding_generated_with( return orig_cleaned != gen_cleaned +async def _to_async_iterator( + files_to_check: Union[AsyncIterator[Path], Iterator[Path]], +) -> AsyncIterator[Path]: + """Convert a regular iterator to an async iterator if needed.""" + if hasattr(files_to_check, "__aiter__"): + # Already an async iterator + async for file_path in files_to_check: + yield file_path + else: + # Convert regular iterator to async + for file_path in files_to_check: + yield file_path + + @dataclass class FileStatus: """Processing status and results for a single file.""" @@ -91,105 +106,114 @@ def __init__( self.fixed_count: int = 0 self.issues_count: int = 0 - async def _run_stream( - self, files_to_check: list[Path] - ) -> AsyncIterator[FileStatus]: - """Asynchronously check files and yield results as they complete.""" + async def _process_single_file(self, file: Path) -> FileStatus: + """Process a single file and return its status.""" + file_path = str(file) + file_status = FileStatus(file=file_path) + # Check if file exists first + if not file.exists(): + self.errored = True + file_status.failed = True + file_status.message = f"File not found: {file_path}" + file_status.details = [ + f"FileNotFoundError: No such file or directory: '{file_path}'" + ] + return file_status - # Create tasks for 
all files - async def process_file(file: Path) -> FileStatus: - file_path = str(file) - file_status = FileStatus(file=file_path) + # Check if file is a supported notebook format + if not file_path.endswith((".py", ".md", ".qmd")): + file_status.skipped = True + file_status.message = ( + f"Skipped: {file_path} (not a notebook file)" + ) + return file_status - # Check if file is a supported notebook format - if not file_path.endswith((".py", ".md", ".qmd")): + try: + with capture_output() as (stdout, stderr, logs): + load_result = get_notebook_status(file_path) + except SyntaxError as e: + # Handle syntax errors in notebooks + self.errored = True + file_status.failed = True + file_status.message = f"Failed to parse: {file_path}" + file_status.details = [f"SyntaxError: {str(e)}"] + return file_status + except MarimoFileError as e: + # Handle syntax errors in notebooks + if self.ignore_scripts: + # Skip this file silently when ignore_scripts is enabled file_status.skipped = True file_status.message = ( - f"Skipped: {file_path} (not a notebook file)" + f"Skipped: {file_path} (not a marimo notebook)" ) return file_status - - try: - with capture_output() as (stdout, stderr, logs): - load_result = get_notebook_status(file_path) - except SyntaxError as e: - # Handle syntax errors in notebooks + else: self.errored = True file_status.failed = True - file_status.message = f"Failed to parse: {file_path}" - file_status.details = [f"SyntaxError: {str(e)}"] + file_status.message = ( + f"Not recognizable as a marimo notebook: {file_path}" + ) + file_status.details = [f"MarimoFileError: {str(e)}"] return file_status - except MarimoFileError as e: - # Handle syntax errors in notebooks - if self.ignore_scripts: - # Skip this file silently when ignore_scripts is enabled - file_status.skipped = True - file_status.message = ( - f"Skipped: {file_path} (not a marimo notebook)" - ) - return file_status - else: - self.errored = True - file_status.failed = True - file_status.message = ( - f"Not recognizable as a marimo notebook: {file_path}" - ) - file_status.details = [f"MarimoFileError: {str(e)}"] - return file_status - file_status.notebook = load_result.notebook - file_status.contents = load_result.contents + file_status.notebook = load_result.notebook + file_status.contents = load_result.contents - if load_result.status == "empty": + if load_result.status == "empty": + file_status.skipped = True + file_status.message = f"Skipped: {file_path} (empty file)" + elif load_result.status == "invalid": + if self.ignore_scripts: + # Skip this file silently when ignore_scripts is enabled file_status.skipped = True - file_status.message = f"Skipped: {file_path} (empty file)" - elif load_result.status == "invalid": - if self.ignore_scripts: - # Skip this file silently when ignore_scripts is enabled - file_status.skipped = True - file_status.message = ( - f"Skipped: {file_path} (not a marimo notebook)" - ) - return file_status - else: - file_status.failed = True - file_status.message = ( - f"Failed to parse: {file_path} (not a valid notebook)" - ) - elif load_result.notebook is not None: - try: - # Check notebook with all rules including parsing - file_status.diagnostics = ( - await self.rule_engine.check_notebook( - load_result.notebook, - # Add parsing rule if there's captured output - stdout=stdout.getvalue().strip(), - stderr=stderr.getvalue().strip(), - logs=logs, - ) - ) - except Exception as e: - # Handle other parsing errors - self.errored = True - file_status.failed = True - file_status.message = f"Failed to process 
{file_path}" - file_status.details = [str(e)] + file_status.message = ( + f"Skipped: {file_path} (not a marimo notebook)" + ) + return file_status else: - # Status is valid but no notebook - shouldn't happen but handle gracefully - file_status.skipped = True + file_status.failed = True file_status.message = ( - f"Skipped: {file_path} (no notebook content)" + f"Failed to parse: {file_path} (not a valid notebook)" ) + elif load_result.notebook is not None: + try: + # Check notebook with all rules including parsing + file_status.diagnostics = ( + await self.rule_engine.check_notebook( + load_result.notebook, + # Add parsing rule if there's captured output + stdout=stdout.getvalue().strip(), + stderr=stderr.getvalue().strip(), + logs=logs, + ) + ) + except Exception as e: + # Handle other parsing errors + self.errored = True + file_status.failed = True + file_status.message = f"Failed to process {file_path}" + file_status.details = [str(e)] + else: + # Status is valid but no notebook - shouldn't happen but handle gracefully + file_status.skipped = True + file_status.message = ( + f"Skipped: {file_path} (no notebook content)" + ) - # Ensure diagnostics list is initialized for cases where no processing happened - if not hasattr(file_status, "diagnostics"): - file_status.diagnostics = [] + # Ensure diagnostics list is initialized for cases where no processing happened + if not hasattr(file_status, "diagnostics"): + file_status.diagnostics = [] - return file_status + return file_status + + async def _run_stream( + self, files_to_check: list[Path] + ) -> AsyncIterator[FileStatus]: + """Asynchronously check files and yield results as they complete.""" # Create tasks for all files tasks = [ - asyncio.create_task(process_file(file_path)) + asyncio.create_task(self._process_single_file(file_path)) for file_path in files_to_check ] @@ -254,15 +278,22 @@ def _generate_file_contents(file_status: FileStatus) -> str: file_status.notebook, file_status.file ) - def run_streaming(self, files_to_check: list[Path]) -> None: + def run_streaming( + self, files_to_check: Union[AsyncIterator[Path], Iterator[Path]] + ) -> None: """Run linting checks with real-time streaming output.""" asyncio.run(self._run_streaming_async(files_to_check)) - async def _run_streaming_async(self, files_to_check: list[Path]) -> None: + async def _run_streaming_async( + self, files_to_check: Union[AsyncIterator[Path], Iterator[Path]] + ) -> None: """Internal async implementation of run_streaming.""" # Process files as they complete fixed_count = 0 - async for file_status in self._run_stream(files_to_check): + + # Convert to async iterator and process + async for file_path in _to_async_iterator(files_to_check): + file_status = await self._process_single_file(file_path) self.files.append(file_status) # Stream output via pipe if available @@ -278,6 +309,7 @@ async def _run_streaming_async(self, files_to_check: list[Path]) -> None: fixed_count += 1 if self.pipe: self.pipe(f"Updated: {file_status.file}") + self.fixed_count = fixed_count async def fix(self, file_status: FileStatus) -> bool: diff --git a/marimo/_utils/files.py b/marimo/_utils/files.py index 2ad5a1394fc..55e205c0ba0 100644 --- a/marimo/_utils/files.py +++ b/marimo/_utils/files.py @@ -2,7 +2,7 @@ import fnmatch import os import re -from collections.abc import Generator +from collections.abc import AsyncGenerator, Generator from pathlib import Path from typing import Union @@ -27,6 +27,17 @@ def get_files(folder: str) -> Generator[Path, None, None]: yield from get_files(item.path) 
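+# NOTE: os.scandir() below is synchronous, so this async generator does not
+# make directory I/O non-blocking by itself; it lets callers consume paths
+# incrementally with `async for` while traversal is still in progress.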
+async def async_get_files(folder: str) -> AsyncGenerator[Path, None]:
+    """Asynchronously recursively get all files from a folder."""
+    with os.scandir(folder) as scan:
+        for item in scan:
+            if item.is_file():
+                yield Path(item.path)
+            elif item.is_dir() and not item.name.startswith("."):
+                async for file_path in async_get_files(item.path):
+                    yield file_path
+
+
 def expand_file_patterns(file_patterns: tuple[str, ...]) -> list[Path]:
     """Expand file patterns to actual file paths.
 
@@ -70,3 +81,57 @@ def expand_file_patterns(file_patterns: tuple[str, ...]) -> list[Path]:
 
     # Remove duplicates and sort
     return sorted(set(files_to_check))
+
+
+async def async_expand_file_patterns(
+    file_patterns: tuple[str, ...],
+) -> AsyncGenerator[Path, None]:
+    """Asynchronously expand file patterns to file paths, yielding as discovered.
+
+    Args:
+        file_patterns: Tuple of file patterns (files, directories, or glob-like patterns)
+
+    Yields:
+        Path objects for matching files (including non-existent explicit files)
+    """
+    seen = set()
+
+    for pattern in file_patterns:
+        if os.path.isfile(pattern):
+            path = Path(pattern)
+            if path not in seen:
+                seen.add(path)
+                yield path
+        elif os.path.isdir(pattern):
+            async for file_path in async_get_files(pattern):
+                if file_path not in seen:
+                    seen.add(file_path)
+                    yield file_path
+        else:
+            # Handle glob patterns by walking from root and filtering
+            if "**" in pattern or "*" in pattern or "?" in pattern:
+                # Extract root directory to walk from
+                root = "."
+                parts = pattern.split("/")
+                for i, part in enumerate(parts):
+                    if "*" in part or "?" in part:
+                        root = "/".join(parts[:i]) if i > 0 else "."
+                        break
+                    elif os.path.isdir("/".join(parts[: i + 1])):
+                        root = "/".join(parts[: i + 1])
+
+                # Get all files from root and filter by pattern
+                if os.path.isdir(root):
+                    async for file_path in async_get_files(root):
+                        if (
+                            fnmatch.fnmatch(str(file_path), pattern)
+                            and file_path not in seen
+                        ):
+                            seen.add(file_path)
+                            yield file_path
+            else:
+                # Not a glob pattern but file doesn't exist - yield it anyway for error handling
+                path = Path(pattern)
+                if path not in seen:
+                    seen.add(path)
+                    yield path

From 5781f3ff0f9ac951314de55106c00c88ede25670 Mon Sep 17 00:00:00 2001
From: dylan
Date: Tue, 23 Sep 2025 10:20:08 -0700
Subject: [PATCH 2/3] tests: nonexistent file

---
 tests/_lint/test_run_check.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/tests/_lint/test_run_check.py b/tests/_lint/test_run_check.py
index 81d36552cd8..4c1049d5077 100644
--- a/tests/_lint/test_run_check.py
+++ b/tests/_lint/test_run_check.py
@@ -368,4 +368,20 @@ def test_error_handling_in_run_check(self):
         assert result.files[0].failed is True
         # Note: errored might not be True as we changed error handling
         assert "Failed to parse" in result.files[0].message
-        """Test error handling in run_check."""
+
+    def test_run_check_with_nonexistent_file_pattern(self):
+        """Test run_check with a specific nonexistent file."""
+        result = run_check(("nonexistent_file.py",))
+
+        assert isinstance(result, Linter)
+        assert len(result.files) == 1  # Should create a failed file status
+        assert result.files[0].failed is True
+        assert "File not found" in result.files[0].message
+
+    def test_run_check_with_nonexistent_directory_pattern(self):
+        """Test run_check with nonexistent directory patterns."""
+        result = run_check(("nonexistent_dir/**/*.py",))
+
+        assert isinstance(result, Linter)
+        assert len(result.files) == 0
+        assert result.errored is False

From ca708f66aa4e0b1fc6a7fb58e9d61436e5d317d4 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 23 Sep 2025 17:25:17 +0000
Subject: [PATCH 3/3] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 marimo/_lint/linter.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/marimo/_lint/linter.py b/marimo/_lint/linter.py
index db33b8bf5af..c6688344a5c 100644
--- a/marimo/_lint/linter.py
+++ b/marimo/_lint/linter.py
@@ -123,9 +123,7 @@ async def _process_single_file(self, file: Path) -> FileStatus:
         # Check if file is a supported notebook format
         if not file_path.endswith((".py", ".md", ".qmd")):
             file_status.skipped = True
-            file_status.message = (
-                f"Skipped: {file_path} (not a notebook file)"
-            )
+            file_status.message = f"Skipped: {file_path} (not a notebook file)"
             return file_status
 
         try:
@@ -196,9 +194,7 @@ async def _process_single_file(self, file: Path) -> FileStatus:
         else:
             # Status is valid but no notebook - shouldn't happen but handle gracefully
             file_status.skipped = True
-            file_status.message = (
-                f"Skipped: {file_path} (no notebook content)"
-            )
+            file_status.message = f"Skipped: {file_path} (no notebook content)"
 
         # Ensure diagnostics list is initialized for cases where no processing happened
         if not hasattr(file_status, "diagnostics"):
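
---

Usage sketch (illustrative; not part of the patch series): a minimal driver
for the new streaming path. It uses only names that appear in these patches
(run_check() and the FileStatus fields); the glob pattern is a placeholder.

    from marimo._lint import run_check

    # Patterns may be explicit files, directories, or glob patterns; after
    # PATCH 1/3 they are expanded lazily, so linting of early matches can
    # begin before directory traversal finishes.
    result = run_check(("notebooks/**/*.py",))

    for status in result.files:
        if status.skipped:
            continue
        if status.failed:
            # e.g. "File not found: ..." or "Failed to parse: ..."
            print(status.message)
        for diagnostic in status.diagnostics:
            print(diagnostic)

    if result.errored:
        raise SystemExit(1)

Note that run_check() remains a blocking entry point: it hands the async
generator from async_expand_file_patterns() to the Linter, which drives it
inside asyncio.run() via run_streaming().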