diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 348a1b7451b..d76ae7209ce 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -28,8 +28,8 @@ jobs:
         pip install .[quality]
     - name: Check quality
       run: |
-        black --check tests src benchmarks metrics
-        ruff tests src benchmarks metrics
+        ruff check tests src benchmarks metrics utils setup.py  # linter
+        ruff format --check tests src benchmarks metrics utils setup.py  # formatter
 
   test:
     needs: check_code_quality
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8f01de92b81..d53187601dc 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,15 +1,9 @@
 repos:
-  - repo: https://github.com/psf/black
-    rev: 23.1.0
+  - repo: https://github.com/charliermarsh/ruff-pre-commit  # https://github.com/charliermarsh/ruff#usage
+    rev: 'v0.1.5'
     hooks:
-      - id: black
-        language_version: python3
-        types: [python]
-        stages: [commit]
-        args: ["--config", "pyproject.toml", "tests", "src", "benchmarks", "metrics"]
-  - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: 'v0.0.255'
-    hooks:
-      - id: ruff
-        stages: [commit]
-        args: [ "--config", "pyproject.toml", "tests", "src", "benchmarks", "metrics", "--fix"]
+      # Run the linter.
+      - id: ruff
+        args: [ --fix ]
+      # Run the formatter.
+      - id: ruff-format
diff --git a/Makefile b/Makefile
index a1ee6f14bb7..916524e7754 100644
--- a/Makefile
+++ b/Makefile
@@ -5,14 +5,14 @@ check_dirs := tests src benchmarks metrics utils
 
 # Check that source code meets quality standards
 
 quality:
-	black --check $(check_dirs) setup.py
-	ruff $(check_dirs) setup.py
+	ruff check $(check_dirs) setup.py  # linter
+	ruff format --check $(check_dirs) setup.py  # formatter
 
 # Format source code automatically
 
 style:
-	black tests src benchmarks metrics setup.py
-	ruff $(check_dirs) setup.py --fix
+	ruff check --fix $(check_dirs) setup.py  # linter
+	ruff format $(check_dirs) setup.py  # formatter
 
 # Run tests for the library
diff --git a/setup.py b/setup.py
index 1d7d920c4a9..cf733c60ea5 100644
--- a/setup.py
+++ b/setup.py
@@ -216,7 +216,7 @@
 TESTS_REQUIRE.extend(VISION_REQUIRE)
 TESTS_REQUIRE.extend(AUDIO_REQUIRE)
 
-QUALITY_REQUIRE = ["black~=23.1", "ruff>=0.0.241", "pyyaml>=5.3.1"]
+QUALITY_REQUIRE = ["ruff>=0.1.5"]
 
 DOCS_REQUIRE = [
     # Might need to add doc-builder and some specific deps in the future
diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index cecf8d454a3..ffd57c3a8ca 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -3101,7 +3101,8 @@ def load_processed_shard_from_cache(shard_kwargs):
         else:
 
             def format_cache_file_name(
-                cache_file_name: Optional[str], rank: Union[int, Literal["*"]]  # noqa: F722
+                cache_file_name: Optional[str],
+                rank: Union[int, Literal["*"]],  # noqa: F722
             ) -> Optional[str]:
                 if not cache_file_name:
                     return cache_file_name
diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py
index 30ad9ebc3ee..d9c6057929c 100644
--- a/src/datasets/iterable_dataset.py
+++ b/src/datasets/iterable_dataset.py
@@ -127,8 +127,9 @@ def _convert_to_arrow(
             Drop the last batch if it is smaller than `batch_size`.
     """
     if batch_size is None or batch_size <= 0:
-        yield "all", pa.Table.from_pylist(
-            cast_to_python_objects([example for _, example in iterable], only_1d_for_numpy=True)
+        yield (
+            "all",
+            pa.Table.from_pylist(cast_to_python_objects([example for _, example in iterable], only_1d_for_numpy=True)),
         )
         return
     iterator = iter(iterable)
@@ -1112,8 +1113,9 @@ def __iter__(self):
         # Then for each example, `TypedExamplesIterable` automatically fills missing columns with None.
         # This is done with `_apply_feature_types_on_example`.
         for key, example in self.ex_iterable:
-            yield key, _apply_feature_types_on_example(
-                example, self.features, token_per_repo_id=self.token_per_repo_id
+            yield (
+                key,
+                _apply_feature_types_on_example(example, self.features, token_per_repo_id=self.token_per_repo_id),
             )
 
     def _iter_arrow(self) -> Iterator[Tuple[Key, pa.Table]]:
diff --git a/src/datasets/load.py b/src/datasets/load.py
index 23a34a9946f..b84f794cb24 100644
--- a/src/datasets/load.py
+++ b/src/datasets/load.py
@@ -1493,9 +1493,7 @@ def dataset_module_factory(
                     download_config=download_config,
                     download_mode=download_mode,
                 ).get_module()
-            except (
-                Exception
-            ) as e1:  # noqa all the attempts failed, before raising the error we should check if the module is already cached.
+            except Exception as e1:  # noqa all the attempts failed, before raising the error we should check if the module is already cached.
                 try:
                     return CachedDatasetModuleFactory(path, dynamic_modules_path=dynamic_modules_path).get_module()
                 except Exception:  # noqa if it's not in the cache, then it doesn't exist.
@@ -1598,9 +1596,7 @@ def metric_module_factory(
                     download_mode=download_mode,
                     dynamic_modules_path=dynamic_modules_path,
                 ).get_module()
-            except (
-                Exception
-            ) as e1:  # noqa all the attempts failed, before raising the error we should check if the module is already cached.
+            except Exception as e1:  # noqa all the attempts failed, before raising the error we should check if the module is already cached.
                 try:
                     return CachedMetricModuleFactory(path, dynamic_modules_path=dynamic_modules_path).get_module()
                 except Exception:  # noqa if it's not in the cache, then it doesn't exist.
diff --git a/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py b/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py
index e8fd817196c..146ef4e613b 100644
--- a/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py
+++ b/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py
@@ -323,12 +323,15 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, add_labels):
                         sample_label = {"label": os.path.basename(os.path.dirname(original_file))}
                     else:
                         sample_label = {}
-                    yield file_idx, {
-                        **sample_empty_metadata,
-                        self.BASE_COLUMN_NAME: downloaded_file_or_dir,
-                        **sample_metadata,
-                        **sample_label,
-                    }
+                    yield (
+                        file_idx,
+                        {
+                            **sample_empty_metadata,
+                            self.BASE_COLUMN_NAME: downloaded_file_or_dir,
+                            **sample_metadata,
+                            **sample_label,
+                        },
+                    )
                     file_idx += 1
             else:
                 for downloaded_dir_file in downloaded_file_or_dir:
@@ -391,10 +394,13 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, add_labels):
                             sample_label = {"label": os.path.basename(os.path.dirname(downloaded_dir_file))}
                         else:
                             sample_label = {}
-                        yield file_idx, {
-                            **sample_empty_metadata,
-                            self.BASE_COLUMN_NAME: downloaded_dir_file,
-                            **sample_metadata,
-                            **sample_label,
-                        }
+                        yield (
+                            file_idx,
+                            {
+                                **sample_empty_metadata,
+                                self.BASE_COLUMN_NAME: downloaded_dir_file,
+                                **sample_metadata,
+                                **sample_label,
+                            },
+                        )
                         file_idx += 1
diff --git a/src/datasets/splits.py b/src/datasets/splits.py
index 9e1c003202b..817995ad217 100644
--- a/src/datasets/splits.py
+++ b/src/datasets/splits.py
@@ -111,6 +111,7 @@ class SplitBase(metaclass=abc.ABCMeta):
     to define which files to read and how to skip examples within file.
     """
+
     # pylint: enable=line-too-long
 
     @abc.abstractmethod
@@ -265,6 +266,7 @@ class PercentSlice(metaclass=PercentSliceMeta):
     [guide on splits](../loading#slice-splits) for more information.
     """
+
     # pylint: enable=line-too-long
 
     pass
@@ -438,6 +440,7 @@ class Split:
         ...     )
         ```
     """
+
     # pylint: enable=line-too-long
 
     TRAIN = NamedSplit("train")
     TEST = NamedSplit("test")
diff --git a/src/datasets/utils/patching.py b/src/datasets/utils/patching.py
index 5fd8ddcb325..f245cabd970 100644
--- a/src/datasets/utils/patching.py
+++ b/src/datasets/utils/patching.py
@@ -63,7 +63,7 @@ def __enter__(self):
             # We don't check for the name of the global, but rather if its value *is* "os" or "os.path".
             # This allows to patch renamed modules like "from os import path as ospath".
            if obj_attr is submodule or (
-                (isinstance(obj_attr, _PatchedModuleObj) and obj_attr._original_module is submodule)
+                isinstance(obj_attr, _PatchedModuleObj) and obj_attr._original_module is submodule
             ):
                 self.original[attr] = obj_attr
                 # patch at top level
diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
index 75a0d3559b9..4d4ecc9802b 100644
--- a/tests/test_arrow_dataset.py
+++ b/tests/test_arrow_dataset.py
@@ -3066,9 +3066,7 @@ def test_concatenate_mixed_memory_and_disk(self):
                 cache_file_name=os.path.join(tmp_dir, "d1.arrow")
             ) as dset1, Dataset.from_dict(data2, info=info2).map(
                 cache_file_name=os.path.join(tmp_dir, "d2.arrow")
-            ) as dset2, Dataset.from_dict(
-                data3
-            ) as dset3:
+            ) as dset2, Dataset.from_dict(data3) as dset3:
                 with concatenate_datasets([dset1, dset2, dset3]) as concatenated_dset:
                     self.assertEqual(len(concatenated_dset), len(dset1) + len(dset2) + len(dset3))
                     self.assertListEqual(concatenated_dset["id"], dset1["id"] + dset2["id"] + dset3["id"])
diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py
index 3f8ade69391..10c50bf7b23 100644
--- a/tests/test_readme_util.py
+++ b/tests/test_readme_util.py
@@ -11,6 +11,7 @@
 # @pytest.fixture
 # def example_yaml_structure():
 
+
 example_yaml_structure = yaml.safe_load(
     """\
 name: ""
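A note on the recurring `yield` rewrites in this diff: where black split an implicit yielded tuple inside the trailing call's parentheses, `ruff format` parenthesizes the whole tuple explicitly. Behavior is unchanged in every such hunk, since `yield a, b` and `yield (a, b)` produce the same tuple. A minimal runnable sketch of the pattern; `gen` and `process` are illustrative names, not from this codebase:

```python
def process(value: str) -> str:
    # Stand-in for any call long enough to force a line break.
    return value.upper()


def gen(items):
    for key, value in items:
        # Old (black) layout split inside the trailing call:
        #     yield key, process(
        #         value
        #     )
        # New (ruff format) layout parenthesizes the whole tuple:
        yield (
            key,
            process(value),
        )


print(list(gen([("a", "x"), ("b", "y")])))  # [('a', 'X'), ('b', 'Y')]
```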