4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
@@ -28,8 +28,8 @@ jobs:
           pip install .[quality]
       - name: Check quality
         run: |
-          black --check tests src benchmarks metrics
-          ruff tests src benchmarks metrics
+          ruff check tests src benchmarks metrics utils setup.py  # linter
+          ruff format --check tests src benchmarks metrics utils setup.py  # formatter

   test:
     needs: check_code_quality
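
The same quality gate can be reproduced locally with the commands from the workflow step above (a sketch, assuming the repository root as the working directory):

    pip install .[quality]
    ruff check tests src benchmarks metrics utils setup.py           # linter
    ruff format --check tests src benchmarks metrics utils setup.py  # formatter
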
20 changes: 7 additions & 13 deletions .pre-commit-config.yaml
@@ -1,15 +1,9 @@
 repos:
-  - repo: https://github.com/psf/black
-    rev: 23.1.0
+  - repo: https://github.com/charliermarsh/ruff-pre-commit  # https://github.com/charliermarsh/ruff#usage
+    rev: 'v0.1.5'
     hooks:
-      - id: black
-        language_version: python3
-        types: [python]
-        stages: [commit]
-        args: ["--config", "pyproject.toml", "tests", "src", "benchmarks", "metrics"]
-  - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: 'v0.0.255'
-    hooks:
-      - id: ruff
-        stages: [commit]
-        args: [ "--config", "pyproject.toml", "tests", "src", "benchmarks", "metrics", "--fix"]
+      # Run the linter.
+      - id: ruff
+        args: [ --fix ]
+      # Run the formatter.
+      - id: ruff-format
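
With the two black/ruff hooks collapsed into a single ruff-pre-commit entry, the usual pre-commit workflow is unchanged (standard pre-commit CLI, shown for convenience):

    pre-commit install          # register the git hook once per clone
    pre-commit run --all-files  # lint and format the whole tree on demand
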
8 changes: 4 additions & 4 deletions Makefile
@@ -5,14 +5,14 @@ check_dirs := tests src benchmarks metrics utils
 # Check that source code meets quality standards

 quality:
-	black --check $(check_dirs) setup.py
-	ruff $(check_dirs) setup.py
+	ruff check $(check_dirs) setup.py  # linter
+	ruff format --check $(check_dirs) setup.py  # formatter

 # Format source code automatically

 style:
-	black tests src benchmarks metrics setup.py
-	ruff $(check_dirs) setup.py --fix
+	ruff check --fix $(check_dirs) setup.py  # linter
+	ruff format $(check_dirs) setup.py  # formatter

 # Run tests for the library
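
For contributors the Makefile entry points stay the same; only the tooling underneath changes:

    make quality  # read-only checks: ruff check + ruff format --check
    make style    # in-place fixes: ruff check --fix + ruff format
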
2 changes: 1 addition & 1 deletion setup.py
@@ -216,7 +216,7 @@
 TESTS_REQUIRE.extend(VISION_REQUIRE)
 TESTS_REQUIRE.extend(AUDIO_REQUIRE)

-QUALITY_REQUIRE = ["black~=23.1", "ruff>=0.0.241", "pyyaml>=5.3.1"]
+QUALITY_REQUIRE = ["ruff>=0.1.5"]

 DOCS_REQUIRE = [
     # Might need to add doc-builder and some specific deps in the future
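
As a result, the quality extra now pulls in a single tool instead of black, an older ruff, and pyyaml:

    pip install .[quality]  # installs only ruff>=0.1.5
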
3 changes: 2 additions & 1 deletion src/datasets/arrow_dataset.py
@@ -3101,7 +3101,8 @@ def load_processed_shard_from_cache(shard_kwargs):
         else:

             def format_cache_file_name(
-                cache_file_name: Optional[str], rank: Union[int, Literal["*"]]  # noqa: F722
+                cache_file_name: Optional[str],
+                rank: Union[int, Literal["*"]],  # noqa: F722
             ) -> Optional[str]:
                 if not cache_file_name:
                     return cache_file_name
10 changes: 6 additions & 4 deletions src/datasets/iterable_dataset.py
@@ -127,8 +127,9 @@ def _convert_to_arrow(
         Drop the last batch if it is smaller than `batch_size`.
     """
     if batch_size is None or batch_size <= 0:
-        yield "all", pa.Table.from_pylist(
-            cast_to_python_objects([example for _, example in iterable], only_1d_for_numpy=True)
+        yield (
+            "all",
+            pa.Table.from_pylist(cast_to_python_objects([example for _, example in iterable], only_1d_for_numpy=True)),
         )
         return
     iterator = iter(iterable)
@@ -1112,8 +1113,9 @@ def __iter__(self):
             # Then for each example, `TypedExamplesIterable` automatically fills missing columns with None.
             # This is done with `_apply_feature_types_on_example`.
             for key, example in self.ex_iterable:
-                yield key, _apply_feature_types_on_example(
-                    example, self.features, token_per_repo_id=self.token_per_repo_id
+                yield (
+                    key,
+                    _apply_feature_types_on_example(example, self.features, token_per_repo_id=self.token_per_repo_id),
                 )

     def _iter_arrow(self) -> Iterator[Tuple[Key, pa.Table]]:
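
These yield rewrites are behavior-preserving: `yield key, value` and `yield (key, value)` emit the same 2-tuple, and ruff-format simply prefers the explicitly parenthesized, one-element-per-line layout once the expression no longer fits on a single line. A minimal standalone illustration (not from the diff):

    def pairs():
        yield "a", 1    # implicit tuple
        yield ("b", 2)  # parenthesized tuple -- identical at runtime

    assert list(pairs()) == [("a", 1), ("b", 2)]
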
8 changes: 2 additions & 6 deletions src/datasets/load.py
@@ -1493,9 +1493,7 @@ def dataset_module_factory(
                 download_config=download_config,
                 download_mode=download_mode,
             ).get_module()
-        except (
-            Exception
-        ) as e1:  # noqa all the attempts failed, before raising the error we should check if the module is already cached.
+        except Exception as e1:  # noqa all the attempts failed, before raising the error we should check if the module is already cached.
             try:
                 return CachedDatasetModuleFactory(path, dynamic_modules_path=dynamic_modules_path).get_module()
             except Exception:  # noqa if it's not in the cache, then it doesn't exist.
@@ -1598,9 +1596,7 @@ def metric_module_factory(
                 download_mode=download_mode,
                 dynamic_modules_path=dynamic_modules_path,
             ).get_module()
-        except (
-            Exception
-        ) as e1:  # noqa all the attempts failed, before raising the error we should check if the module is already cached.
+        except Exception as e1:  # noqa all the attempts failed, before raising the error we should check if the module is already cached.
             try:
                 return CachedMetricModuleFactory(path, dynamic_modules_path=dynamic_modules_path).get_module()
             except Exception:  # noqa if it's not in the cache, then it doesn't exist.
30 changes: 18 additions & 12 deletions src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py
@@ -323,12 +323,15 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad
                             sample_label = {"label": os.path.basename(os.path.dirname(original_file))}
                         else:
                             sample_label = {}
-                        yield file_idx, {
-                            **sample_empty_metadata,
-                            self.BASE_COLUMN_NAME: downloaded_file_or_dir,
-                            **sample_metadata,
-                            **sample_label,
-                        }
+                        yield (
+                            file_idx,
+                            {
+                                **sample_empty_metadata,
+                                self.BASE_COLUMN_NAME: downloaded_file_or_dir,
+                                **sample_metadata,
+                                **sample_label,
+                            },
+                        )
                         file_idx += 1
                 else:
                     for downloaded_dir_file in downloaded_file_or_dir:
@@ -391,10 +394,13 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad
                             sample_label = {"label": os.path.basename(os.path.dirname(downloaded_dir_file))}
                         else:
                             sample_label = {}
-                        yield file_idx, {
-                            **sample_empty_metadata,
-                            self.BASE_COLUMN_NAME: downloaded_dir_file,
-                            **sample_metadata,
-                            **sample_label,
-                        }
+                        yield (
+                            file_idx,
+                            {
+                                **sample_empty_metadata,
+                                self.BASE_COLUMN_NAME: downloaded_dir_file,
+                                **sample_metadata,
+                                **sample_label,
+                            },
+                        )
                         file_idx += 1
3 changes: 3 additions & 0 deletions src/datasets/splits.py
@@ -111,6 +111,7 @@ class SplitBase(metaclass=abc.ABCMeta):
     to define which files to read and how to skip examples within file.

     """
+
     # pylint: enable=line-too-long

     @abc.abstractmethod
@@ -265,6 +266,7 @@ class PercentSlice(metaclass=PercentSliceMeta):
     [guide on splits](../loading#slice-splits)
     for more information.
     """
+
     # pylint: enable=line-too-long
     pass

@@ -438,6 +440,7 @@ class Split:
     ...     )
     ```
     """
+
     # pylint: enable=line-too-long
     TRAIN = NamedSplit("train")
     TEST = NamedSplit("test")
2 changes: 1 addition & 1 deletion src/datasets/utils/patching.py
@@ -63,7 +63,7 @@ def __enter__(self):
                 # We don't check for the name of the global, but rather if its value *is* "os" or "os.path".
                 # This allows to patch renamed modules like "from os import path as ospath".
                 if obj_attr is submodule or (
-                    (isinstance(obj_attr, _PatchedModuleObj) and obj_attr._original_module is submodule)
+                    isinstance(obj_attr, _PatchedModuleObj) and obj_attr._original_module is submodule
                 ):
                     self.original[attr] = obj_attr
                     # patch at top level
4 changes: 1 addition & 3 deletions tests/test_arrow_dataset.py
@@ -3066,9 +3066,7 @@ def test_concatenate_mixed_memory_and_disk(self):
                 cache_file_name=os.path.join(tmp_dir, "d1.arrow")
             ) as dset1, Dataset.from_dict(data2, info=info2).map(
                 cache_file_name=os.path.join(tmp_dir, "d2.arrow")
-            ) as dset2, Dataset.from_dict(
-                data3
-            ) as dset3:
+            ) as dset2, Dataset.from_dict(data3) as dset3:
                 with concatenate_datasets([dset1, dset2, dset3]) as concatenated_dset:
                     self.assertEqual(len(concatenated_dset), len(dset1) + len(dset2) + len(dset3))
                     self.assertListEqual(concatenated_dset["id"], dset1["id"] + dset2["id"] + dset3["id"])
1 change: 1 addition & 0 deletions tests/test_readme_util.py
@@ -11,6 +11,7 @@
 # @pytest.fixture
 # def example_yaml_structure():

+
 example_yaml_structure = yaml.safe_load(
     """\
 name: ""