huggingface · lhoestq · Apr 28, 2025 · Mar 5, 2025
diff --git a/docs/source/about_dataset_load.mdx b/docs/source/about_dataset_load.mdx
@@ -97,7 +97,7 @@ To ensure a dataset is complete, [`load_dataset`] will perform a series of tests
 - The number of splits in the generated `DatasetDict`.
 - The number of samples in each split of the generated `DatasetDict`.
 - The list of downloaded files.
-- The SHA256 checksums of the downloaded files (disabled by defaut).
+- The SHA256 checksums of the downloaded files (disabled by default).
 
 If the dataset doesn't pass the verifications, it is likely that the original host of the dataset made some changes in the data files. 
 

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
@@ -1488,7 +1488,7 @@ def save_to_disk(
             parent_cache_files_paths = {
                 Path(cache_filename["filename"]).resolve().parent for cache_filename in self.cache_files
             }
-            # Check that the dataset doesn't overwrite iself. It can cause a permission error on Windows and a segfault on linux.
+            # Check that the dataset doesn't overwrite itself. It can cause a permission error on Windows and a segfault on Linux.
             if Path(dataset_path).expanduser().resolve() in parent_cache_files_paths:
                 raise PermissionError(
                     f"Tried to overwrite {Path(dataset_path).expanduser().resolve()} but a dataset can't overwrite itself."
@@ -2866,7 +2866,7 @@ def map(
           Note that the last batch may have less than `n` examples.
           A batch is a dictionary, e.g. a batch of `n` examples is `{"text": ["Hello there !"] * n}`.
 
-        If the function is asynchronous, then `map` will run your function in parallel, with up to one thousand simulatenous calls.
+        If the function is asynchronous, then `map` will run your function in parallel, with up to one thousand simultaneous calls.
         It is recommended to use a `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time.
 
         Args:
@@ -3464,7 +3464,7 @@ def iter_outputs(shard_iterable):
                     yield i, apply_function(example, i, offset=offset)
 
         num_examples_progress_update = 0
-        # If `update_data` is True after processing the first example/batch, initalize these resources with `init_buffer_and_writer`
+        # If `update_data` is True after processing the first example/batch, initialize these resources with `init_buffer_and_writer`
         buf_writer, writer, tmp_file = None, None, None
 
         # Check if Polars is available and import it if so
@@ -3648,7 +3648,7 @@ def filter(
         """Apply a filter function to all the elements in the table in batches
         and update the table so that the dataset only includes examples according to the filter function.
 
-        If the function is asynchronous, then `filter` will run your function in parallel, with up to one thousand simulatenous calls (configurable).
+        If the function is asynchronous, then `filter` will run your function in parallel, with up to one thousand simultaneous calls (configurable).
         It is recommended to use a `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time.
 
         Args:
@@ -4266,7 +4266,7 @@ def sort(
                     f"Column '{column}' not found in the dataset. Please provide a column selected in: {self._data.column_names}"
                 )
 
-        # Change null_placement to conform to pyarrow's sort_indices() while ensuring backwards compatability
+        # Change null_placement to conform to pyarrow's sort_indices() while ensuring backwards compatibility
         if null_placement not in ["at_start", "at_end"]:
             if null_placement == "first":
                 null_placement = "at_start"
@@ -5334,7 +5334,7 @@ def _push_parquet_shards_to_hub(
         Returns:
             additions (`List[CommitOperation]`): list of the `CommitOperationAdd` of the uploaded shards
             uploaded_size (`int`): number of uploaded bytes to the repository
-            dataset_nbytes (`int`): approximate size in bytes of the uploaded dataset afer uncompression
+            dataset_nbytes (`int`): approximate size in bytes of the uploaded dataset after uncompression
         """
         # Find decodable columns, because if there are any, we need to:
         # embed the bytes from the files in the shards
@@ -6166,7 +6166,7 @@ def _concatenate_map_style_datasets(
         # Return first dataset if all datasets are empty
         return dsets[0]
 
-    # Perform checks (and a potentional cast if axis=0)
+    # Perform checks (and a potential cast if axis=0)
     if axis == 0:
         _check_if_features_can_be_aligned([dset.features for dset in dsets])
     else:

diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py
@@ -173,7 +173,7 @@ def get_inferred_type(self) -> FeatureType:
     def _infer_custom_type_and_encode(data: Iterable) -> tuple[Iterable, Optional[FeatureType]]:
         """Implement type inference for custom objects like PIL.Image.Image -> Image type.
 
-        This function is only used for custom python objects that can't be direclty passed to build
+        This function is only used for custom python objects that can't be directly passed to build
         an Arrow array. In such cases is infers the feature type to use, and it encodes the data so
         that they can be passed to an Arrow array.
 
@@ -478,7 +478,7 @@ def write_examples_on_file(self):
         batch_examples = {}
         for col in cols:
             # We use row[0][col] since current_examples contains (example, key) tuples.
-            # Morever, examples could be Arrow arrays of 1 element.
+            # Moreover, examples could be Arrow arrays of 1 element.
             # This can happen in `.map()` when we want to re-write the same Arrow data
             if all(isinstance(row[0][col], (pa.Array, pa.ChunkedArray)) for row in self.current_examples):
                 arrays = [row[0][col] for row in self.current_examples]
@@ -532,7 +532,7 @@ def write(
         if writer_batch_size is not None and len(self.current_examples) >= writer_batch_size:
             if self._check_duplicates:
                 self.check_duplicate_keys()
-                # Re-intializing to empty list for next batch
+                # Re-initializing to empty list for next batch
                 self.hkey_record = []
 
             self.write_examples_on_file()
@@ -632,7 +632,7 @@ def finalize(self, close_stream=True):
         # In case current_examples < writer_batch_size, but user uses finalize()
         if self._check_duplicates:
             self.check_duplicate_keys()
-            # Re-intializing to empty list for next batch
+            # Re-initializing to empty list for next batch
             self.hkey_record = []
         self.write_examples_on_file()
         # If schema is known, infer features even if no examples were written

diff --git a/src/datasets/config.py b/src/datasets/config.py
@@ -264,5 +264,5 @@
 # Maximum number of uploaded files per commit
 UPLOADS_MAX_NUMBER_PER_COMMIT = 50
 
-# Backward compatibiliy
+# Backward compatibility
 MAX_TABLE_NBYTES_FOR_PICKLING = 4 << 30
diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py
@@ -113,7 +113,7 @@ class EmptyDatasetError(FileNotFoundError):
 
 
 def contains_wildcards(pattern: str) -> bool:
-    return any(wilcard_character in pattern for wilcard_character in WILDCARD_CHARACTERS)
+    return any(wildcard_character in pattern for wildcard_character in WILDCARD_CHARACTERS)
 
 
 def sanitize_patterns(patterns: Union[dict, list, str]) -> dict[str, Union[list[str], "DataFilesList"]]:
@@ -156,7 +156,7 @@ def sanitize_patterns(patterns: Union[dict, list, str]) -> dict[str, Union[list[
 
 def _is_inside_unrequested_special_dir(matched_rel_path: str, pattern: str) -> bool:
     """
-    When a path matches a pattern, we additionnally check if it's inside a special directory
+    When a path matches a pattern, we additionally check if it's inside a special directory
     we ignore by default (if it starts with a double underscore).
 
     Users can still explicitly request a filepath inside such a directory if "__pycache__" is
@@ -179,7 +179,7 @@ def _is_inside_unrequested_special_dir(matched_rel_path: str, pattern: str) -> b
     >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__*/*")
     False
     """
-    # We just need to check if every special directories from the path is present explicly in the pattern.
+    # We just need to check if every special directory from the path is present explicitly in the pattern.
     # Since we assume that the path matches the pattern, it's equivalent to counting that both
     # the parent path and the parent pattern have the same number of special directories.
     data_dirs_to_ignore_in_path = [part for part in PurePath(matched_rel_path).parent.parts if part.startswith("__")]
@@ -189,7 +189,7 @@ def _is_inside_unrequested_special_dir(matched_rel_path: str, pattern: str) -> b
 
 def _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(matched_rel_path: str, pattern: str) -> bool:
     """
-    When a path matches a pattern, we additionnally check if it's a hidden file or if it's inside
+    When a path matches a pattern, we additionally check if it's a hidden file or if it's inside
     a hidden directory we ignore by default, i.e. if the file name or a parent directory name starts with a dot.
 
     Users can still explicitly request a filepath that is hidden or is inside a hidden directory
@@ -237,7 +237,7 @@ def _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(matched_rel_
     >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/.*")
     False
     """
-    # We just need to check if every hidden part from the path is present explicly in the pattern.
+    # We just need to check if every hidden part from the path is present explicitly in the pattern.
     # Since we assume that the path matches the pattern, it's equivalent to counting that both
     # the path and the pattern have the same number of hidden parts.
     hidden_directories_in_path = [
@@ -316,7 +316,7 @@ def resolve_pattern(
 
     Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested.
     The same applies to special directories that start with a double underscore like "__pycache__".
-    You can still include one if the pattern explicilty mentions it:
+    You can still include one if the pattern explicitly mentions it:
     - to include a hidden file: "*/.hidden.txt" or "*/.*"
     - to include a hidden directory: ".hidden/*" or ".*/*"
     - to include a special directory: "__special__/*" or "__*/*"

diff --git a/src/datasets/download/download_config.py b/src/datasets/download/download_config.py
@@ -15,7 +15,7 @@ class DownloadConfig:
             Specify a cache directory to save the file to (overwrite the
             default cache dir).
         force_download (`bool`, defaults to `False`):
-            If `True`, re-dowload the file even if it's already cached in
+            If `True`, re-download the file even if it's already cached in
             the cache dir.
         resume_download (`bool`, defaults to `False`):
             If `True`, resume the download if an incompletely received file is

diff --git a/src/datasets/features/translation.py b/src/datasets/features/translation.py
@@ -11,7 +11,7 @@
 @dataclass
 class Translation:
     """`Feature` for translations with fixed languages per example.
-    Here for compatiblity with tfds.
+    Here for compatibility with tfds.
 
     Args:
         languages (`dict`):
@@ -51,7 +51,7 @@ def flatten(self) -> Union["FeatureType", dict[str, "FeatureType"]]:
 @dataclass
 class TranslationVariableLanguages:
     """`Feature` for translations with variable languages per example.
-    Here for compatiblity with tfds.
+    Here for compatibility with tfds.
 
     Args:
         languages (`dict`):

diff --git a/src/datasets/formatting/__init__.py b/src/datasets/formatting/__init__.py
@@ -124,7 +124,7 @@ def get_formatter(format_type: Optional[str], **format_kwargs) -> Formatter:
     """
     Factory function to get a Formatter given its type name and keyword arguments.
     A formatter is an object that extracts and formats data from pyarrow table.
-    It defines the formatting for rows, colums and batches.
+    It defines the formatting for rows, columns and batches.
     If the formatter for a given type name doesn't exist or is not available, an error is raised.
     """
     format_type = get_format_type_from_alias(format_type)

diff --git a/src/datasets/formatting/formatting.py b/src/datasets/formatting/formatting.py
@@ -499,7 +499,7 @@ class CustomFormatter(Formatter[dict, ColumnFormat, dict]):
     The transform must take as input a batch of data extracted for an arrow table using the python extractor,
     and return a batch.
     If the output batch is not a dict, then output_all_columns won't work.
-    If the ouput batch has several fields, then querying a single column won't work since we don't know which field
+    If the output batch has several fields, then querying a single column won't work since we don't know which field
     to return.
     """
 

diff --git a/src/datasets/search.py b/src/datasets/search.py
@@ -74,7 +74,7 @@ def search_batch(self, queries, k: int = 10, **kwargs) -> BatchedSearchResults:
             queries (`Union[List[str], np.ndarray]`): The queries as a list of strings if `column` is a text index or as a numpy array if `column` is a vector index.
             k (`int`): The number of examples to retrieve per query.
 
-        Ouput:
+        Output:
             total_scores (`List[List[float]`): The retrieval scores of the retrieved examples per query.
             total_indices (`List[List[int]]`): The indices of the retrieved examples per query.
         """
@@ -186,7 +186,7 @@ def search(self, query: str, k=10, **kwargs) -> SearchResults:
             query (`str`): The query as a string.
             k (`int`): The number of examples to retrieve.
 
-        Ouput:
+        Output:
             scores (`List[List[float]`): The retrieval scores of the retrieved examples.
             indices (`List[List[int]]`): The indices of the retrieved examples.
         """
@@ -353,7 +353,7 @@ def search(self, query: np.array, k=10, **kwargs) -> SearchResults:
             query (`np.array`): The query as a numpy array.
             k (`int`): The number of examples to retrieve.
 
-        Ouput:
+        Output:
             scores (`List[List[float]`): The retrieval scores of the retrieved examples.
             indices (`List[List[int]]`): The indices of the retrieved examples.
         """
@@ -373,7 +373,7 @@ def search_batch(self, queries: np.array, k=10, **kwargs) -> BatchedSearchResult
             queries (`np.array`): The queries as a numpy array.
             k (`int`): The number of examples to retrieve.
 
-        Ouput:
+        Output:
             total_scores (`List[List[float]`): The retrieval scores of the retrieved examples per query.
             total_indices (`List[List[int]]`): The indices of the retrieved examples per query.
         """

diff --git a/src/datasets/table.py b/src/datasets/table.py
@@ -53,7 +53,7 @@ def _memory_mapped_record_batch_reader_from_file(filename: str) -> pa.RecordBatc
 def read_schema_from_file(filename: str) -> pa.Schema:
     """
     Infer arrow table schema from file without loading whole file into memory.
-    Usefull especially while having very big files.
+    Especially useful when working with very big files.
     """
     with pa.memory_map(filename) as memory_mapped_stream:
         schema = pa.ipc.open_stream(memory_mapped_stream).schema