Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/source/about_dataset_load.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ To ensure a dataset is complete, [`load_dataset`] will perform a series of tests
- The number of splits in the generated `DatasetDict`.
- The number of samples in each split of the generated `DatasetDict`.
- The list of downloaded files.
- The SHA256 checksums of the downloaded files (disabled by defaut).
- The SHA256 checksums of the downloaded files (disabled by default).

If the dataset doesn't pass the verifications, it is likely that the original host of the dataset made some changes in the data files.

Expand Down
14 changes: 7 additions & 7 deletions src/datasets/arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1488,7 +1488,7 @@ def save_to_disk(
parent_cache_files_paths = {
Path(cache_filename["filename"]).resolve().parent for cache_filename in self.cache_files
}
# Check that the dataset doesn't overwrite iself. It can cause a permission error on Windows and a segfault on linux.
# Check that the dataset doesn't overwrite itself. It can cause a permission error on Windows and a segfault on linux.
if Path(dataset_path).expanduser().resolve() in parent_cache_files_paths:
raise PermissionError(
f"Tried to overwrite {Path(dataset_path).expanduser().resolve()} but a dataset can't overwrite itself."
Expand Down Expand Up @@ -2866,7 +2866,7 @@ def map(
Note that the last batch may have less than `n` examples.
A batch is a dictionary, e.g. a batch of `n` examples is `{"text": ["Hello there !"] * n}`.

If the function is asynchronous, then `map` will run your function in parallel, with up to one thousand simulatenous calls.
If the function is asynchronous, then `map` will run your function in parallel, with up to one thousand simultaneous calls.
It is recommended to use an `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time.

Args:
Expand Down Expand Up @@ -3464,7 +3464,7 @@ def iter_outputs(shard_iterable):
yield i, apply_function(example, i, offset=offset)

num_examples_progress_update = 0
# If `update_data` is True after processing the first example/batch, initalize these resources with `init_buffer_and_writer`
# If `update_data` is True after processing the first example/batch, initialize these resources with `init_buffer_and_writer`
buf_writer, writer, tmp_file = None, None, None

# Check if Polars is available and import it if so
Expand Down Expand Up @@ -3648,7 +3648,7 @@ def filter(
"""Apply a filter function to all the elements in the table in batches
and update the table so that the dataset only includes examples according to the filter function.

If the function is asynchronous, then `filter` will run your function in parallel, with up to one thousand simulatenous calls (configurable).
If the function is asynchronous, then `filter` will run your function in parallel, with up to one thousand simultaneous calls (configurable).
It is recommended to use an `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time.

Args:
Expand Down Expand Up @@ -4266,7 +4266,7 @@ def sort(
f"Column '{column}' not found in the dataset. Please provide a column selected in: {self._data.column_names}"
)

# Change null_placement to conform to pyarrow's sort_indices() while ensuring backwards compatability
# Change null_placement to conform to pyarrow's sort_indices() while ensuring backwards compatibility
if null_placement not in ["at_start", "at_end"]:
if null_placement == "first":
null_placement = "at_start"
Expand Down Expand Up @@ -5334,7 +5334,7 @@ def _push_parquet_shards_to_hub(
Returns:
additions (`List[CommitOperation]`): list of the `CommitOperationAdd` of the uploaded shards
uploaded_size (`int`): number of uploaded bytes to the repository
dataset_nbytes (`int`): approximate size in bytes of the uploaded dataset afer uncompression
dataset_nbytes (`int`): approximate size in bytes of the uploaded dataset after uncompression
"""
# Find decodable columns, because if there are any, we need to:
# embed the bytes from the files in the shards
Expand Down Expand Up @@ -6166,7 +6166,7 @@ def _concatenate_map_style_datasets(
# Return first dataset if all datasets are empty
return dsets[0]

# Perform checks (and a potentional cast if axis=0)
# Perform checks (and a potential cast if axis=0)
if axis == 0:
_check_if_features_can_be_aligned([dset.features for dset in dsets])
else:
Expand Down
8 changes: 4 additions & 4 deletions src/datasets/arrow_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def get_inferred_type(self) -> FeatureType:
def _infer_custom_type_and_encode(data: Iterable) -> tuple[Iterable, Optional[FeatureType]]:
"""Implement type inference for custom objects like PIL.Image.Image -> Image type.

This function is only used for custom python objects that can't be direclty passed to build
This function is only used for custom python objects that can't be directly passed to build
an Arrow array. In such cases it infers the feature type to use, and it encodes the data so
that they can be passed to an Arrow array.

Expand Down Expand Up @@ -478,7 +478,7 @@ def write_examples_on_file(self):
batch_examples = {}
for col in cols:
# We use row[0][col] since current_examples contains (example, key) tuples.
# Morever, examples could be Arrow arrays of 1 element.
# Moreover, examples could be Arrow arrays of 1 element.
# This can happen in `.map()` when we want to re-write the same Arrow data
if all(isinstance(row[0][col], (pa.Array, pa.ChunkedArray)) for row in self.current_examples):
arrays = [row[0][col] for row in self.current_examples]
Expand Down Expand Up @@ -532,7 +532,7 @@ def write(
if writer_batch_size is not None and len(self.current_examples) >= writer_batch_size:
if self._check_duplicates:
self.check_duplicate_keys()
# Re-intializing to empty list for next batch
# Re-initializing to empty list for next batch
self.hkey_record = []

self.write_examples_on_file()
Expand Down Expand Up @@ -632,7 +632,7 @@ def finalize(self, close_stream=True):
# In case current_examples < writer_batch_size, but user uses finalize()
if self._check_duplicates:
self.check_duplicate_keys()
# Re-intializing to empty list for next batch
# Re-initializing to empty list for next batch
self.hkey_record = []
self.write_examples_on_file()
# If schema is known, infer features even if no examples were written
Expand Down
2 changes: 1 addition & 1 deletion src/datasets/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,5 +264,5 @@
# Maximum number of uploaded files per commit
UPLOADS_MAX_NUMBER_PER_COMMIT = 50

# Backward compatibiliy
# Backward compatibility
MAX_TABLE_NBYTES_FOR_PICKLING = 4 << 30
12 changes: 6 additions & 6 deletions src/datasets/data_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ class EmptyDatasetError(FileNotFoundError):


def contains_wildcards(pattern: str) -> bool:
return any(wilcard_character in pattern for wilcard_character in WILDCARD_CHARACTERS)
return any(wildcard_character in pattern for wildcard_character in WILDCARD_CHARACTERS)


def sanitize_patterns(patterns: Union[dict, list, str]) -> dict[str, Union[list[str], "DataFilesList"]]:
Expand Down Expand Up @@ -156,7 +156,7 @@ def sanitize_patterns(patterns: Union[dict, list, str]) -> dict[str, Union[list[

def _is_inside_unrequested_special_dir(matched_rel_path: str, pattern: str) -> bool:
"""
When a path matches a pattern, we additionnally check if it's inside a special directory
When a path matches a pattern, we additionally check if it's inside a special directory
we ignore by default (if it starts with a double underscore).

Users can still explicitly request a filepath inside such a directory if "__pycache__" is
Expand All @@ -179,7 +179,7 @@ def _is_inside_unrequested_special_dir(matched_rel_path: str, pattern: str) -> b
>>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__*/*")
False
"""
# We just need to check if every special directories from the path is present explicly in the pattern.
# We just need to check if every special directory from the path is present explicitly in the pattern.
# Since we assume that the path matches the pattern, it's equivalent to counting that both
# the parent path and the parent pattern have the same number of special directories.
data_dirs_to_ignore_in_path = [part for part in PurePath(matched_rel_path).parent.parts if part.startswith("__")]
Expand All @@ -189,7 +189,7 @@ def _is_inside_unrequested_special_dir(matched_rel_path: str, pattern: str) -> b

def _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(matched_rel_path: str, pattern: str) -> bool:
"""
When a path matches a pattern, we additionnally check if it's a hidden file or if it's inside
When a path matches a pattern, we additionally check if it's a hidden file or if it's inside
a hidden directory we ignore by default, i.e. if the file name or a parent directory name starts with a dot.

Users can still explicitly request a filepath that is hidden or is inside a hidden directory
Expand Down Expand Up @@ -237,7 +237,7 @@ def _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(matched_rel_
>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/.*")
False
"""
# We just need to check if every hidden part from the path is present explicly in the pattern.
# We just need to check if every hidden part from the path is present explicitly in the pattern.
# Since we assume that the path matches the pattern, it's equivalent to counting that both
# the path and the pattern have the same number of hidden parts.
hidden_directories_in_path = [
Expand Down Expand Up @@ -316,7 +316,7 @@ def resolve_pattern(

Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested.
The same applies to special directories that start with a double underscore like "__pycache__".
You can still include one if the pattern explicilty mentions it:
You can still include one if the pattern explicitly mentions it:
- to include a hidden file: "*/.hidden.txt" or "*/.*"
- to include a hidden directory: ".hidden/*" or ".*/*"
- to include a special directory: "__special__/*" or "__*/*"
Expand Down
2 changes: 1 addition & 1 deletion src/datasets/download/download_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class DownloadConfig:
Specify a cache directory to save the file to (overwrite the
default cache dir).
force_download (`bool`, defaults to `False`):
If `True`, re-dowload the file even if it's already cached in
If `True`, re-download the file even if it's already cached in
the cache dir.
resume_download (`bool`, defaults to `False`):
If `True`, resume the download if an incompletely received file is
Expand Down
4 changes: 2 additions & 2 deletions src/datasets/features/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
@dataclass
class Translation:
"""`Feature` for translations with fixed languages per example.
Here for compatiblity with tfds.
Here for compatibility with tfds.

Args:
languages (`dict`):
Expand Down Expand Up @@ -51,7 +51,7 @@ def flatten(self) -> Union["FeatureType", dict[str, "FeatureType"]]:
@dataclass
class TranslationVariableLanguages:
"""`Feature` for translations with variable languages per example.
Here for compatiblity with tfds.
Here for compatibility with tfds.

Args:
languages (`dict`):
Expand Down
2 changes: 1 addition & 1 deletion src/datasets/formatting/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def get_formatter(format_type: Optional[str], **format_kwargs) -> Formatter:
"""
Factory function to get a Formatter given its type name and keyword arguments.
A formatter is an object that extracts and formats data from pyarrow table.
It defines the formatting for rows, colums and batches.
It defines the formatting for rows, columns and batches.
If the formatter for a given type name doesn't exist or is not available, an error is raised.
"""
format_type = get_format_type_from_alias(format_type)
Expand Down
2 changes: 1 addition & 1 deletion src/datasets/formatting/formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,7 +499,7 @@ class CustomFormatter(Formatter[dict, ColumnFormat, dict]):
The transform must take as input a batch of data extracted for an arrow table using the python extractor,
and return a batch.
If the output batch is not a dict, then output_all_columns won't work.
If the ouput batch has several fields, then querying a single column won't work since we don't know which field
If the output batch has several fields, then querying a single column won't work since we don't know which field
to return.
"""

Expand Down
8 changes: 4 additions & 4 deletions src/datasets/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def search_batch(self, queries, k: int = 10, **kwargs) -> BatchedSearchResults:
queries (`Union[List[str], np.ndarray]`): The queries as a list of strings if `column` is a text index or as a numpy array if `column` is a vector index.
k (`int`): The number of examples to retrieve per query.

Ouput:
Output:
total_scores (`List[List[float]]`): The retrieval scores of the retrieved examples per query.
total_indices (`List[List[int]]`): The indices of the retrieved examples per query.
"""
Expand Down Expand Up @@ -186,7 +186,7 @@ def search(self, query: str, k=10, **kwargs) -> SearchResults:
query (`str`): The query as a string.
k (`int`): The number of examples to retrieve.

Ouput:
Output:
scores (`List[List[float]]`): The retrieval scores of the retrieved examples.
indices (`List[List[int]]`): The indices of the retrieved examples.
"""
Expand Down Expand Up @@ -353,7 +353,7 @@ def search(self, query: np.array, k=10, **kwargs) -> SearchResults:
query (`np.array`): The query as a numpy array.
k (`int`): The number of examples to retrieve.

Ouput:
Output:
scores (`List[List[float]]`): The retrieval scores of the retrieved examples.
indices (`List[List[int]]`): The indices of the retrieved examples.
"""
Expand All @@ -373,7 +373,7 @@ def search_batch(self, queries: np.array, k=10, **kwargs) -> BatchedSearchResult
queries (`np.array`): The queries as a numpy array.
k (`int`): The number of examples to retrieve.

Ouput:
Output:
total_scores (`List[List[float]]`): The retrieval scores of the retrieved examples per query.
total_indices (`List[List[int]]`): The indices of the retrieved examples per query.
"""
Expand Down
2 changes: 1 addition & 1 deletion src/datasets/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def _memory_mapped_record_batch_reader_from_file(filename: str) -> pa.RecordBatc
def read_schema_from_file(filename: str) -> pa.Schema:
"""
Infer arrow table schema from file without loading whole file into memory.
Usefull especially while having very big files.
Especially useful for very large files.
"""
with pa.memory_map(filename) as memory_mapped_stream:
schema = pa.ipc.open_stream(memory_mapped_stream).schema
Expand Down