Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions src/datasets/arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
from huggingface_hub import CommitOperationAdd, CommitOperationDelete, DatasetCard, DatasetCardData, HfApi
from huggingface_hub import CommitInfo, CommitOperationAdd, CommitOperationDelete, DatasetCard, DatasetCardData, HfApi
from multiprocess import Pool

from . import config
Expand Down Expand Up @@ -5235,7 +5235,7 @@ def push_to_hub(
max_shard_size: Optional[Union[int, str]] = None,
num_shards: Optional[int] = None,
embed_external_files: bool = True,
):
) -> CommitInfo:
"""Pushes the dataset to the hub as a Parquet dataset.
The dataset is pushed using HTTP requests and does not require git or git-lfs to be installed.

Expand Down Expand Up @@ -5290,6 +5290,9 @@ def push_to_hub(

- [`Audio`] and [`Image`]: remove local path information and embed file content in the Parquet files.

Return:
huggingface_hub.CommitInfo

Example:

```python
Expand Down Expand Up @@ -5503,7 +5506,7 @@ def push_to_hub(

commit_message = commit_message if commit_message is not None else "Upload dataset"
if len(additions) <= config.UPLOADS_MAX_NUMBER_PER_COMMIT:
api.create_commit(
commit_info = api.create_commit(
repo_id,
operations=additions + deletions,
commit_message=commit_message,
Expand All @@ -5521,7 +5524,7 @@ def push_to_hub(
operations = additions[
i * config.UPLOADS_MAX_NUMBER_PER_COMMIT : (i + 1) * config.UPLOADS_MAX_NUMBER_PER_COMMIT
] + (deletions if i == 0 else [])
api.create_commit(
commit_info = api.create_commit(
repo_id,
operations=operations,
commit_message=commit_message + f" (part {i:05d}-of-{num_commits:05d})",
Expand All @@ -5535,6 +5538,7 @@ def push_to_hub(
+ (f" (still {num_commits - i - 1} to go)" if num_commits - i - 1 else "")
+ "."
)
return commit_info

@transmit_format
@fingerprint_transform(inplace=False)
Expand Down
11 changes: 8 additions & 3 deletions src/datasets/dataset_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import fsspec
import numpy as np
from huggingface_hub import (
CommitInfo,
CommitOperationAdd,
CommitOperationDelete,
DatasetCard,
Expand Down Expand Up @@ -1564,7 +1565,7 @@ def push_to_hub(
max_shard_size: Optional[Union[int, str]] = None,
num_shards: Optional[Dict[str, int]] = None,
embed_external_files: bool = True,
):
) -> CommitInfo:
"""Pushes the [`DatasetDict`] to the hub as a Parquet dataset.
The [`DatasetDict`] is pushed using HTTP requests and does not require git or git-lfs to be installed.

Expand Down Expand Up @@ -1621,6 +1622,9 @@ def push_to_hub(

- [`Audio`] and [`Image`]: remove local path information and embed file content in the Parquet files.

Return:
huggingface_hub.CommitInfo

Example:

```python
Expand Down Expand Up @@ -1780,7 +1784,7 @@ def push_to_hub(

commit_message = commit_message if commit_message is not None else "Upload dataset"
if len(additions) <= config.UPLOADS_MAX_NUMBER_PER_COMMIT:
api.create_commit(
commit_info = api.create_commit(
repo_id,
operations=additions + deletions,
commit_message=commit_message,
Expand All @@ -1798,7 +1802,7 @@ def push_to_hub(
operations = additions[
i * config.UPLOADS_MAX_NUMBER_PER_COMMIT : (i + 1) * config.UPLOADS_MAX_NUMBER_PER_COMMIT
] + (deletions if i == 0 else [])
api.create_commit(
commit_info = api.create_commit(
repo_id,
operations=operations,
commit_message=commit_message + f" (part {i:05d}-of-{num_commits:05d})",
Expand All @@ -1812,6 +1816,7 @@ def push_to_hub(
+ (f" (still {num_commits - i - 1} to go)" if num_commits - i - 1 else "")
+ "."
)
return commit_info


class IterableDatasetDict(dict):
Expand Down