44 changes: 44 additions & 0 deletions docs/source/en/guides/upload.md
@@ -431,6 +431,50 @@ In addition to [`upload_file`] and [`upload_folder`], the following functions al

For more detailed information, take a look at the [`HfApi`] reference.

### Preupload LFS files before commit

In some cases, you might want to upload huge files to S3 **before** making the commit call. For example, if you are
committing a dataset in several shards that are generated in-memory, you would need to upload the shards one by one
to avoid an out-of-memory issue. One solution is to upload each shard as a separate commit on the repo. While
perfectly valid, this solution has the drawback of potentially cluttering the git history with tens of commits.
To overcome this issue, you can upload your files one by one to S3 and then create a single commit at the end. This
is possible using [`preupload_lfs_files`] in combination with [`create_commit`].

<Tip warning={true}>

This is a power-user method. In the vast majority of cases, using [`upload_file`], [`upload_folder`] or
[`create_commit`] directly instead of handling the low-level logic of pre-uploading files is the way to go. The main
caveat of [`preupload_lfs_files`] is that until the commit is actually made, the uploaded files are not accessible on
the repo on the Hub. If you have a question, feel free to ping us on our Discord or in a GitHub issue.

</Tip>

Here is a simple example illustrating how to pre-upload files:

```py
>>> from huggingface_hub import CommitOperationAdd, preupload_lfs_files, create_commit, create_repo

>>> repo_id = create_repo("test_preupload").repo_id

>>> operations = [] # List of all `CommitOperationAdd` objects that will be generated
>>> for i in range(5):
... content = ... # generate binary content
... addition = CommitOperationAdd(path_in_repo=f"shard_{i}_of_5.bin", path_or_fileobj=content)
... preupload_lfs_files(repo_id, additions=[addition])
... operations.append(addition)

>>> # Create commit
>>> create_commit(repo_id, operations=operations, commit_message="Commit all shards")
```

First, we create the [`CommitOperationAdd`] objects one by one. In a real-world example, those would contain the
generated shards. Each file is uploaded before generating the next one. During the [`preupload_lfs_files`] step, **the
`CommitOperationAdd` object is mutated**. You should use it only to pass it directly to [`create_commit`]. The main
update to the object is that **the binary content is removed** from it, meaning that it will be garbage-collected if
you don't store another reference to it. This is expected, as we don't want to keep in memory content that has already
been uploaded. Finally, we create the commit by passing all the operations to [`create_commit`]. You can pass
additional operations (add, delete or copy) that have not been processed yet and they will be handled correctly, as in
the sketch below.
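
For illustration, here is a minimal sketch that mixes the pre-uploaded shards with operations that have not been
pre-uploaded; the `README.md` content and the `old_shard.bin` path are hypothetical:

```py
>>> from huggingface_hub import CommitOperationDelete

>>> # Operations that were never pre-uploaded: a small regular file and a
>>> # deletion. `create_commit` processes them itself. The paths and the
>>> # README content below are hypothetical.
>>> readme = CommitOperationAdd(path_in_repo="README.md", path_or_fileobj=b"# My shards\n")
>>> delete_old = CommitOperationDelete(path_in_repo="old_shard.bin")

>>> create_commit(
...     repo_id,
...     operations=operations + [readme, delete_old],
...     commit_message="Commit all shards, add a README and drop an old shard",
... )
```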

## Tips and tricks for large uploads

There are some limitations to be aware of when dealing with a large amount of data in your repo. Given the time it takes to stream the data,
2 changes: 2 additions & 0 deletions src/huggingface_hub/__init__.py
@@ -194,6 +194,7 @@
"model_info",
"move_repo",
"pause_space",
"preupload_lfs_files",
"rename_discussion",
"repo_exists",
"repo_info",
@@ -512,6 +513,7 @@ def __dir__():
model_info, # noqa: F401
move_repo, # noqa: F401
pause_space, # noqa: F401
preupload_lfs_files, # noqa: F401
rename_discussion, # noqa: F401
repo_exists, # noqa: F401
repo_info, # noqa: F401
48 changes: 29 additions & 19 deletions src/huggingface_hub/_commit_api.py
@@ -136,10 +136,23 @@ class CommitOperationAdd:
path_or_fileobj: Union[str, Path, bytes, BinaryIO]
upload_info: UploadInfo = field(init=False, repr=False)

# Internal attributes
_upload_mode: Optional[UploadMode] = None # set to "lfs" or "regular" once known
_is_uploaded: bool = False # set to True once the file has been uploaded as LFS
_is_committed: bool = False # set to True once the file has been committed

def __post_init__(self) -> None:
"""Validates `path_or_fileobj` and compute `upload_info`."""
self.path_in_repo = _validate_path_in_repo(self.path_in_repo)

# Validate `_is_uploaded` and `_upload_mode` cannot be set by user
if self._is_uploaded is not False:
raise ValueError("Attribute `_is_uploaded` cannot be set manually.")
if self._upload_mode is not None:
raise ValueError("Attribute `_upload_mode` cannot be set manually.")
if self._is_committed is not False:
raise ValueError("Attribute `_is_committed` cannot be set manually.")

# Validate `path_or_fileobj` value
if isinstance(self.path_or_fileobj, Path):
self.path_or_fileobj = str(self.path_or_fileobj)
@@ -250,7 +263,7 @@ def _validate_path_in_repo(path_in_repo: str) -> str:
CommitOperation = Union[CommitOperationAdd, CommitOperationCopy, CommitOperationDelete]


def warn_on_overwriting_operations(operations: List[CommitOperation]) -> None:
def _warn_on_overwriting_operations(operations: List[CommitOperation]) -> None:
"""
Warn user when a list of operations is expected to overwrite itself in a single
commit.
@@ -297,7 +310,7 @@ def warn_on_overwriting_operations(operations: List[CommitOperation]) -> None:


@validate_hf_hub_args
def upload_lfs_files(
def _upload_lfs_files(
*,
additions: List[CommitOperationAdd],
repo_type: str,
@@ -419,18 +432,18 @@ def _validate_preupload_info(preupload_info: dict):


@validate_hf_hub_args
def fetch_upload_modes(
def _fetch_upload_modes(
additions: Iterable[CommitOperationAdd],
repo_type: str,
repo_id: str,
token: Optional[str],
revision: str,
endpoint: Optional[str] = None,
create_pr: bool = False,
) -> Dict[str, UploadMode]:
) -> None:
"""
Requests the Hub "preupload" endpoint to determine whether each input file
should be uploaded as a regular git blob or as git LFS blob.
Requests the Hub "preupload" endpoint to determine whether each input file should be uploaded as a regular git blob
or as git LFS blob. Input `additions` are mutated in-place with the upload mode.

Args:
additions (`Iterable` of :class:`CommitOperationAdd`):
@@ -446,9 +459,6 @@ def fetch_upload_modes(
revision (`str`):
The git revision to upload the files to. Can be any valid git revision.

Returns: `Dict[str, UploadMode]`
Key is the file path, value is the upload mode ("regular" or "lfs").

Raises:
[`~utils.HfHubHTTPError`]
If the Hub API returned an error.
@@ -483,18 +493,19 @@ def fetch_upload_modes(
preupload_info = _validate_preupload_info(resp.json())
upload_modes.update(**{file["path"]: file["uploadMode"] for file in preupload_info["files"]})

# Set upload mode for each addition operation
for addition in additions:
addition._upload_mode = upload_modes[addition.path_in_repo]

# Empty files cannot be uploaded as LFS (S3 would fail with a 501 Not Implemented)
# => empty files are uploaded as "regular" to still allow users to commit them.
for addition in additions:
if addition.upload_info.size == 0:
path = addition.path_in_repo
upload_modes[path] = "regular"

return upload_modes
addition._upload_mode = "regular"


@validate_hf_hub_args
def fetch_lfs_files_to_copy(
def _fetch_lfs_files_to_copy(
copies: Iterable[CommitOperationCopy],
repo_type: str,
repo_id: str,
@@ -555,9 +566,8 @@ def fetch_lfs_files_to_copy(
return files_to_copy


def prepare_commit_payload(
def _prepare_commit_payload(
operations: Iterable[CommitOperation],
upload_modes: Dict[str, UploadMode],
files_to_copy: Dict[Tuple[str, Optional[str]], "RepoFile"],
commit_message: str,
commit_description: Optional[str] = None,
@@ -584,7 +594,7 @@ def prepare_commit_payload(
# 2. Send operations, one per line
for operation in operations:
# 2.a. Case adding a regular file
if isinstance(operation, CommitOperationAdd) and upload_modes.get(operation.path_in_repo) == "regular":
if isinstance(operation, CommitOperationAdd) and operation._upload_mode == "regular":
yield {
"key": "file",
"value": {
@@ -594,7 +604,7 @@
},
}
# 2.b. Case adding an LFS file
elif isinstance(operation, CommitOperationAdd) and upload_modes.get(operation.path_in_repo) == "lfs":
elif isinstance(operation, CommitOperationAdd) and operation._upload_mode == "lfs":
yield {
"key": "lfsFile",
"value": {
@@ -627,5 +637,5 @@ def prepare_commit_payload(
else:
raise ValueError(
f"Unknown operation to commit. Operation: {operation}. Upload mode:"
f" {upload_modes.get(operation.path_in_repo)}"
f" {getattr(operation, '_upload_mode', None)}"
)