diff --git a/docs/source/en/guides/upload.md b/docs/source/en/guides/upload.md index 9c8eed876b..f8eb0b0d03 100644 --- a/docs/source/en/guides/upload.md +++ b/docs/source/en/guides/upload.md @@ -431,6 +431,50 @@ In addition to [`upload_file`] and [`upload_folder`], the following functions al For more detailed information, take a look at the [`HfApi`] reference. +### Preupload LFS files before commit + +In some cases, you might want to upload huge files to S3 **before** making the commit call. For example, if you are +committing a dataset in several shards that are generated in-memory, you would need to upload the shards one by one +to avoid an out-of-memory issue. A solution is to upload each shard as a separate commit on the repo. While being +perfectly valid, this solution has the drawback of potentially messing up the git history by generating tens of commits. +To overcome this issue, you can upload your files one by one to S3 and then create a single commit at the end. This +is possible using [`preupload_lfs_files`] in combination with [`create_commit`]. + + + +This is a power-user method. Directly using [`upload_file`], [`upload_folder`] or [`create_commit`] instead of handling +the low-level logic of pre-uploading files is the way to go in the vast majority of cases. The main caveat of +[`preupload_lfs_files`] is that until the commit is actually made, the uploaded files are not accessible on the repo on +the Hub. If you have a question, feel free to ping us on our Discord or in a GitHub issue. + + + +Here is a simple example illustrating how to pre-upload files: + +```py +>>> from huggingface_hub import CommitOperationAdd, preupload_lfs_files, create_commit, create_repo + +>>> repo_id = create_repo("test_preupload").repo_id + +>>> operations = [] # List of all `CommitOperationAdd` objects that will be generated +>>> for i in range(5): +... content = ... # generate binary content +... 
addition = CommitOperationAdd(path_in_repo=f"shard_{i}_of_5.bin", path_or_fileobj=content) +... preupload_lfs_files(repo_id, additions=[addition]) +... operations.append(addition) + +>>> # Create commit +>>> create_commit(repo_id, operations=operations, commit_message="Commit all shards") +``` + +First, we create the [`CommitOperationAdd`] objects one by one. In a real-world example, those would contain the +generated shards. Each file is uploaded before generating the next one. During the [`preupload_lfs_files`] step, **the +`CommitOperationAdd` object is mutated**. You should only use it to pass it directly to [`create_commit`]. The main +update of the object is that **the binary content is removed** from it, meaning that it will be garbage-collected if +you don't store another reference to it. This is expected as we don't want to keep in memory the content that is +already uploaded. Finally we create the commit by passing all the operations to [`create_commit`]. You can pass +additional operations (add, delete or copy) that have not been processed yet and they will be handled correctly. + ## Tips and tricks for large uploads There are some limitations to be aware of when dealing with a large amount of data in your repo. 
Given the time it takes to stream the data, diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py index bf137fcf13..7a476dad9b 100644 --- a/src/huggingface_hub/__init__.py +++ b/src/huggingface_hub/__init__.py @@ -194,6 +194,7 @@ "model_info", "move_repo", "pause_space", + "preupload_lfs_files", "rename_discussion", "repo_exists", "repo_info", @@ -512,6 +513,7 @@ def __dir__(): model_info, # noqa: F401 move_repo, # noqa: F401 pause_space, # noqa: F401 + preupload_lfs_files, # noqa: F401 rename_discussion, # noqa: F401 repo_exists, # noqa: F401 repo_info, # noqa: F401 diff --git a/src/huggingface_hub/_commit_api.py b/src/huggingface_hub/_commit_api.py index ae69085b5a..8ee9d551cc 100644 --- a/src/huggingface_hub/_commit_api.py +++ b/src/huggingface_hub/_commit_api.py @@ -136,10 +136,23 @@ class CommitOperationAdd: path_or_fileobj: Union[str, Path, bytes, BinaryIO] upload_info: UploadInfo = field(init=False, repr=False) + # Internal attributes + _upload_mode: Optional[UploadMode] = None # set to "lfs" or "regular" once known + _is_uploaded: bool = False # set to True once the file has been upload as LFS + _is_committed: bool = False # set to True once the file has been committed + def __post_init__(self) -> None: """Validates `path_or_fileobj` and compute `upload_info`.""" self.path_in_repo = _validate_path_in_repo(self.path_in_repo) + # Validate `_is_uploaded` and `_upload_mode` cannot be set by user + if self._is_uploaded is not False: + raise ValueError("Attribute `_is_uploaded` cannot be set manually.") + if self._upload_mode is not None: + raise ValueError("Attribute `_upload_mode` cannot be set manually.") + if self._is_committed is not False: + raise ValueError("Attribute `_is_committed` cannot be set manually.") + # Validate `path_or_fileobj` value if isinstance(self.path_or_fileobj, Path): self.path_or_fileobj = str(self.path_or_fileobj) @@ -250,7 +263,7 @@ def _validate_path_in_repo(path_in_repo: str) -> str: CommitOperation = 
Union[CommitOperationAdd, CommitOperationCopy, CommitOperationDelete] -def warn_on_overwriting_operations(operations: List[CommitOperation]) -> None: +def _warn_on_overwriting_operations(operations: List[CommitOperation]) -> None: """ Warn user when a list of operations is expected to overwrite itself in a single commit. @@ -297,7 +310,7 @@ def warn_on_overwriting_operations(operations: List[CommitOperation]) -> None: @validate_hf_hub_args -def upload_lfs_files( +def _upload_lfs_files( *, additions: List[CommitOperationAdd], repo_type: str, @@ -419,7 +432,7 @@ def _validate_preupload_info(preupload_info: dict): @validate_hf_hub_args -def fetch_upload_modes( +def _fetch_upload_modes( additions: Iterable[CommitOperationAdd], repo_type: str, repo_id: str, @@ -427,10 +440,10 @@ def fetch_upload_modes( revision: str, endpoint: Optional[str] = None, create_pr: bool = False, -) -> Dict[str, UploadMode]: +) -> None: """ - Requests the Hub "preupload" endpoint to determine whether each input file - should be uploaded as a regular git blob or as git LFS blob. + Requests the Hub "preupload" endpoint to determine whether each input file should be uploaded as a regular git blob + or as git LFS blob. Input `additions` are mutated in-place with the upload mode. Args: additions (`Iterable` of :class:`CommitOperationAdd`): @@ -446,9 +459,6 @@ def fetch_upload_modes( revision (`str`): The git revision to upload the files to. Can be any valid git revision. - Returns: `Dict[str, UploadMode]` - Key is the file path, value is the upload mode ("regular" or "lfs"). - Raises: [`~utils.HfHubHTTPError`] If the Hub API returned an error. 
@@ -483,18 +493,19 @@ def fetch_upload_modes( preupload_info = _validate_preupload_info(resp.json()) upload_modes.update(**{file["path"]: file["uploadMode"] for file in preupload_info["files"]}) + # Set upload mode for each addition operation + for addition in additions: + addition._upload_mode = upload_modes[addition.path_in_repo] + # Empty files cannot be uploaded as LFS (S3 would fail with a 501 Not Implemented) # => empty files are uploaded as "regular" to still allow users to commit them. for addition in additions: if addition.upload_info.size == 0: - path = addition.path_in_repo - upload_modes[path] = "regular" - - return upload_modes + addition._upload_mode = "regular" @validate_hf_hub_args -def fetch_lfs_files_to_copy( +def _fetch_lfs_files_to_copy( copies: Iterable[CommitOperationCopy], repo_type: str, repo_id: str, @@ -555,9 +566,8 @@ def fetch_lfs_files_to_copy( return files_to_copy -def prepare_commit_payload( +def _prepare_commit_payload( operations: Iterable[CommitOperation], - upload_modes: Dict[str, UploadMode], files_to_copy: Dict[Tuple[str, Optional[str]], "RepoFile"], commit_message: str, commit_description: Optional[str] = None, @@ -584,7 +594,7 @@ def prepare_commit_payload( # 2. Send operations, one per line for operation in operations: # 2.a. Case adding a regular file - if isinstance(operation, CommitOperationAdd) and upload_modes.get(operation.path_in_repo) == "regular": + if isinstance(operation, CommitOperationAdd) and operation._upload_mode == "regular": yield { "key": "file", "value": { @@ -594,7 +604,7 @@ def prepare_commit_payload( }, } # 2.b. Case adding an LFS file - elif isinstance(operation, CommitOperationAdd) and upload_modes.get(operation.path_in_repo) == "lfs": + elif isinstance(operation, CommitOperationAdd) and operation._upload_mode == "lfs": yield { "key": "lfsFile", "value": { @@ -627,5 +637,5 @@ def prepare_commit_payload( else: raise ValueError( f"Unknown operation to commit. Operation: {operation}. 
Upload mode:" - f" {upload_modes.get(operation.path_in_repo)}" + f" {getattr(operation, '_upload_mode', None)}" ) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 5acaff3d17..4ff144b6b1 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -63,11 +63,11 @@ CommitOperationAdd, CommitOperationCopy, CommitOperationDelete, - fetch_lfs_files_to_copy, - fetch_upload_modes, - prepare_commit_payload, - upload_lfs_files, - warn_on_overwriting_operations, + _fetch_lfs_files_to_copy, + _fetch_upload_modes, + _prepare_commit_payload, + _upload_lfs_files, + _warn_on_overwriting_operations, ) from ._multi_commits import ( MULTI_COMMIT_PR_CLOSE_COMMENT_FAILURE_BAD_REQUEST_TEMPLATE, @@ -137,6 +137,10 @@ USERNAME_PLACEHOLDER = "hf_user" _REGEX_DISCUSSION_URL = re.compile(r".*/discussions/(\d+)$") +_CREATE_COMMIT_NO_REPO_ERROR_MESSAGE = ( + "\nNote: Creating a commit assumes that the repo already exists on the" + " Huggingface Hub. Please use `create_repo` if it's not the case." +) logger = logging.get_logger(__name__) @@ -2895,6 +2899,28 @@ def create_commit( """ Creates a commit in the given repo, deleting & uploading files as needed. + + + The input list of `CommitOperation` will be mutated during the commit process. Do not reuse the same objects + for multiple commits. + + + + + + `create_commit` assumes that the repo already exists on the Hub. If you get a + Client error 404, please make sure you are authenticated and that `repo_id` and + `repo_type` are set correctly. If repo does not exist, create it first using + [`~hf_api.create_repo`]. + + + + + + `create_commit` is limited to 25k LFS files and a 1GB payload for regular files. 
+ + + Args: repo_id (`str`): The repository in which the commit will be created, for example: @@ -2907,6 +2933,9 @@ def create_commit( - [`~hf_api.CommitOperationDelete`] to delete a file - [`~hf_api.CommitOperationCopy`] to copy a file + Operation objects will be mutated to include information relative to the upload. Do not reuse the + same objects for multiple commits. + commit_message (`str`): The summary (first line) of the commit that will be created. @@ -2966,27 +2995,7 @@ def create_commit( [`~utils.RepositoryNotFoundError`]: If repository is not found (error 404): wrong repo_id/repo_type, private but not authenticated or repo does not exist. - - - - `create_commit` assumes that the repo already exists on the Hub. If you get a - Client error 404, please make sure you are authenticated and that `repo_id` and - `repo_type` are set correctly. If repo does not exist, create it first using - [`~hf_api.create_repo`]. - - - - - - `create_commit` is limited to 25k LFS files and a 1GB payload for regular files. - - """ - _CREATE_COMMIT_NO_REPO_ERROR_MESSAGE = ( - "\nNote: Creating a commit assumes that the repo already exists on the" - " Huggingface Hub. Please use `create_repo` if it's not the case." - ) - if parent_commit is not None and not REGEX_COMMIT_OID.fullmatch(parent_commit): raise ValueError( f"`parent_commit` is not a valid commit OID. It must match the following regex: {REGEX_COMMIT_OID}" @@ -3009,46 +3018,41 @@ def create_commit( nb_copies = len(copies) nb_deletions = len(operations) - nb_additions - nb_copies + for addition in additions: + if addition._is_committed: + raise ValueError( + f"CommitOperationAdd {addition} has already being committed and cannot be reused. Please create a" + " new CommitOperationAdd object if you want to create a new commit." + ) + logger.debug( f"About to commit to the hub: {len(additions)} addition(s), {len(copies)} copie(s) and" f" {nb_deletions} deletion(s)." 
) # If updating twice the same file or update then delete a file in a single commit - warn_on_overwriting_operations(operations) + _warn_on_overwriting_operations(operations) - try: - upload_modes = fetch_upload_modes( - additions=additions, - repo_type=repo_type, - repo_id=repo_id, - token=token or self.token, - revision=revision, - endpoint=self.endpoint, - create_pr=create_pr, - ) - except RepositoryNotFoundError as e: - e.append_to_message(_CREATE_COMMIT_NO_REPO_ERROR_MESSAGE) - raise - files_to_copy = fetch_lfs_files_to_copy( - copies=copies, - repo_type=repo_type, + self.preupload_lfs_files( repo_id=repo_id, - token=token or self.token, + additions=additions, + token=token, + repo_type=repo_type, revision=revision, - endpoint=self.endpoint, + create_pr=create_pr, + num_threads=num_threads, + free_memory=False, # do not remove `CommitOperationAdd.path_or_fileobj` on LFS files for "normal" users ) - upload_lfs_files( - additions=[addition for addition in additions if upload_modes[addition.path_in_repo] == "lfs"], + files_to_copy = _fetch_lfs_files_to_copy( + copies=copies, repo_type=repo_type, repo_id=repo_id, token=token or self.token, + revision=revision, endpoint=self.endpoint, - num_threads=num_threads, ) - commit_payload = prepare_commit_payload( + commit_payload = _prepare_commit_payload( operations=operations, - upload_modes=upload_modes, files_to_copy=files_to_copy, commit_message=commit_message, commit_description=commit_description, @@ -3083,6 +3087,10 @@ def _payload_as_ndjson() -> Iterable[bytes]: ) raise + # Mark additions as committed (cannot be reused in another commit) + for addition in additions: + addition._is_committed = True + commit_data = commit_resp.json() return CommitInfo( commit_url=commit_data["commitUrl"], @@ -3124,6 +3132,14 @@ def create_commits_on_pr( + + + `create_commits_on_pr` assumes that the repo already exists on the Hub. 
If you get a Client error 404, please + make sure you are authenticated and that `repo_id` and `repo_type` are set correctly. If repo does not exist, + create it first using [`~hf_api.create_repo`]. + + + Args: repo_id (`str`): The repository in which the commits will be pushed. Example: `"username/my-cool-model"`. @@ -3187,14 +3203,6 @@ def create_commits_on_pr( [`MultiCommitException`]: If an unexpected issue occur in the process: empty commits, unexpected commits in a PR, unexpected PR description, etc. - - - - `create_commits_on_pr` assumes that the repo already exists on the Hub. If you get a Client error 404, please - make sure you are authenticated and that `repo_id` and `repo_type` are set correctly. If repo does not exist, - create it first using [`~hf_api.create_repo`]. - - """ logger = logging.get_logger(__name__ + ".create_commits_on_pr") if verbose: @@ -3392,6 +3400,122 @@ def create_commits_on_pr( return pr.url + def preupload_lfs_files( + self, + repo_id: str, + additions: Iterable[CommitOperationAdd], + *, + token: Optional[str] = None, + repo_type: Optional[str] = None, + revision: Optional[str] = None, + create_pr: Optional[bool] = None, + num_threads: int = 5, + free_memory: bool = True, + ): + """Pre-upload LFS files to S3 in preparation for a future commit. + + This method is useful if you are generating the files to upload on-the-fly and you don't want to store them + in memory before uploading them all at once. + + + + This is a power-user method. You shouldn't need to call it directly to make a normal commit. + Use [`create_commit`] directly instead. + + + + + + Commit operations will be mutated during the process. In particular, the attached `path_or_fileobj` will be + removed after the upload to save memory (and replaced by an empty `bytes` object). Do not reuse the same + objects except to pass them to [`create_commit`]. If you don't want to remove the attached content from the + commit operation object, pass `free_memory=False`. 
+ + + + Args: + repo_id (`str`): + The repository in which you will commit the files, for example: `"username/custom_transformers"`. + + additions (`Iterable` of [`CommitOperationAdd`]): + The list of files to upload. Warning: the objects in this list will be mutated to include information + relative to the upload. Do not reuse the same objects for multiple commits. + + token (`str`, *optional*): + Authentication token. Will default to the stored token. + + repo_type (`str`, *optional*): + The type of repository to upload to (e.g. `"model"` -default-, `"dataset"` or `"space"`). + + revision (`str`, *optional*): + The git revision to commit from. Defaults to the head of the `"main"` branch. + + create_pr (`boolean`, *optional*): + Whether or not you plan to create a Pull Request with that commit. Defaults to `False`. + + num_threads (`int`, *optional*): + Number of concurrent threads for uploading files. Defaults to 5. + Setting it to 2 means at most 2 files will be uploaded concurrently. + + Example: + ```py + >>> from huggingface_hub import CommitOperationAdd, preupload_lfs_files, create_commit, create_repo + + >>> repo_id = create_repo("test_preupload").repo_id + + # Generate and preupload LFS files one by one + >>> operations = [] # List of all `CommitOperationAdd` objects that will be generated + >>> for i in range(5): + ... content = ... # generate binary content + ... addition = CommitOperationAdd(path_in_repo=f"shard_{i}_of_5.bin", path_or_fileobj=content) + ... preupload_lfs_files(repo_id, additions=[addition]) # upload + free memory + ... 
operations.append(addition) + + # Create commit + >>> create_commit(repo_id, operations=operations, commit_message="Commit all shards") + ``` + """ + repo_type = repo_type if repo_type is not None else REPO_TYPE_MODEL + if repo_type not in REPO_TYPES: + raise ValueError(f"Invalid repo type, must be one of {REPO_TYPES}") + revision = quote(revision, safe="") if revision is not None else DEFAULT_REVISION + create_pr = create_pr if create_pr is not None else False + + # Filter out already uploaded files + new_additions = [addition for addition in additions if not addition._is_uploaded] + + # Check which new files are LFS + try: + _fetch_upload_modes( + additions=new_additions, + repo_type=repo_type, + repo_id=repo_id, + token=token or self.token, + revision=revision, + endpoint=self.endpoint, + create_pr=create_pr or False, + ) + except RepositoryNotFoundError as e: + e.append_to_message(_CREATE_COMMIT_NO_REPO_ERROR_MESSAGE) + raise + + # Filter out regular files + new_lfs_additions = [addition for addition in new_additions if addition._upload_mode == "lfs"] + + # Upload new LFS files + _upload_lfs_files( + additions=new_lfs_additions, + repo_type=repo_type, + repo_id=repo_id, + token=token or self.token, + endpoint=self.endpoint, + num_threads=num_threads, + ) + for addition in new_lfs_additions: + addition._is_uploaded = True + if free_memory: + addition.path_or_fileobj = b"" + @overload def upload_file( # type: ignore self, @@ -6339,6 +6463,7 @@ def _parse_revision_from_pr_url(pr_url: str) -> str: delete_file = api.delete_file delete_folder = api.delete_folder create_commits_on_pr = api.create_commits_on_pr +preupload_lfs_files = api.preupload_lfs_files create_branch = api.create_branch delete_branch = api.delete_branch create_tag = api.create_tag diff --git a/tests/test_commit_api.py b/tests/test_commit_api.py index 404474620c..037c70e51f 100644 --- a/tests/test_commit_api.py +++ b/tests/test_commit_api.py @@ -3,7 +3,7 @@ from huggingface_hub._commit_api import ( 
CommitOperationAdd, CommitOperationDelete, - warn_on_overwriting_operations, + _warn_on_overwriting_operations, ) @@ -104,7 +104,7 @@ class TestWarnOnOverwritingOperations(unittest.TestCase): delete_folder_e = CommitOperationDelete(path_in_repo="e/") def test_no_overwrite(self) -> None: - warn_on_overwriting_operations( + _warn_on_overwriting_operations( [ self.add_file_ab, self.add_file_abc, @@ -115,21 +115,21 @@ def test_no_overwrite(self) -> None: def test_add_then_update_file(self) -> None: with self.assertWarns(UserWarning): - warn_on_overwriting_operations([self.add_file_abc, self.update_file_abc]) + _warn_on_overwriting_operations([self.add_file_abc, self.update_file_abc]) def test_add_then_delete_file(self) -> None: with self.assertWarns(UserWarning): - warn_on_overwriting_operations([self.add_file_abc, self.delete_file_abc]) + _warn_on_overwriting_operations([self.add_file_abc, self.delete_file_abc]) def test_add_then_delete_folder(self) -> None: with self.assertWarns(UserWarning): - warn_on_overwriting_operations([self.add_file_abc, self.delete_folder_a]) + _warn_on_overwriting_operations([self.add_file_abc, self.delete_folder_a]) with self.assertWarns(UserWarning): - warn_on_overwriting_operations([self.add_file_ab, self.delete_folder_a]) + _warn_on_overwriting_operations([self.add_file_ab, self.delete_folder_a]) def test_delete_file_then_add(self) -> None: - warn_on_overwriting_operations([self.delete_file_abc, self.add_file_abc]) + _warn_on_overwriting_operations([self.delete_file_abc, self.add_file_abc]) def test_delete_folder_then_add(self) -> None: - warn_on_overwriting_operations([self.delete_folder_a, self.add_file_ab, self.add_file_abc]) + _warn_on_overwriting_operations([self.delete_folder_a, self.add_file_ab, self.add_file_abc]) diff --git a/tests/test_hf_api.py b/tests/test_hf_api.py index ca56eee240..1dfed8b6c9 100644 --- a/tests/test_hf_api.py +++ b/tests/test_hf_api.py @@ -39,7 +39,7 @@ CommitOperationAdd, CommitOperationCopy, 
CommitOperationDelete, - fetch_upload_modes, + _fetch_upload_modes, ) from huggingface_hub.community import DiscussionComment, DiscussionWithDetails from huggingface_hub.constants import ( @@ -907,10 +907,11 @@ def test_create_commit_huge_regular_files(self): self._api.delete_repo(repo_id=REPO_NAME) @retry_endpoint - def test_commit_preflight_on_lots_of_lfs_files(self): + @use_tmp_repo() + def test_commit_preflight_on_lots_of_lfs_files(self, repo_url: RepoUrl): """Test committing 1300 LFS files at once. - This was not possible when `fetch_upload_modes` was not fetching metadata by + This was not possible when `_fetch_upload_modes` was not fetching metadata by chunks. We are not testing the full upload as it would require to upload 1300 files which is unnecessary for the test. Having an overall large payload (for `/create-commit` endpoint) is tested in `test_create_commit_huge_regular_files`. @@ -919,34 +920,27 @@ def test_commit_preflight_on_lots_of_lfs_files(self): See https://github.com/huggingface/huggingface_hub/pull/1117. 
""" - REPO_NAME = repo_name("commit_preflight_lots_of_lfs_files") - self._api.create_repo(repo_id=REPO_NAME, exist_ok=False) - try: - operations = [] - for num in range(1300): - operations.append( - CommitOperationAdd( - path_in_repo=f"file-{num}.bin", # considered as LFS - path_or_fileobj=b"Hello LFS" + b"a" * 2048, # big enough sample - ) - ) - - # Test `fetch_upload_modes` preflight ("are they regular or LFS files?") - res = fetch_upload_modes( - additions=operations, - repo_type="model", - repo_id=f"{USER}/{REPO_NAME}", - token=TOKEN, - revision="main", - endpoint=ENDPOINT_STAGING, + operations = [ + CommitOperationAdd( + path_in_repo=f"file-{num}.bin", # considered as LFS + path_or_fileobj=b"Hello LFS" + b"a" * 2048, # big enough sample ) - self.assertEqual(len(res), 1300) - for _, mode in res.items(): - self.assertEqual(mode, "lfs") - except Exception as err: - self.fail(err) - finally: - self._api.delete_repo(repo_id=REPO_NAME) + for num in range(1300) + ] + + # Test `_fetch_upload_modes` preflight ("are they regular or LFS files?") + _fetch_upload_modes( + additions=operations, + repo_type="model", + repo_id=repo_url.repo_id, + token=TOKEN, + revision="main", + endpoint=ENDPOINT_STAGING, + ) + for operation in operations: + self.assertEqual(operation._upload_mode, "lfs") + self.assertFalse(operation._is_committed) + self.assertFalse(operation._is_uploaded) @retry_endpoint def test_create_commit_repo_id_case_insensitive(self): @@ -1025,6 +1019,58 @@ def test_commit_copy_file(self, repo_url: RepoUrl) -> None: repo_file1, repo_file2 = self._api.list_files_info(repo_id=repo_id, paths=["lfs.bin", "lfs Copy.bin"]) self.assertEqual(repo_file1.lfs["sha256"], repo_file2.lfs["sha256"]) + @retry_endpoint + @use_tmp_repo() + def test_create_commit_mutates_operations(self, repo_url: RepoUrl) -> None: + repo_id = repo_url.repo_id + + operations = [ + CommitOperationAdd(path_in_repo="lfs.bin", path_or_fileobj=b"content"), + CommitOperationAdd(path_in_repo="file.txt", 
path_or_fileobj=b"content"), + ] + self._api.create_commit( + repo_id=repo_id, + commit_message="Copy LFS file.", + operations=operations, + ) + + self.assertTrue(operations[0]._is_committed) + self.assertTrue(operations[0]._is_uploaded) # LFS file + self.assertEqual(operations[0].path_or_fileobj, b"content") # not removed by default + self.assertTrue(operations[1]._is_committed) + self.assertEqual(operations[1].path_or_fileobj, b"content") + + @retry_endpoint + @use_tmp_repo() + def test_pre_upload_before_commit(self, repo_url: RepoUrl) -> None: + repo_id = repo_url.repo_id + + operations = [ + CommitOperationAdd(path_in_repo="lfs.bin", path_or_fileobj=b"content1"), + CommitOperationAdd(path_in_repo="file.txt", path_or_fileobj=b"content"), + CommitOperationAdd(path_in_repo="lfs2.bin", path_or_fileobj=b"content2"), + CommitOperationAdd(path_in_repo="file.txt", path_or_fileobj=b"content"), + ] + + # First: preupload 1 by 1 + for operation in operations: + self._api.preupload_lfs_files(repo_id, [operation]) + self.assertTrue(operations[0]._is_uploaded) + self.assertEqual(operations[0].path_or_fileobj, b"") # Freed memory + self.assertTrue(operations[2]._is_uploaded) + self.assertEqual(operations[2].path_or_fileobj, b"") # Freed memory + + # create commit and capture debug logs + with self.assertLogs("huggingface_hub", level="DEBUG") as debug_logs: + self._api.create_commit( + repo_id=repo_id, + commit_message="Copy LFS file.", + operations=operations, + ) + + # No LFS files uploaded during commit + self.assertTrue(any("No LFS files to upload." in log for log in debug_logs.output)) + class HfApiUploadEmptyFileTest(HfApiCommonTest): @classmethod