|
12 | 12 | from datasets.utils.file_utils import ( |
13 | 13 | OfflineModeIsEnabled, |
14 | 14 | _get_extraction_protocol, |
| 15 | + _prepare_single_hop_path_and_storage_options, |
15 | 16 | cached_path, |
16 | 17 | fsspec_get, |
17 | 18 | fsspec_head, |
|
47 | 48 |
|
48 | 49 | FILE_PATH = "file" |
49 | 50 |
|
50 | | -TEST_URL = "https://huggingface.co/datasets/hf-internal-testing/dataset_with_script/raw/main/some_text.txt" |
| 51 | +TEST_URL = "https://huggingface.co/datasets/hf-internal-testing/dataset_with_script/resolve/main/some_text.txt" |
51 | 52 | TEST_URL_CONTENT = "foo\nbar\nfoobar" |
52 | 53 |
|
53 | 54 | TEST_GG_DRIVE_FILENAME = "train.tsv" |
@@ -90,7 +91,6 @@ def test_cached_path_protocols(protocol, monkeypatch, tmp_path): |
90 | 91 | urls = {"hf": "hf://datasets/org-name/ds-name@main/filename.ext", "s3": "s3://bucket-name/filename.ext"} |
91 | 92 | url = urls[protocol] |
92 | 93 | _ = cached_path(url, download_config=download_config) |
93 | | - assert True |
94 | 94 | for mock in [mock_fsspec_head, mock_fsspec_get]: |
95 | 95 | assert mock.called |
96 | 96 | assert mock.call_count == 1 |
@@ -197,6 +197,75 @@ def test_fsspec_offline(tmp_path_factory): |
197 | 197 | fsspec_head("s3://huggingface.co") |
198 | 198 |
|
199 | 199 |
|
# URLs exercised by the single-hop preparation cases below.
_HUB_RESOLVE_URL = "https://huggingface.co/datasets/hf-internal-testing/dataset_with_script/resolve/main/some_text.txt"
_HUB_FSSPEC_PATH = "hf://datasets/hf-internal-testing/dataset_with_script@main/some_text.txt"
_PLAIN_HTTPS_URL = "https://domain.org/data.txt"
_GITHUB_RAW_URL = "https://raw.githubusercontent.com/data.txt"

# Each case is (urlpath, download_config, expected_urlpath, expected_storage_options).
_SINGLE_HOP_CASES = [
    # Hub "resolve" URLs are expected to come back as hf:// paths, with the
    # endpoint and token placed under the "hf" protocol key.
    (
        _HUB_RESOLVE_URL,
        DownloadConfig(),
        _HUB_FSSPEC_PATH,
        {"hf": {"endpoint": "https://huggingface.co", "token": None}},
    ),
    (
        _HUB_RESOLVE_URL,
        DownloadConfig(token="MY-TOKEN"),
        _HUB_FSSPEC_PATH,
        {"hf": {"endpoint": "https://huggingface.co", "token": "MY-TOKEN"}},
    ),
    # Caller-supplied "hf" options are expected to be kept next to the defaults.
    (
        _HUB_RESOLVE_URL,
        DownloadConfig(token="MY-TOKEN", storage_options={"hf": {"on_error": "omit"}}),
        _HUB_FSSPEC_PATH,
        {"hf": {"endpoint": "https://huggingface.co", "token": "MY-TOKEN", "on_error": "omit"}},
    ),
    # Plain HTTPS URLs keep their path; client_kwargs.trust_env is expected to
    # default to True.
    (
        _PLAIN_HTTPS_URL,
        DownloadConfig(),
        _PLAIN_HTTPS_URL,
        {"https": {"client_kwargs": {"trust_env": True}}},
    ),
    (
        _PLAIN_HTTPS_URL,
        DownloadConfig(storage_options={"https": {"block_size": "omit"}}),
        _PLAIN_HTTPS_URL,
        {"https": {"client_kwargs": {"trust_env": True}, "block_size": "omit"}},
    ),
    # Caller-supplied client_kwargs are expected to be merged with the default ...
    (
        _PLAIN_HTTPS_URL,
        DownloadConfig(storage_options={"https": {"client_kwargs": {"raise_for_status": True}}}),
        _PLAIN_HTTPS_URL,
        {"https": {"client_kwargs": {"trust_env": True, "raise_for_status": True}}},
    ),
    # ... and an explicit trust_env from the caller is expected to win over it.
    (
        _PLAIN_HTTPS_URL,
        DownloadConfig(storage_options={"https": {"client_kwargs": {"trust_env": False}}}),
        _PLAIN_HTTPS_URL,
        {"https": {"client_kwargs": {"trust_env": False}}},
    ),
    # For raw.githubusercontent.com an "Accept-Encoding: identity" header is
    # expected in addition to the caller's own headers.
    (
        _GITHUB_RAW_URL,
        DownloadConfig(storage_options={"https": {"headers": {"x-test": "true"}}}),
        _GITHUB_RAW_URL,
        {
            "https": {
                "client_kwargs": {"trust_env": True},
                "headers": {"x-test": "true", "Accept-Encoding": "identity"},
            }
        },
    ),
]


@pytest.mark.parametrize(
    "urlpath, download_config, expected_urlpath, expected_storage_options",
    _SINGLE_HOP_CASES,
)
def test_prepare_single_hop_path_and_storage_options(
    urlpath, download_config, expected_urlpath, expected_storage_options
):
    """Check the path and storage options returned for a single-hop URL.

    Calls ``_prepare_single_hop_path_and_storage_options`` with ``urlpath`` and
    ``download_config``, compares both returned values with the expected ones,
    and finally verifies that the string form of
    ``download_config.storage_options`` is the same before and after the call.
    """
    # Snapshot taken before the call so that in-place edits can be detected.
    options_before_call = str(download_config.storage_options)

    prepared = _prepare_single_hop_path_and_storage_options(urlpath, download_config)
    actual_urlpath, actual_storage_options = prepared

    assert actual_urlpath == expected_urlpath
    assert actual_storage_options == expected_storage_options
    # The caller's DownloadConfig.storage_options must be left untouched.
    assert str(download_config.storage_options) == options_before_call


| 267 | + |
| 268 | + |
200 | 269 | class DummyTestFS(AbstractFileSystem): |
201 | 270 | protocol = "mock" |
202 | 271 | _file_class = AbstractBufferedFile |
|
0 commit comments