Skip to content
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/datasets/utils/streaming_download_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,11 @@ def _extract(self, urlpath: str) -> str:
return f"{protocol}://::{urlpath}"

def _get_extraction_protocol(self, urlpath: str) -> Optional[str]:
# get inner file: zip://train-00000.json.gz::https://foo.bar/data.zip -> zip://train-00000.json.gz
path = urlpath.split("::")[0]
# remove query params: https://foo.bar/train.json.gz?dl=1 -> https://foo.bar/train.json.gz
path = path.split("?")[0]
# Get extension: https://foo.bar/train.json.gz -> gz
extension = path.split(".")[-1]
if extension in BASE_KNOWN_EXTENSIONS:
return None
Expand Down