diff --git a/src/datasets/download/streaming_download_manager.py b/src/datasets/download/streaming_download_manager.py
index d702b7ac899..42edd42ce85 100644
--- a/src/datasets/download/streaming_download_manager.py
+++ b/src/datasets/download/streaming_download_manager.py
@@ -692,6 +692,27 @@ def __truediv__(self, p: str) -> "xPath":
         return self.joinpath(p)
 
 
+def xgzip_open(filepath_or_buffer, *args, use_auth_token: Optional[Union[str, bool]] = None, **kwargs):
+    """Open a gzip stream from a file object or from a location handled by `xopen`.
+
+    Args:
+        filepath_or_buffer: Object exposing a `read` attribute, or anything else, which is converted
+            with `str()` and opened in binary mode through `xopen`.
+        *args: Extra positional arguments handed to `gzip.open` unchanged.
+        use_auth_token: Forwarded to `xopen`; ignored when a file object is given.
+        **kwargs: Extra keyword arguments handed to `gzip.open` unchanged.
+
+    Returns:
+        Whatever `gzip.open` returns for the resulting file object.
+    """
+    import gzip
+
+    # Anything that cannot be read from directly is treated as a location for `xopen`.
+    if not hasattr(filepath_or_buffer, "read"):
+        filepath_or_buffer = xopen(str(filepath_or_buffer), "rb", use_auth_token=use_auth_token)
+    return gzip.open(filepath_or_buffer, *args, **kwargs)
+
+
 def xpandas_read_csv(filepath_or_buffer, use_auth_token: Optional[Union[str, bool]] = None, **kwargs):
     import pandas as pd
 
diff --git a/src/datasets/streaming.py b/src/datasets/streaming.py
index 26d7982b53b..cd30f40dcc9 100644
--- a/src/datasets/streaming.py
+++ b/src/datasets/streaming.py
@@ -9,6 +9,7 @@
     xet_parse,
     xgetsize,
     xglob,
+    xgzip_open,
     xisdir,
     xisfile,
     xjoin,
@@ -88,6 +89,7 @@ def wrapper(*args, **kwargs):
     patch_submodule(module, "os.path.getsize", wrap_auth(xgetsize)).start()
     patch_submodule(module, "pathlib.Path", xPath).start()
     # file readers
+    patch_submodule(module, "gzip.open", wrap_auth(xgzip_open)).start()
     patch_submodule(module, "pandas.read_csv", wrap_auth(xpandas_read_csv), attrs=["__version__"]).start()
     patch_submodule(module, "pandas.read_excel", xpandas_read_excel, attrs=["__version__"]).start()
     patch_submodule(module, "scipy.io.loadmat", wrap_auth(xsio_loadmat), attrs=["__version__"]).start()