diff --git a/src/datasets/packaged_modules/json/json.py b/src/datasets/packaged_modules/json/json.py
index 55ec9ec1a3e..c0a085b0731 100644
--- a/src/datasets/packaged_modules/json/json.py
+++ b/src/datasets/packaged_modules/json/json.py
@@ -77,11 +77,10 @@ def _generate_tables(self, files):
                 pa_table = pa.Table.from_pydict(mapping=dataset)
             else:
                 try:
-                    pa_table = paj.read_json(
-                        file,
-                        read_options=self.config.pa_read_options,
-                        parse_options=self.config.pa_parse_options,
-                    )
+                    with open(file, "rb") as f:
+                        pa_table = paj.read_json(
+                            f, read_options=self.config.pa_read_options, parse_options=self.config.pa_parse_options
+                        )
                 except pa.ArrowInvalid:
                     with open(file, encoding="utf-8") as f:
                         dataset = json.load(f)
diff --git a/src/datasets/streaming.py b/src/datasets/streaming.py
index 81d39ff248b..3d92b5f8d81 100644
--- a/src/datasets/streaming.py
+++ b/src/datasets/streaming.py
@@ -78,7 +78,7 @@ def extend_module_for_streaming(module_path, use_auth_token: Optional[Union[str,
     They rare replaced by `xopen` and `xjoin` defined to work with the StreamingDownloadManager.
 
     We use fsspec to extend `open` to be able to read remote files.
-    To join paths and naviguate in remote compressed archives, we use the "::" separator.
+    To join paths and navigate in remote compressed archives, we use the "::" separator.
     """
 
     module = importlib.import_module(module_path)
diff --git a/src/datasets/utils/streaming_download_manager.py b/src/datasets/utils/streaming_download_manager.py
index 4499bfd7d69..d7ccb8a385d 100644
--- a/src/datasets/utils/streaming_download_manager.py
+++ b/src/datasets/utils/streaming_download_manager.py
@@ -67,7 +67,7 @@ def read_with_retries(*args, **kwargs):
 
 def xopen(file, mode="r", *args, **kwargs):
     """
-    This function extends the builin `open` function to support remote files using fsspec.
+    This function extends the builtin `open` function to support remote files using fsspec.
     It also has a retry mechanism in case connection fails.
 
     The args and kwargs are passed to fsspec.open, except `use_auth_token` which is used for queries to private repos on huggingface.co
diff --git a/tests/conftest.py b/tests/conftest.py
index 3e416a35d98..cf7c854a75d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -214,7 +214,7 @@ def jsonl_path(tmp_path_factory):
     path = str(tmp_path_factory.mktemp("data") / "dataset.jsonl")
     with open(path, "w") as f:
         for item in DATA:
-            f.write(json.dumps(item))
+            f.write(json.dumps(item) + "\n")
     return path
 
 
@@ -255,3 +255,14 @@ def text_gz_path(tmp_path_factory, text_path):
     with gzip.open(path, "wb") as zipped_file:
         zipped_file.writelines(orig_file)
     return path
+
+
+@pytest.fixture(scope="session")
+def jsonl_gz_path(tmp_path_factory, jsonl_path):
+    import gzip
+
+    path = str(tmp_path_factory.mktemp("data") / "dataset.jsonl.gz")
+    with open(jsonl_path, "rb") as orig_file:
+        with gzip.open(path, "wb") as zipped_file:
+            zipped_file.writelines(orig_file)
+    return path
diff --git a/tests/test_load.py b/tests/test_load.py
index ad3a148f199..69e744f396b 100644
--- a/tests/test_load.py
+++ b/tests/test_load.py
@@ -235,6 +235,15 @@ def test_load_dataset_streaming(dataset_loading_script_dir, data_dir):
     assert isinstance(next(iter(dataset["train"])), dict)
 
 
+@require_streaming
+def test_load_dataset_streaming_gz_json(jsonl_gz_path):
+    data_files = jsonl_gz_path
+    ds = load_dataset("json", split="train", data_files=data_files, streaming=True)
+    assert isinstance(ds, IterableDataset)
+    ds_item = next(iter(ds))
+    assert ds_item == {"col_1": "0", "col_2": 0, "col_3": 0.0}
+
+
 def test_loading_from_the_datasets_hub():
     with tempfile.TemporaryDirectory() as tmp_dir:
         dataset = load_dataset(SAMPLE_DATASET_IDENTIFIER, cache_dir=tmp_dir)
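
Note for reviewers: the key change in json.py is that `paj.read_json` now receives an open file object rather than a path. When a path string is passed, pyarrow opens the file itself, bypassing the patched `open` (`xopen`), so streamed and compressed sources fail; when a file object is passed, fsspec handles remote access and decompression before pyarrow ever sees the bytes. The sketch below (not part of the diff; the file name and row are hypothetical, mirroring the `jsonl_gz_path` fixture) demonstrates the same mechanism with plain fsspec, using its `compression` argument to stand in for the streaming machinery:

```python
import gzip
import json

import fsspec
import pyarrow.json as paj

# Build a small gzipped JSON Lines file, like the jsonl_gz_path fixture does.
# Note the trailing "\n": read_json expects newline-delimited records, which
# is also why the jsonl_path fixture now appends "\n" to each row.
with gzip.open("dataset.jsonl.gz", "wt", encoding="utf-8") as f:
    f.write(json.dumps({"col_1": "0", "col_2": 0, "col_3": 0.0}) + "\n")

# fsspec decompresses on read, so pyarrow receives plain JSON Lines bytes.
# Passing a path string instead would make pyarrow open the raw .gz itself
# and raise pa.ArrowInvalid on the compressed data.
with fsspec.open("dataset.jsonl.gz", "rb", compression="gzip") as f:
    pa_table = paj.read_json(f)

print(pa_table.to_pydict())  # {'col_1': ['0'], 'col_2': [0], 'col_3': [0.0]}
```

The file is opened in `"rb"` mode because `paj.read_json` consumes raw bytes; the local-fallback branch below it still opens with `encoding="utf-8"` since `json.load` needs text.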