9 changes: 4 additions & 5 deletions src/datasets/packaged_modules/json/json.py
@@ -77,11 +77,10 @@ def _generate_tables(self, files):
             pa_table = pa.Table.from_pydict(mapping=dataset)
         else:
             try:
-                pa_table = paj.read_json(
-                    file,
-                    read_options=self.config.pa_read_options,
-                    parse_options=self.config.pa_parse_options,
-                )
+                with open(file, "rb") as f:
+                    pa_table = paj.read_json(
+                        f, read_options=self.config.pa_read_options, parse_options=self.config.pa_parse_options
+                    )
             except pa.ArrowInvalid:
                 with open(file, encoding="utf-8") as f:
                     dataset = json.load(f)
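A note on why this hunk works: `pyarrow.json.read_json` accepts a binary file-like object as well as a path, so opening the file explicitly means a patched `open` (in streaming mode, `xopen` backed by fsspec) can hand pyarrow a remote or on-the-fly-decompressed stream. A minimal sketch of that call shape, using an in-memory buffer with an illustrative record:

```python
# Minimal sketch: read_json consumes a binary file-like object exactly as
# it would a path; the sample record below is illustrative only.
import io

import pyarrow.json as paj

buf = io.BytesIO(b'{"col_1": "0", "col_2": 0, "col_3": 0.0}\n')
table = paj.read_json(buf)
assert table.num_rows == 1
```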
2 changes: 1 addition & 1 deletion src/datasets/streaming.py
@@ -78,7 +78,7 @@ def extend_module_for_streaming(module_path, use_auth_token: Optional[Union[str,
     They are replaced by `xopen` and `xjoin` defined to work with the StreamingDownloadManager.
 
     We use fsspec to extend `open` to be able to read remote files.
-    To join paths and naviguate in remote compressed archives, we use the "::" separator.
+    To join paths and navigate in remote compressed archives, we use the "::" separator.
     """
 
     module = importlib.import_module(module_path)
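For context, the `"::"` separator mentioned above is fsspec's URL-chaining syntax: it composes filesystems so that a member of a remote archive can be opened in one call. A hedged sketch (the URL and member path are hypothetical, and HTTP access requires `aiohttp`):

```python
# Hedged sketch of fsspec "::" chaining: open one member of a remote zip
# archive as a stream. The archive URL and member path are hypothetical.
import fsspec

with fsspec.open("zip://data/train.jsonl::https://example.com/archive.zip", "rb") as f:
    first_line = f.readline()
```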
2 changes: 1 addition & 1 deletion src/datasets/utils/streaming_download_manager.py
@@ -67,7 +67,7 @@ def read_with_retries(*args, **kwargs):

 def xopen(file, mode="r", *args, **kwargs):
     """
-    This function extends the builin `open` function to support remote files using fsspec.
+    This function extends the builtin `open` function to support remote files using fsspec.
 
     It also has a retry mechanism in case connection fails.
     The args and kwargs are passed to fsspec.open, except `use_auth_token` which is used for queries to private repos on huggingface.co
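The retry mechanism the docstring mentions is not shown in this hunk; a rough sketch of the general idea follows, where the retry count, delay, and exception type are illustrative assumptions rather than the library's actual values:

```python
# Illustrative retry wrapper, not datasets' actual implementation: re-issue
# a read a few times before giving up on a flaky connection.
import time


def read_with_retries_sketch(read, max_retries=3, delay=1.0):
    for attempt in range(1, max_retries + 1):
        try:
            return read()
        except ConnectionError:
            if attempt == max_retries:
                raise
            time.sleep(delay)
```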
13 changes: 12 additions & 1 deletion tests/conftest.py
@@ -214,7 +214,7 @@ def jsonl_path(tmp_path_factory):
     path = str(tmp_path_factory.mktemp("data") / "dataset.jsonl")
     with open(path, "w") as f:
         for item in DATA:
-            f.write(json.dumps(item))
+            f.write(json.dumps(item) + "\n")
     return path


@@ -255,3 +255,14 @@ def text_gz_path(tmp_path_factory, text_path):
         with gzip.open(path, "wb") as zipped_file:
             zipped_file.writelines(orig_file)
     return path
+
+
+@pytest.fixture(scope="session")
+def jsonl_gz_path(tmp_path_factory, jsonl_path):
+    import gzip
+
+    path = str(tmp_path_factory.mktemp("data") / "dataset.jsonl.gz")
+    with open(jsonl_path, "rb") as orig_file:
+        with gzip.open(path, "wb") as zipped_file:
+            zipped_file.writelines(orig_file)
+    return path
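Together with the `"\n"` fix in `jsonl_path`, this fixture yields a gzip file whose decompressed content is valid JSON Lines: one object per line. A quick way to sanity-check such a file (the file name mirrors the fixture output and is assumed here):

```python
# Illustrative check that a gzipped JSON Lines file parses line by line.
import gzip
import json

with gzip.open("dataset.jsonl.gz", "rt", encoding="utf-8") as f:
    records = [json.loads(line) for line in f]
```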
9 changes: 9 additions & 0 deletions tests/test_load.py
@@ -235,6 +235,15 @@ def test_load_dataset_streaming(dataset_loading_script_dir, data_dir):
     assert isinstance(next(iter(dataset["train"])), dict)
 
 
+@require_streaming
+def test_load_dataset_streaming_gz_json(jsonl_gz_path):
+    data_files = jsonl_gz_path
+    ds = load_dataset("json", split="train", data_files=data_files, streaming=True)
+    assert isinstance(ds, IterableDataset)
+    ds_item = next(iter(ds))
+    assert ds_item == {"col_1": "0", "col_2": 0, "col_3": 0.0}
+
+
 def test_loading_from_the_datasets_hub():
     with tempfile.TemporaryDirectory() as tmp_dir:
         dataset = load_dataset(SAMPLE_DATASET_IDENTIFIER, cache_dir=tmp_dir)
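Outside the test suite, the same code path can be exercised directly. A hedged end-to-end sketch, where the local file name is hypothetical and the expected first record matches the fixture data above:

```python
# Stream a gzipped JSON Lines file and pull the first record without
# materializing the whole dataset; the file name is a placeholder.
from datasets import load_dataset

ds = load_dataset("json", split="train", data_files="dataset.jsonl.gz", streaming=True)
print(next(iter(ds)))  # e.g. {"col_1": "0", "col_2": 0, "col_3": 0.0}
```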