-
Notifications
You must be signed in to change notification settings - Fork 3k
Closed
Labels
bug: Something isn't working
Description
The CI test job (unit, ubuntu-latest, deps-minimum) is broken: it raises a `SchemaInferenceError`. See https://github.com/huggingface/datasets/actions/runs/3930901593/jobs/6721492004
FAILED tests/test_beam.py::BeamBuilderTest::test_download_and_prepare_sharded - datasets.arrow_writer.SchemaInferenceError: Please pass `features` or at least one example when writing data
Stack trace:
______________ BeamBuilderTest.test_download_and_prepare_sharded _______________
[gw1] linux -- Python 3.7.15 /opt/hostedtoolcache/Python/3.7.15/x64/bin/python
self = <tests.test_beam.BeamBuilderTest testMethod=test_download_and_prepare_sharded>
@require_beam
def test_download_and_prepare_sharded(self):
import apache_beam as beam
original_write_parquet = beam.io.parquetio.WriteToParquet
expected_num_examples = len(get_test_dummy_examples())
with tempfile.TemporaryDirectory() as tmp_cache_dir:
builder = DummyBeamDataset(cache_dir=tmp_cache_dir, beam_runner="DirectRunner")
with patch("apache_beam.io.parquetio.WriteToParquet") as write_parquet_mock:
write_parquet_mock.side_effect = partial(original_write_parquet, num_shards=2)
> builder.download_and_prepare()
tests/test_beam.py:97:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages/datasets/builder.py:864: in download_and_prepare
**download_and_prepare_kwargs,
/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages/datasets/builder.py:1976: in _download_and_prepare
num_examples, num_bytes = beam_writer.finalize(metrics.query(m_filter))
/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages/datasets/arrow_writer.py:694: in finalize
shard_num_bytes, _ = parquet_to_arrow(source, destination)
/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages/datasets/arrow_writer.py:740: in parquet_to_arrow
num_bytes, num_examples = writer.finalize()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <datasets.arrow_writer.ArrowWriter object at 0x7f6dcbb3e810>
close_stream = True
def finalize(self, close_stream=True):
self.write_rows_on_file()
# In case current_examples < writer_batch_size, but user uses finalize()
if self._check_duplicates:
self.check_duplicate_keys()
# Re-intializing to empty list for next batch
self.hkey_record = []
self.write_examples_on_file()
# If schema is known, infer features even if no examples were written
if self.pa_writer is None and self.schema:
self._build_writer(self.schema)
if self.pa_writer is not None:
self.pa_writer.close()
self.pa_writer = None
if close_stream:
self.stream.close()
else:
if close_stream:
self.stream.close()
> raise SchemaInferenceError("Please pass `features` or at least one example when writing data")
E datasets.arrow_writer.SchemaInferenceError: Please pass `features` or at least one example when writing data
/opt/hostedtoolcache/Python/3.7.15/x64/lib/python3.7/site-packages/datasets/arrow_writer.py:593: SchemaInferenceError
Metadata
Metadata
Assignees
Labels
bug: Something isn't working