Skip to content

Commit d3d41be

Browse files
committed
Add get_source and get_readme functions for demo datasets (#2673)
1 parent 74c67f3 commit d3d41be

File tree

3 files changed

+328
-1
lines changed

3 files changed

+328
-1
lines changed

sdv/datasets/demo.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,3 +338,113 @@ def get_available_demos(modality):
338338
continue
339339

340340
return pd.DataFrame(tables_info)
341+
342+
343+
def _find_text_key(contents, dataset_prefix, filename):
344+
"""Find a text file key (README.txt or SOURCE.txt).
345+
346+
Performs a case-insensitive search for ``filename`` directly under ``dataset_prefix``.
347+
348+
Args:
349+
contents (list[dict]):
350+
List of objects from S3.
351+
dataset_prefix (str):
352+
Prefix like 'single_table/dataset/'.
353+
filename (str):
354+
The filename to look for (e.g., 'README.txt').
355+
356+
Returns:
357+
str or None:
358+
The key if found, otherwise ``None``.
359+
"""
360+
expected_lower = f'{dataset_prefix}{filename}'.lower()
361+
for entry in contents:
362+
key = entry.get('Key') or ''
363+
if key.lower() == expected_lower:
364+
return key
365+
366+
return None
367+
368+
369+
def _get_text_file_content(modality, dataset_name, filename, output_filepath=None):
    """Fetch the decoded contents of a text file stored under a dataset prefix.

    The objects under ``'{modality}/{dataset_name}/'`` are listed and searched for
    ``filename`` (case-insensitively, directly under the prefix). Fetching and saving
    are both best-effort: failures are logged at INFO level, together with the
    traceback, instead of being raised.

    Args:
        modality (str):
            The modality of the dataset: ``'single_table'``, ``'multi_table'``, ``'sequential'``.
            It is checked with ``_validate_modalities`` before anything else happens.
        dataset_name (str):
            The name of the dataset.
        filename (str):
            The filename to fetch (``'README.txt'`` or ``'SOURCE.txt'``).
        output_filepath (str or None):
            If provided, save the file contents at this path. Missing parent
            directories are created.

    Returns:
        str or None:
            The text decoded as UTF-8 (undecodable bytes are replaced), or ``None`` when
            the file is not listed under the prefix or cannot be fetched. A failure to
            save the file does not affect the returned text.
    """
    _validate_modalities(modality)
    dataset_prefix = f'{modality}/{dataset_name}/'
    contents = _list_objects(dataset_prefix)

    key = _find_text_key(contents, dataset_prefix, filename)
    if not key:
        LOGGER.info(f'No {filename} found for dataset {dataset_name}.')
        return None

    try:
        raw = _get_data_from_bucket(key)
    except Exception:
        # Deliberately best-effort: a fetch failure yields ``None`` rather than an
        # error. The traceback is attached so the cause is not lost.
        LOGGER.info(f'Error fetching {filename} for dataset {dataset_name}.', exc_info=True)
        return None

    text = raw.decode('utf-8', errors='replace')
    if output_filepath:
        try:
            parent = os.path.dirname(str(output_filepath))
            if parent:
                os.makedirs(parent, exist_ok=True)

            # ``newline=''`` disables newline translation so the saved file keeps the
            # line endings of the downloaded text on every platform.
            with open(output_filepath, 'w', encoding='utf-8', newline='') as f:
                f.write(text)

        except Exception:
            # Saving is optional: report the problem but still return the text.
            LOGGER.info(f'Error saving {filename} for dataset {dataset_name}.', exc_info=True)

    return text
415+
416+
417+
def get_source(modality, dataset_name, output_filepath=None):
    """Return the source/citation text of a demo dataset.

    Thin wrapper around ``_get_text_file_content`` that requests ``SOURCE.txt``.

    Args:
        modality (str):
            The modality of the dataset: ``'single_table'``, ``'multi_table'``, ``'sequential'``.
        dataset_name (str):
            The name of the dataset whose source information is requested.
        output_filepath (str or None):
            Optional path where the file contents are also saved.

    Returns:
        str or None:
            The contents of the source file if it exists; otherwise ``None``.
    """
    source_filename = 'SOURCE.txt'
    return _get_text_file_content(modality, dataset_name, source_filename, output_filepath)
433+
434+
435+
def get_readme(modality, dataset_name, output_filepath=None):
    """Return the README text of a demo dataset.

    Thin wrapper around ``_get_text_file_content`` that requests ``README.txt``.

    Args:
        modality (str):
            The modality of the dataset: ``'single_table'``, ``'multi_table'``, ``'sequential'``.
        dataset_name (str):
            The name of the dataset whose README is requested.
        output_filepath (str or None):
            Optional path where the file contents are also saved.

    Returns:
        str or None:
            The contents of the README file if it exists; otherwise ``None``.
    """
    readme_filename = 'README.txt'
    return _get_text_file_content(modality, dataset_name, readme_filename, output_filepath)

tests/integration/datasets/test_demo.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import pandas as pd
22

3-
from sdv.datasets.demo import get_available_demos
3+
from sdv.datasets.demo import get_available_demos, get_readme, get_source
44

55

66
def test_get_available_demos_single_table():
@@ -85,3 +85,30 @@ def test_get_available_demos_multi_table():
8585
'num_tables': [2, 2],
8686
})
8787
pd.testing.assert_frame_equal(tables_info[['dataset_name', 'size_MB', 'num_tables']], expected)
88+
89+
90+
def test_get_readme_and_source_single_table_dataset1(tmp_path):
    """Test the README and SOURCE of ``dataset1`` are returned and can be saved to disk."""
    # Setup
    modality, dataset = 'single_table', 'dataset1'

    # Run
    readme = get_readme(modality, dataset)
    source = get_source(modality, dataset)

    # Assert
    assert isinstance(readme, str)
    assert 'sample dataset' in readme.lower()
    assert isinstance(source, str)
    assert source.strip() == 'unknown'

    # Run again, this time also writing the files to disk
    readme_out = tmp_path / 'r.txt'
    source_out = tmp_path / 's.txt'
    saved_readme = get_readme(modality, dataset, str(readme_out))
    saved_source = get_source(modality, dataset, str(source_out))

    # Assert the returned text is unchanged and matches what was written
    assert saved_readme == readme
    assert saved_source == source
    assert readme_out.read_text(encoding='utf-8').strip() == readme.strip()
    assert source_out.read_text(encoding='utf-8').strip() == source.strip()
108+
109+
110+
def test_get_readme_missing_returns_none():
    """Test ``get_readme`` and ``get_source`` return ``None`` when the file is missing."""
    # Run
    readme = get_readme('single_table', 'dataset2')

    # Assert
    assert readme is None

    # Run
    source = get_source('single_table', 'dataset2')

    # Assert
    assert source is None

tests/unit/datasets/test_demo.py

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,15 @@
1212
from sdv.datasets.demo import (
1313
_download,
1414
_find_data_zip_key,
15+
_find_text_key,
1516
_get_data_from_bucket,
1617
_get_first_v1_metadata_bytes,
18+
_get_text_file_content,
1719
_iter_metainfo_yaml_entries,
1820
download_demo,
1921
get_available_demos,
22+
get_readme,
23+
get_source,
2024
)
2125
from sdv.errors import DemoResourceNotFoundError
2226

@@ -579,3 +583,189 @@ def test_download_demo_no_v1_metadata_raises(mock_list, mock_get):
579583
# Run and Assert
580584
with pytest.raises(DemoResourceNotFoundError, match='METADATA_SPEC_VERSION'):
581585
download_demo('single_table', 'word')
586+
587+
588+
def test__find_text_key_returns_none_when_missing():
    """Test ``None`` is returned when no listed key matches the requested file."""
    # Setup
    prefix = 'single_table/dataset/'
    listing = [{'Key': f'{prefix}{name}'} for name in ('metadata.json', 'data.zip')]

    # Run
    found = _find_text_key(listing, prefix, 'README.txt')

    # Assert
    assert found is None
602+
603+
604+
def test__find_text_key_ignores_nested_paths():
    """Test a file inside a sub-folder of the dataset prefix is not matched."""
    # Setup
    prefix = 'single_table/dataset1/'
    listing = [{'Key': 'single_table/dataset1/bad_folder/SOURCE.txt'}]

    # Run
    found = _find_text_key(listing, prefix, 'SOURCE.txt')

    # Assert
    assert found is None
617+
618+
619+
@patch('sdv.datasets.demo._get_data_from_bucket')
@patch('sdv.datasets.demo._list_objects')
def test__get_text_file_content_happy_path(mock_list, mock_get):
    """Test the decoded text is returned when the file is listed and fetched.

    No output path is passed, so nothing is written to disk and no temporary
    directory fixture is needed.
    """
    # Setup
    mock_list.return_value = [
        {'Key': 'single_table/dataset1/README.txt'},
    ]
    mock_get.return_value = 'Hello README'.encode()

    # Run
    text = _get_text_file_content('single_table', 'dataset1', 'README.txt')

    # Assert
    assert text == 'Hello README'
634+
635+
636+
@patch('sdv.datasets.demo._list_objects')
def test__get_text_file_content_missing_key_returns_none(mock_list):
    """Test ``None`` is returned when the requested file is not in the listing."""
    # Setup
    listing = [{'Key': 'single_table/dataset1/metadata.json'}]
    mock_list.return_value = listing

    # Run
    result = _get_text_file_content('single_table', 'dataset1', 'README.txt')

    # Assert
    assert result is None
649+
650+
651+
@patch('sdv.datasets.demo._list_objects')
def test__get_text_file_content_logs_when_missing_key(mock_list, caplog):
    """Test an info message is logged when the file is not under the dataset prefix."""
    # Setup
    listing = [{'Key': 'single_table/dataset1/metadata.json'}]
    mock_list.return_value = listing

    # Run
    caplog.set_level(logging.INFO, logger='sdv.datasets.demo')
    result = _get_text_file_content('single_table', 'dataset1', 'README.txt')

    # Assert
    assert result is None
    expected_message = 'No README.txt found for dataset dataset1.'
    assert expected_message in caplog.text
666+
667+
668+
@patch('sdv.datasets.demo._get_data_from_bucket')
@patch('sdv.datasets.demo._list_objects')
def test__get_text_file_content_fetch_error_returns_none(mock_list, mock_get):
    """Test ``None`` is returned when fetching the listed file raises an error."""
    # Setup
    listing = [{'Key': 'single_table/dataset1/SOURCE.txt'}]
    mock_list.return_value = listing
    mock_get.side_effect = Exception('boom')

    # Run
    result = _get_text_file_content('single_table', 'dataset1', 'SOURCE.txt')

    # Assert
    assert result is None
683+
684+
685+
@patch('sdv.datasets.demo._get_data_from_bucket')
@patch('sdv.datasets.demo._list_objects')
def test__get_text_file_content_logs_on_fetch_error(mock_list, mock_get, caplog):
    """Test an info message is logged when fetching the listed file raises an error."""
    # Setup
    listing = [{'Key': 'single_table/dataset1/SOURCE.txt'}]
    mock_list.return_value = listing
    mock_get.side_effect = Exception('boom')

    # Run
    caplog.set_level(logging.INFO, logger='sdv.datasets.demo')
    result = _get_text_file_content('single_table', 'dataset1', 'SOURCE.txt')

    # Assert
    assert result is None
    expected_message = 'Error fetching SOURCE.txt for dataset dataset1.'
    assert expected_message in caplog.text
702+
703+
704+
@patch('sdv.datasets.demo._get_data_from_bucket')
@patch('sdv.datasets.demo._list_objects')
def test__get_text_file_content_writes_file_when_output_filepath_given(
    mock_list, mock_get, tmp_path
):
    """Test the text is returned and written to disk, creating the parent folder."""
    # Setup
    listing = [{'Key': 'single_table/dataset1/README.txt'}]
    mock_list.return_value = listing
    mock_get.return_value = 'Write me'.encode()
    destination = tmp_path / 'subdir' / 'readme.txt'

    # Run
    result = _get_text_file_content('single_table', 'dataset1', 'README.txt', str(destination))

    # Assert
    assert result == 'Write me'
    written = destination.read_text(encoding='utf-8')
    assert written == 'Write me'
724+
725+
726+
@patch('sdv.datasets.demo._get_data_from_bucket')
@patch('sdv.datasets.demo._list_objects')
def test__get_text_file_content_logs_on_save_error(
    mock_list, mock_get, tmp_path, caplog, monkeypatch
):
    """Test the text is still returned and an info is logged when saving fails."""
    # Setup
    listing = [{'Key': 'single_table/dataset1/README.txt'}]
    mock_list.return_value = listing
    mock_get.return_value = 'Write me'.encode()
    destination = tmp_path / 'subdir' / 'readme.txt'

    def _raise_on_open(*args, **kwargs):
        raise OSError('fail-open')

    # Make every ``open`` call fail so the write to ``destination`` cannot succeed.
    monkeypatch.setattr('builtins.open', _raise_on_open)

    # Run
    caplog.set_level(logging.INFO, logger='sdv.datasets.demo')
    result = _get_text_file_content('single_table', 'dataset1', 'README.txt', str(destination))

    # Assert
    assert result == 'Write me'
    expected_message = 'Error saving README.txt for dataset dataset1.'
    assert expected_message in caplog.text
751+
752+
753+
def test_get_readme_and_get_source_call_wrapper(monkeypatch):
    """Test both public functions delegate to ``_get_text_file_content`` with the right file."""
    # Setup
    recorded = []

    def _record_call(modality, dataset_name, filename, output_filepath=None):
        recorded.append((modality, dataset_name, filename, output_filepath))
        return 'X'

    monkeypatch.setattr('sdv.datasets.demo._get_text_file_content', _record_call)

    # Run
    readme = get_readme('single_table', 'dataset1', '/tmp/readme')
    source = get_source('single_table', 'dataset1', '/tmp/source')

    # Assert
    assert readme == 'X'
    assert source == 'X'
    readme_call, source_call = recorded[0], recorded[1]
    assert readme_call == ('single_table', 'dataset1', 'README.txt', '/tmp/readme')
    assert source_call == ('single_table', 'dataset1', 'SOURCE.txt', '/tmp/source')

0 commit comments

Comments
 (0)