
Commit 47c09f4

Update the download_demo and get_available_demos functions (#2669)
1 parent 1705ced commit 47c09f4

6 files changed: 895 additions & 489 deletions

sdv/datasets/demo.py

Lines changed: 212 additions & 64 deletions
```diff
@@ -4,23 +4,23 @@
 import json
 import logging
 import os
-import warnings
 from collections import defaultdict
 from pathlib import Path
 from zipfile import ZipFile
 
 import boto3
 import numpy as np
 import pandas as pd
+import yaml
 from botocore import UNSIGNED
 from botocore.client import Config
-from botocore.exceptions import ClientError
 
+from sdv.errors import DemoResourceNotFoundError
 from sdv.metadata.metadata import Metadata
 
 LOGGER = logging.getLogger(__name__)
-BUCKET = 'sdv-demo-datasets'
-BUCKET_URL = 'https://sdv-demo-datasets.s3.amazonaws.com'
+BUCKET = 'sdv-datasets-public'
+BUCKET_URL = f'https://{BUCKET}.s3.amazonaws.com'
 SIGNATURE_VERSION = UNSIGNED
 METADATA_FILENAME = 'metadata.json'
```

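Note that the new `BUCKET` is public and is read with unsigned requests, so no AWS credentials are needed. A minimal standalone sketch of that pattern (the prefix and `MaxKeys` values are illustrative, not part of this commit):

```python
import boto3
from botocore import UNSIGNED
from botocore.client import Config

# Unsigned config lets anonymous clients read the public demo bucket.
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))

# List a few keys under a modality prefix (values chosen for illustration).
response = s3.list_objects_v2(Bucket='sdv-datasets-public', Prefix='single_table/', MaxKeys=5)
for entry in response.get('Contents', []):
    print(entry['Key'])
```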
```diff
@@ -39,41 +39,154 @@ def _validate_output_folder(output_folder_name):
         )
 
 
+def _create_s3_client():
+    """Create and return an S3 client with unsigned requests."""
+    return boto3.client('s3', config=Config(signature_version=SIGNATURE_VERSION))
+
+
 def _get_data_from_bucket(object_key):
-    session = boto3.Session()
-    s3 = session.client('s3', config=Config(signature_version=SIGNATURE_VERSION))
+    s3 = _create_s3_client()
     response = s3.get_object(Bucket=BUCKET, Key=object_key)
     return response['Body'].read()
 
 
-def _download(modality, dataset_name):
-    dataset_url = f'{BUCKET_URL}/{modality.upper()}/{dataset_name}.zip'
-    object_key = f'{modality.upper()}/{dataset_name}.zip'
-    LOGGER.info(f'Downloading dataset {dataset_name} from {dataset_url}')
-    try:
-        file_content = _get_data_from_bucket(object_key)
-    except ClientError:
-        raise ValueError(
-            f"Invalid dataset name '{dataset_name}'. "
-            'Make sure you have the correct modality for the dataset name or '
-            "use 'get_available_demos' to get a list of demo datasets."
+def _list_objects(prefix):
+    """List all objects under a given prefix using pagination.
+
+    Args:
+        prefix (str):
+            The S3 prefix to list.
+
+    Returns:
+        list[dict]:
+            A list of object summaries.
+    """
+    client = _create_s3_client()
+    contents = []
+    paginator = client.get_paginator('list_objects_v2')
+    for resp in paginator.paginate(Bucket=BUCKET, Prefix=prefix):
+        contents.extend(resp.get('Contents', []))
+
+    if not contents:
+        raise DemoResourceNotFoundError(f"No objects found under '{prefix}' in bucket '{BUCKET}'.")
+
+    return contents
+
+
+def _search_contents_keys(contents, match_fn):
+    """Return list of keys from ``contents`` that satisfy ``match_fn``.
+
+    Args:
+        contents (list[dict]):
+            S3 list_objects-like contents entries.
+        match_fn (callable):
+            Function that receives a key (str) and returns True if it matches.
+
+    Returns:
+        list[str]:
+            Keys in their original order that matched the predicate.
+    """
+    matches = []
+    for entry in contents or []:
+        key = entry.get('Key', '')
+        try:
+            if match_fn(key):
+                matches.append(key)
+        except Exception:
+            continue
+
+    return matches
+
+
+def _find_data_zip_key(contents, dataset_prefix):
+    """Find the 'data.zip' object key under dataset prefix, case-insensitive.
+
+    Args:
+        contents (list[dict]):
+            List of objects from S3.
+        dataset_prefix (str):
+            Prefix like 'single_table/dataset/'.
+
+    Returns:
+        str:
+            The key to the data zip if found.
+    """
+    prefix_lower = dataset_prefix.lower()
+
+    def is_data_zip(key):
+        return key.lower() == f'{prefix_lower}data.zip'
+
+    matches = _search_contents_keys(contents, is_data_zip)
+    if matches:
+        return matches[0]
+
+    raise DemoResourceNotFoundError("Could not find 'data.zip' for the requested dataset.")
+
+
+def _get_first_v1_metadata_bytes(contents, dataset_prefix):
+    """Find and return bytes of the first V1 metadata JSON under `dataset_prefix`.
+
+    Scans S3 listing `contents` and, for any JSON file directly under the dataset prefix,
+    downloads and returns its bytes if it contains METADATA_SPEC_VERSION == 'V1'.
+
+    Returns:
+        bytes:
+            The bytes of the first V1 metadata JSON.
+    """
+    prefix_lower = dataset_prefix.lower()
+
+    def is_direct_json_under_prefix(key):
+        key_lower = key.lower()
+        return (
+            key_lower.startswith(prefix_lower)
+            and key_lower.endswith('.json')
+            and 'metadata' in key_lower
+            and key_lower.count('/') == prefix_lower.count('/')
         )
 
-    return io.BytesIO(file_content)
+    candidate_keys = _search_contents_keys(contents, is_direct_json_under_prefix)
+
+    for key in candidate_keys:
+        try:
+            raw = _get_data_from_bucket(key)
+            metadict = json.loads(raw)
+            if isinstance(metadict, dict) and metadict.get('METADATA_SPEC_VERSION') == 'V1':
+                return raw
+
+        except Exception:
+            continue
+
+    raise DemoResourceNotFoundError(
+        'Could not find a valid metadata JSON with METADATA_SPEC_VERSION "V1".'
+    )
+
+
+def _download(modality, dataset_name):
+    """Download dataset resources from a bucket.
+
+    Returns:
+        tuple:
+            (BytesIO(zip_bytes), metadata_bytes)
+    """
+    dataset_prefix = f'{modality}/{dataset_name}/'
+    LOGGER.info(
+        f"Downloading dataset '{dataset_name}' for modality '{modality}' from "
+        f'{BUCKET_URL}/{dataset_prefix}'
+    )
+    contents = _list_objects(dataset_prefix)
+
+    zip_key = _find_data_zip_key(contents, dataset_prefix)
+    zip_bytes = _get_data_from_bucket(zip_key)
+    metadata_bytes = _get_first_v1_metadata_bytes(contents, dataset_prefix)
+
+    return io.BytesIO(zip_bytes), metadata_bytes
 
 
 def _extract_data(bytes_io, output_folder_name):
     with ZipFile(bytes_io) as zf:
         if output_folder_name:
             os.makedirs(output_folder_name, exist_ok=True)
             zf.extractall(output_folder_name)
-            metadata_v0_filepath = os.path.join(output_folder_name, 'metadata_v0.json')
-            if os.path.isfile(metadata_v0_filepath):
-                os.remove(metadata_v0_filepath)
-            os.rename(
-                os.path.join(output_folder_name, 'metadata_v1.json'),
-                os.path.join(output_folder_name, METADATA_FILENAME),
-            )
 
         else:
             in_memory_directory = {}
```
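The rewritten `_download` implies a per-dataset layout of `<modality>/<dataset_name>/` containing a `data.zip` plus a V1 metadata JSON beside it. A small self-contained sketch of the predicate-style key matching the new helpers rely on (the keys and dataset name are hypothetical):

```python
# Hypothetical listing entries in the shape list_objects_v2 returns.
contents = [
    {'Key': 'single_table/fake_hotel_guests/data.zip'},
    {'Key': 'single_table/fake_hotel_guests/metadata.json'},
    {'Key': 'single_table/fake_hotel_guests/metainfo.yaml'},
]
prefix = 'single_table/fake_hotel_guests/'

def is_data_zip(key):
    # Case-insensitive exact match, mirroring _find_data_zip_key's predicate.
    return key.lower() == f'{prefix.lower()}data.zip'

print([entry['Key'] for entry in contents if is_data_zip(entry['Key'])])
# -> ['single_table/fake_hotel_guests/data.zip']
```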
```diff
@@ -104,32 +217,14 @@ def _get_data(modality, output_folder_name, in_memory_directory):
     return data
 
 
-def _get_metadata(output_folder_name, in_memory_directory, dataset_name):
-    metadata = Metadata()
-    if output_folder_name:
-        metadata_path = os.path.join(output_folder_name, METADATA_FILENAME)
-        metadata = metadata.load_from_json(metadata_path, dataset_name)
-
-    else:
-        metadata_path = 'metadata_v2.json'
-        if metadata_path not in in_memory_directory:
-            warnings.warn(f'Metadata for {dataset_name} is missing updated version v2.')
-            metadata_path = 'metadata_v1.json'
-
-        metadict = json.loads(in_memory_directory[metadata_path])
-        metadata = metadata.load_from_dict(metadict, dataset_name)
-
-    return metadata
-
-
 def download_demo(modality, dataset_name, output_folder_name=None):
     """Download a demo dataset.
 
     Args:
         modality (str):
             The modality of the dataset: ``'single_table'``, ``'multi_table'``, ``'sequential'``.
         dataset_name (str):
-            Name of the dataset to be downloaded from the sdv-datasets S3 bucket.
+            Name of the dataset to be downloaded from the sdv-datasets-public S3 bucket.
         output_folder_name (str or None):
             The name of the local folder where the metadata and data should be stored.
             If ``None`` the data is not saved locally and is loaded as a Python object.
```
```diff
@@ -149,14 +244,41 @@ def download_demo(modality, dataset_name, output_folder_name=None):
     """
     _validate_modalities(modality)
     _validate_output_folder(output_folder_name)
-    bytes_io = _download(modality, dataset_name)
-    in_memory_directory = _extract_data(bytes_io, output_folder_name)
+    data_io, metadata_bytes = _download(modality, dataset_name)
+    in_memory_directory = _extract_data(data_io, output_folder_name)
     data = _get_data(modality, output_folder_name, in_memory_directory)
-    metadata = _get_metadata(output_folder_name, in_memory_directory, dataset_name)
+
+    try:
+        metadict = json.loads(metadata_bytes)
+        metadata = Metadata().load_from_dict(metadict, dataset_name)
+    except Exception as e:
+        raise DemoResourceNotFoundError('Failed to parse metadata JSON for the dataset.') from e
 
     return data, metadata
 
 
+def _iter_metainfo_yaml_entries(contents, modality):
+    """Yield (dataset_name, yaml_key) for metainfo.yaml files under a modality.
+
+    This matches keys like '<modality>/<dataset>/metainfo.yaml'.
+    """
+    modality_lower = (modality or '').lower()
+
+    def is_metainfo_yaml(key):
+        parts = key.split('/')
+        if len(parts) != 3:
+            return False
+        if parts[0].lower() != modality_lower:
+            return False
+        if parts[-1].lower() != 'metainfo.yaml':
+            return False
+        return bool(parts[1])
+
+    for key in _search_contents_keys(contents, is_metainfo_yaml):
+        dataset_name = key.split('/')[1]
+        yield dataset_name, key
+
+
 def get_available_demos(modality):
     """Get demo datasets available for a ``modality``.
 
```
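`download_demo` now builds the `Metadata` object straight from the JSON bytes fetched from the bucket rather than from files unpacked out of the zip. A minimal sketch of that path, using a hypothetical trimmed V1 payload in place of the downloaded bytes:

```python
import json

from sdv.metadata.metadata import Metadata

# Hypothetical trimmed V1 payload standing in for the bytes fetched from S3.
metadata_bytes = b'{"METADATA_SPEC_VERSION": "V1", "tables": {}, "relationships": []}'

metadict = json.loads(metadata_bytes)
metadata = Metadata().load_from_dict(metadict, 'fake_hotel_guests')
```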
```diff
@@ -170,23 +292,49 @@ def get_available_demos(modality):
                 ``dataset_name``: The name of the dataset.
                 ``size_MB``: The unzipped folder size in MB.
                 ``num_tables``: The number of tables in the dataset.
-
-    Raises:
-        Error:
-            * If ``modality`` is not ``'single_table'``, ``'multi_table'`` or ``'sequential'``.
     """
     _validate_modalities(modality)
-    client = boto3.client('s3', config=Config(signature_version=SIGNATURE_VERSION))
+    contents = _list_objects(f'{modality}/')
     tables_info = defaultdict(list)
-    for item in client.list_objects(Bucket=BUCKET)['Contents']:
-        dataset_modality, dataset = item['Key'].split('/', 1)
-        if dataset_modality == modality.upper():
-            tables_info['dataset_name'].append(dataset.replace('.zip', ''))
-            headers = client.head_object(Bucket=BUCKET, Key=item['Key'])['Metadata']
-            size_mb = headers.get('size-mb', np.nan)
-            tables_info['size_MB'].append(round(float(size_mb), 2))
-            tables_info['num_tables'].append(headers.get('num-tables', np.nan))
-
-    df = pd.DataFrame(tables_info)
-    df['num_tables'] = pd.to_numeric(df['num_tables'])
-    return df
+    for dataset_name, yaml_key in _iter_metainfo_yaml_entries(contents, modality):
+        try:
+            raw = _get_data_from_bucket(yaml_key)
+            info = yaml.safe_load(raw) or {}
+            name = info.get('dataset-name') or dataset_name
+
+            size_mb_val = info.get('dataset-size-mb')
+            try:
+                size_mb = float(size_mb_val) if size_mb_val is not None else np.nan
+            except (ValueError, TypeError):
+                LOGGER.info(
+                    f'Invalid dataset-size-mb {size_mb_val} for dataset {name}; defaulting to NaN.'
+                )
+                size_mb = np.nan
+
+            num_tables_val = info.get('num-tables', np.nan)
+            if isinstance(num_tables_val, str):
+                try:
+                    num_tables_val = float(num_tables_val)
+                except (ValueError, TypeError):
+                    LOGGER.info(
+                        f'Could not cast num_tables_val {num_tables_val} to float for '
+                        f'dataset {name}; defaulting to NaN.'
+                    )
+                    num_tables_val = np.nan
+
+            try:
+                num_tables = int(num_tables_val) if not pd.isna(num_tables_val) else np.nan
+            except (ValueError, TypeError):
+                LOGGER.info(
+                    f'Invalid num-tables {num_tables_val} for dataset {name} when parsing as int.'
+                )
+                num_tables = np.nan
+
+            tables_info['dataset_name'].append(name)
+            tables_info['size_MB'].append(size_mb)
+            tables_info['num_tables'].append(num_tables)
+
+        except Exception:
+            continue
+
+    return pd.DataFrame(tables_info)
```
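`get_available_demos` now sources its columns from each dataset's `metainfo.yaml` rather than from S3 object metadata headers. A sketch of the fields the loop consumes, parsed with the same `yaml.safe_load` call (the values are hypothetical; real files may carry additional keys):

```python
import yaml

# Hypothetical metainfo.yaml body showing the keys the loop reads.
raw = 'dataset-name: fake_hotel_guests\ndataset-size-mb: 0.2\nnum-tables: 1\n'

info = yaml.safe_load(raw) or {}
print(info.get('dataset-name'), info.get('dataset-size-mb'), info.get('num-tables'))
# -> fake_hotel_guests 0.2 1
```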

sdv/errors.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -91,3 +91,11 @@ class RefitWarning(UserWarning):
     Warning to be raised if a change to a synthesizer requires the synthesizer
     to be refit for the change to be applied.
     """
+
+
+class DemoResourceNotFoundError(Exception):
+    """Raised when a demo dataset or one of its resources cannot be found.
+
+    This error is intended for missing demo assets such as the dataset archive,
+    metadata, license, README, or other auxiliary files in the demo bucket.
+    """
```
