Skip to content

Dataset UI shows 0:00/0:00 for audio after push_to_hub #8171

@MNIKIEMA

Description

@MNIKIEMA

I’m seeing an issue where audio previews in the dataset viewer show 0:00/0:00 after uploading with push_to_hub.

What I did

I wanted to avoid torchcodec in order to keep the installed packages small, so I build and upload the dataset with the following code:

import io
import os
import shutil
import tempfile
from pathlib import Path

import pyarrow as pa
import soundfile as sf
from datasets import Audio, Dataset, DatasetInfo, Features
from datasets.table import InMemoryTable
from dotenv import load_dotenv
from loguru import logger

# Called before HF_TOKEN is read below, so any variables python-dotenv
# loads are already visible to os.environ.get().
load_dotenv()

# File suffixes (lowercase, with the leading dot) that build_dataset treats
# as audio inputs when scanning the samples directory.
AUDIO_EXTENSIONS = {".flac", ".wav", ".mp3", ".ogg"}

# Hub access token; falls back to an empty string when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
# Repository id passed to push_to_hub.
TARGET_DATASET = "madoss/test-infra"
# Input directory: a "samples" folder located next to this script.
SAMPLES_DIR = Path(__file__).parent / "samples"

# Arrow struct type used for the "audio" column: a binary "bytes" field
# holding the encoded audio and a string "path" field holding a file name.
_AUDIO_PA_TYPE = pa.struct(
    [
        pa.field("bytes", pa.binary()),
        pa.field("path", pa.string()),
    ]
)


def _to_flac_bytes(path: Path) -> bytes:
    """Return the audio file at *path* re-encoded as FLAC bytes.

    The file is read with soundfile as float32. Input with more than one
    dimension is reduced to a single channel by averaging over axis 1.
    The sample rate reported by soundfile is reused for the output.
    """
    samples, sample_rate = sf.read(str(path), dtype="float32", always_2d=False)
    is_multichannel = samples.ndim > 1
    mono = samples.mean(axis=1) if is_multichannel else samples
    # Encode into memory; the bytes are extracted before the buffer closes.
    with io.BytesIO() as flac_buffer:
        sf.write(flac_buffer, mono, sample_rate, format="FLAC")
        return flac_buffer.getvalue()


def build_dataset(samples_dir: Path) -> Dataset:
    """Build an in-memory ``Dataset`` with a single ``audio`` column.

    Every regular file directly inside *samples_dir* whose suffix matches
    ``AUDIO_EXTENSIONS`` (case-insensitively) is re-encoded to FLAC and
    stored as a ``{"bytes", "path"}`` struct.

    Args:
        samples_dir: Directory scanned (non-recursively) for audio files.

    Returns:
        A ``Dataset`` whose ``audio`` feature is ``Audio(decode=False)``.

    Raises:
        SystemExit: If no matching audio file is found in *samples_dir*.
    """
    audio_files = sorted(
        p
        for p in samples_dir.iterdir()
        # Skip directories and compare the suffix in lowercase so files
        # such as "CLIP.WAV" are not silently ignored.
        if p.is_file() and p.suffix.lower() in AUDIO_EXTENSIONS
    )
    if not audio_files:
        raise SystemExit(f"No audio files found in {samples_dir}")
    logger.info(f"Found {len(audio_files)} audio files")

    # The source file is read directly; copying it to a temporary directory
    # first added I/O without changing the result.
    rows = [
        {
            "bytes": _to_flac_bytes(src),
            # The payload is always FLAC, so the recorded name must carry a
            # ".flac" suffix. Keeping the original ".wav"/".mp3"/".ogg" name
            # mislabels the bytes for anything that infers the format from
            # the extension.
            # NOTE(review): presumably this is what breaks the Hub audio
            # preview - confirm against the dataset viewer.
            "path": src.with_suffix(".flac").name,
        }
        for src in audio_files
    ]
    table = pa.table({"audio": pa.array(rows, type=_AUDIO_PA_TYPE)})
    return Dataset(
        InMemoryTable(table),
        info=DatasetInfo(features=Features({"audio": Audio(decode=False)})),
    )


def main():
    """Build the dataset from ``SAMPLES_DIR`` and push it to ``TARGET_DATASET``.

    Progress is reported through the loguru ``logger``. An empty
    ``HF_TOKEN`` is passed to ``push_to_hub`` as ``None``.
    """
    # The former "Loading VAD model..." message was dropped: this script
    # never loads a model, so the line was misleading in the logs.
    logger.info("Building dataset...")
    ds = build_dataset(SAMPLES_DIR)
    logger.info(f"Dataset: {ds}")
    logger.info(f"Features: {ds.features}")

    logger.info(f"Pushing to {TARGET_DATASET}...")
    ds.push_to_hub(TARGET_DATASET, token=HF_TOKEN or None)
    logger.info(f"Done → https://huggingface.co/datasets/{TARGET_DATASET}")


# Run the build-and-push workflow only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()

Expected behavior

In the dataset UI, each audio sample should display its real duration and be playable with a normal timeline (for example, 0:00 / 0:05).

Actual behavior

The UI shows 0:00/0:00 for audio samples.

Image

Environment

  • datasets==4.8.4, pyarrow==24.0.0
  • Python 3.12.10
  • Ubuntu 24.04.4

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions