I’m seeing an issue where audio previews in the dataset viewer show 0:00/0:00 after uploading with push_to_hub.
What I did
I wanted to avoid torchcodec to keep the installed packages small, so I built and uploaded the dataset with the following code:
import io
import os
import shutil
import tempfile
from pathlib import Path
import pyarrow as pa
import soundfile as sf
from datasets import Audio, Dataset, DatasetInfo, Features
from datasets.table import InMemoryTable
from dotenv import load_dotenv
from loguru import logger
# Run python-dotenv's loader first so the environment lookup below can see
# any values it provides.
load_dotenv()

# File suffixes (lower-case, with the leading dot) recognised as audio inputs.
AUDIO_EXTENSIONS = {".wav", ".flac", ".ogg", ".mp3"}
# Hub access token; an empty string when the variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN", "")
# Hub repository that receives the dataset.
TARGET_DATASET = "madoss/test-infra"
# Folder of input audio, located next to this script.
SAMPLES_DIR = Path(__file__).parent.joinpath("samples")

# Arrow storage layout of one audio cell: the encoded file bytes plus a name.
_AUDIO_PA_TYPE = pa.struct(
    [
        ("bytes", pa.binary()),
        ("path", pa.string()),
    ]
)
def _to_flac_bytes(path: Path) -> bytes:
    """Re-encode the audio file at *path* as FLAC and return the encoded bytes.

    The file is decoded to float32 samples. Input with more than one
    dimension is reduced by averaging over axis 1 (presumably the channel
    axis, giving a mono signal). The sample rate reported by the reader is
    passed through unchanged to the writer.
    """
    samples, sample_rate = sf.read(str(path), dtype="float32", always_2d=False)
    # Only multi-dimensional data needs the down-mix; 1-D data is used as is.
    mono = samples.mean(axis=1) if samples.ndim > 1 else samples
    with io.BytesIO() as sink:
        sf.write(sink, mono, sample_rate, format="FLAC")
        # Read the buffer before the context manager closes it.
        return sink.getvalue()
def build_dataset(samples_dir: Path) -> Dataset:
    """Build an in-memory audio dataset from the files in *samples_dir*.

    Every regular file whose suffix matches ``AUDIO_EXTENSIONS`` (compared
    case-insensitively) is re-encoded with ``_to_flac_bytes`` and stored as an
    undecoded audio cell of the form ``{"bytes": ..., "path": ...}``.

    Args:
        samples_dir: Directory scanned (non-recursively) for audio files.

    Returns:
        A ``Dataset`` with a single ``audio`` column typed
        ``Audio(decode=False)``.

    Raises:
        SystemExit: If no matching audio file is found in *samples_dir*.
    """
    # Lower-case the suffix so "CLIP.WAV" is picked up too, and skip
    # directories that merely happen to have an audio-looking name.
    audio_files = sorted(
        p
        for p in samples_dir.iterdir()
        if p.is_file() and p.suffix.lower() in AUDIO_EXTENSIONS
    )
    if not audio_files:
        raise SystemExit(f"No audio files found in {samples_dir}")
    logger.info(f"Found {len(audio_files)} audio files")

    # The stored bytes are always FLAC, so the recorded file name must carry a
    # ".flac" suffix as well. Keeping the source suffix (e.g. "clip.wav") makes
    # the name disagree with the payload; a consumer that derives the container
    # or MIME type from the extension is then told the wrong format and cannot
    # decode the audio.
    # The sources are only read, so they are converted in place; no scratch
    # copy is needed.
    rows = [
        {"bytes": _to_flac_bytes(src), "path": src.with_suffix(".flac").name}
        for src in audio_files
    ]
    table = pa.table({"audio": pa.array(rows, type=_AUDIO_PA_TYPE)})
    return Dataset(
        InMemoryTable(table),
        info=DatasetInfo(features=Features({"audio": Audio(decode=False)})),
    )
def main() -> None:
    """Build the dataset from ``SAMPLES_DIR`` and push it to ``TARGET_DATASET``."""
    # The former "Loading VAD model..." message was dropped: nothing in this
    # script loads a model, so the log line was misleading.
    logger.info("Building dataset...")
    ds = build_dataset(SAMPLES_DIR)
    logger.info(f"Dataset: {ds}")
    logger.info(f"Features: {ds.features}")

    logger.info(f"Pushing to {TARGET_DATASET}...")
    # An empty token string is turned into None rather than being sent as a
    # credential. NOTE(review): presumably push_to_hub then falls back to
    # locally stored credentials - confirm.
    ds.push_to_hub(TARGET_DATASET, token=HF_TOKEN or None)
    logger.info(f"Done → https://huggingface.co/datasets/{TARGET_DATASET}")


if __name__ == "__main__":
    main()
Expected behavior
In the dataset UI, each audio sample should display its real duration and be playable with a normal timeline (for example, 0:00 / 0:05).
Actual behavior
The UI shows 0:00/0:00 for audio samples.
Environment
datasets==4.8.4, pyarrow==24.0.0
Python 3.12.10
Ubuntu 24.04.4
I’m seeing an issue where audio previews in the dataset viewer show 0:00/0:00 after uploading with push_to_hub.
What I did
I tried to avoid torchcodec because I want to keep the packages small using the following code:
Expected behavior
In the dataset UI, each audio sample should display its real duration and be playable with a normal timeline (for example, 0:00 / 0:05).
Actual behavior
The UI shows 0:00/0:00 for audio samples.
Environment
datasets==4.8.4, pyarrow==24.0.0
Python 3.12.10
Ubuntu 24.04.4