diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d299ceb4f2b..a4a9041fd28 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -44,15 +44,17 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Setup FFmpeg + if: ${{ matrix.os == 'ubuntu-latest' }} + run: | + sudo apt update + sudo apt install -y ffmpeg - name: Set up Python 3.9 uses: actions/setup-python@v5 with: python-version: "3.9" - name: Upgrade pip run: python -m pip install --upgrade pip - - name: Pin setuptools-scm - if: ${{ matrix.os == 'ubuntu-latest' }} - run: echo "installing pinned version of setuptools-scm to fix seqeval installation on 3.7" && pip install "setuptools-scm==6.4.2" - name: Install uv run: pip install --upgrade uv - name: Install dependencies @@ -80,6 +82,11 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Setup FFmpeg + if: ${{ matrix.os == 'ubuntu-latest' }} + run: | + sudo apt update + sudo apt install -y ffmpeg - name: Set up Python 3.11 uses: actions/setup-python@v5 with: @@ -107,6 +114,11 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Setup FFmpeg + if: ${{ matrix.os == 'ubuntu-latest' }} + run: | + sudo apt update + sudo apt install -y ffmpeg - name: Set up Python 3.11 uses: actions/setup-python@v5 with: diff --git a/docs/source/about_dataset_features.mdx b/docs/source/about_dataset_features.mdx index fd3f9e1d560..d575e28065d 100644 --- a/docs/source/about_dataset_features.mdx +++ b/docs/source/about_dataset_features.mdx @@ -53,7 +53,7 @@ See the [flatten](./process#flatten) section to learn how you can extract the ne -The array feature type is useful for creating arrays of various sizes. You can create arrays with two dimensions using [`Array2D`], and even arrays with five dimensions using [`Array5D`]. +The array feature type is useful for creating arrays of various sizes. 
You can create arrays with two dimensions using [`Array2D`], and even arrays with five dimensions using [`Array5D`]. ```py >>> features = Features({'a': Array2D(shape=(1, 3), dtype='int32')}) @@ -69,9 +69,9 @@ The array type also allows the first dimension of the array to be dynamic. This Audio datasets have a column with type [`Audio`], which contains three important fields: -* `array`: the decoded audio data represented as a 1-dimensional array. -* `path`: the path to the downloaded audio file. -* `sampling_rate`: the sampling rate of the audio data. +- `array`: the decoded audio data represented as a 1-dimensional array. +- `path`: the path to the downloaded audio file. +- `sampling_rate`: the sampling rate of the audio data. When you load an audio dataset and call the audio column, the [`Audio`] feature automatically decodes and resamples the audio file: @@ -80,10 +80,7 @@ When you load an audio dataset and call the audio column, the [`Audio`] feature >>> dataset = load_dataset("PolyAI/minds14", "en-US", split="train") >>> dataset[0]["audio"] -{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, - 0. , 0. 
], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 8000} + ``` @@ -92,7 +89,7 @@ Index into an audio dataset using the row index first and then the `audio` colum -With `decode=False`, the [`Audio`] type simply gives you the path or the bytes of the audio file, without decoding it into an `array`, +With `decode=False`, the [`Audio`] type simply gives you the path or the bytes of the audio file, without decoding it into a torchcodec `AudioDecoder` object, ```py >>> dataset = load_dataset("PolyAI/minds14", "en-US", split="train").cast_column("audio", Audio(decode=False)) @@ -126,7 +123,7 @@ Index into an image dataset using the row index first and then the `image` colum -With `decode=False`, the [`Image`] type simply gives you the path or the bytes of the image file, without decoding it into an `PIL.Image`, +With `decode=False`, the [`Image`] type simply gives you the path or the bytes of the image file, without decoding it into a `PIL.Image`, ```py >>> dataset = load_dataset("AI-Lab-Makerere/beans", split="train").cast_column("image", Image(decode=False)) @@ -146,4 +143,4 @@ You can also define a dataset of images from numpy arrays: And in this case the numpy arrays are encoded into PNG (or TIFF if the pixels values precision is important). For multi-channels arrays like RGB or RGBA, only uint8 is supported. If you use a larger precision, you get a warning and the array is downcasted to uint8. -For gray-scale images you can use the integer or float precision you want as long as it is compatible with `Pillow`. A warning is shown if your image integer or float precision is too high, and in this case the array is downcated: an int64 array is downcasted to int32, and a float64 array is downcasted to float32. 
+For gray-scale images you can use the integer or float precision you want as long as it is compatible with `Pillow`. A warning is shown if your image integer or float precision is too high, and in this case the array is downcasted: an int64 array is downcasted to int32, and a float64 array is downcasted to float32. diff --git a/docs/source/audio_dataset.mdx b/docs/source/audio_dataset.mdx index 6e6e0b75863..8419a70f698 100644 --- a/docs/source/audio_dataset.mdx +++ b/docs/source/audio_dataset.mdx @@ -10,10 +10,9 @@ dataset = load_dataset("/my_dataset") There are several methods for creating and sharing an audio dataset: -* Create an audio dataset from local files in python with [`Dataset.push_to_hub`]. This is an easy way that requires only a few steps in python. - -* Create an audio dataset repository with the `AudioFolder` builder. This is a no-code solution for quickly creating an audio dataset with several thousand audio files. +- Create an audio dataset from local files in Python with [`Dataset.push_to_hub`]. This is an easy way that requires only a few steps in Python. +- Create an audio dataset repository with the `AudioFolder` builder. This is a no-code solution for quickly creating an audio dataset with several thousand audio files. @@ -28,10 +27,7 @@ You can load your own dataset using the paths to your audio files. Use the [`~Da ```py >>> audio_dataset = Dataset.from_dict({"audio": ["path/to/audio_1", "path/to/audio_2", ..., "path/to/audio_n"]}).cast_column("audio", Audio()) >>> audio_dataset[0]["audio"] -{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, - 0. , 0. ], dtype=float32), - 'path': 'path/to/audio_1', - 'sampling_rate': 16000} + ``` Then upload the dataset to the Hugging Face Hub using [`Dataset.push_to_hub`]: @@ -51,7 +47,6 @@ my_dataset/ ## AudioFolder - The `AudioFolder` is a dataset builder designed to quickly load an audio dataset with several thousand audio files without requiring you to write any code. 
@@ -101,7 +96,6 @@ If all audio files are contained in a single directory or if they are not on the - If there is additional information you'd like to include about your dataset, like text captions or bounding boxes, add it as a `metadata.csv` file in your folder. This lets you quickly create datasets for different computer vision tasks like text captioning or object detection. You can also use a JSONL file `metadata.jsonl` or a Parquet file `metadata.parquet`. ``` diff --git a/docs/source/audio_load.mdx b/docs/source/audio_load.mdx index c9db4fd5686..0321a82e624 100644 --- a/docs/source/audio_load.mdx +++ b/docs/source/audio_load.mdx @@ -8,7 +8,6 @@ Audio decoding is based on the [`soundfile`](https://github.com/bastibe/python-s To work with audio datasets, you need to have the `audio` dependencies installed. Check out the [installation](./installation#audio) guide to learn how to install it. - ## Local files You can load your own dataset using the paths to your audio files. Use the [`~Dataset.cast_column`] function to take a column of audio file paths, and cast it to the [`Audio`] feature: @@ -16,10 +15,7 @@ You can load your own dataset using the paths to your audio files. Use the [`~Da ```py >>> audio_dataset = Dataset.from_dict({"audio": ["path/to/audio_1", "path/to/audio_2", ..., "path/to/audio_n"]}).cast_column("audio", Audio()) >>> audio_dataset[0]["audio"] -{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, - 0. , 0. ], dtype=float32), - 'path': 'path/to/audio_1', - 'sampling_rate': 16000} + ``` ## AudioFolder @@ -99,7 +95,7 @@ For a guide on how to load any type of dataset, take a look at the general process guide. - ## Cast The [`~Dataset.cast_column`] function is used to cast a column to another feature to be decoded. 
When you use this function with the [`Audio`] feature, you can resample the sampling rate: @@ -22,16 +21,26 @@ The [`~Dataset.cast_column`] function is used to cast a column to another featur Audio files are decoded and resampled on-the-fly, so the next time you access an example, the audio file is resampled to 16kHz: ```py ->>> dataset[0]["audio"] -{'array': array([ 2.3443763e-05, 2.1729663e-04, 2.2145823e-04, ..., - 3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 16000} +>>> audio = dataset[0]["audio"] + +>>> audio = dataset[0]["audio"] +>>> samples = audio.get_all_samples() +>>> samples.data +tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3447e-06, + -1.9127e-04, -5.3330e-05]]) +>>> samples.sample_rate +16000 ```
- - + +
## Map @@ -40,30 +49,30 @@ The [`~Dataset.map`] function helps preprocess your entire dataset at once. Depe - For pretrained speech recognition models, load a feature extractor and tokenizer and combine them in a `processor`: - ```py - >>> from transformers import AutoTokenizer, AutoFeatureExtractor, AutoProcessor + ```py + >>> from transformers import AutoTokenizer, AutoFeatureExtractor, AutoProcessor - >>> model_checkpoint = "facebook/wav2vec2-large-xlsr-53" - # after defining a vocab.json file you can instantiate a tokenizer object: - >>> tokenizer = AutoTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|") - >>> feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint) - >>> processor = AutoProcessor.from_pretrained(feature_extractor=feature_extractor, tokenizer=tokenizer) - ``` + >>> model_checkpoint = "facebook/wav2vec2-large-xlsr-53" + # after defining a vocab.json file you can instantiate a tokenizer object: + >>> tokenizer = AutoTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|") + >>> feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint) + >>> processor = AutoProcessor.from_pretrained(feature_extractor=feature_extractor, tokenizer=tokenizer) + ``` - For fine-tuned speech recognition models, you only need to load a `processor`: - ```py - >>> from transformers import AutoProcessor + ```py + >>> from transformers import AutoProcessor - >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") - ``` + >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") + ``` When you use [`~Dataset.map`] with your preprocessing function, include the `audio` column to ensure you're actually resampling the audio data: ```py >>> def prepare_dataset(batch): ... audio = batch["audio"] -... batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0] +... 
batch["input_values"] = processor(audio.get_all_samples().data, sampling_rate=audio["sampling_rate"]).input_values[0] ... batch["input_length"] = len(batch["input_values"]) ... with processor.as_target_processor(): ... batch["labels"] = processor(batch["sentence"]).input_ids diff --git a/docs/source/create_dataset.mdx b/docs/source/create_dataset.mdx index 7f12b2575c6..0a6d508f9ae 100644 --- a/docs/source/create_dataset.mdx +++ b/docs/source/create_dataset.mdx @@ -4,8 +4,8 @@ Sometimes, you may need to create a dataset if you're working with your own data In this tutorial, you'll learn how to use 🤗 Datasets low-code methods for creating all types of datasets: -* Folder-based builders for quickly creating an image or audio dataset -* `from_` methods for creating datasets from local files +- Folder-based builders for quickly creating an image or audio dataset +- `from_` methods for creating datasets from local files ## File-based builders @@ -24,10 +24,10 @@ To get the list of supported formats and code examples, follow this guide [here] There are two folder-based builders, [`ImageFolder`] and [`AudioFolder`]. These are low-code methods for quickly creating an image or speech and audio dataset with several thousand examples. They are great for rapidly prototyping computer vision and speech models before scaling to a larger dataset. Folder-based builders takes your data and automatically generates the dataset's features, splits, and labels. Under the hood: -* [`ImageFolder`] uses the [`~datasets.Image`] feature to decode an image file. Many image extension formats are supported, such as jpg and png, but other formats are also supported. You can check the complete [list](https://github.com/huggingface/datasets/blob/b5672a956d5de864e6f5550e493527d962d6ae55/src/datasets/packaged_modules/imagefolder/imagefolder.py#L39) of supported image extensions. -* [`AudioFolder`] uses the [`~datasets.Audio`] feature to decode an audio file. 
Audio extensions such as wav and mp3 are supported, and you can check the complete [list](https://github.com/huggingface/datasets/blob/b5672a956d5de864e6f5550e493527d962d6ae55/src/datasets/packaged_modules/audiofolder/audiofolder.py#L39) of supported audio extensions. +- [`ImageFolder`] uses the [`~datasets.Image`] feature to decode an image file. Many image extension formats are supported, such as jpg and png, but other formats are also supported. You can check the complete [list](https://github.com/huggingface/datasets/blob/b5672a956d5de864e6f5550e493527d962d6ae55/src/datasets/packaged_modules/imagefolder/imagefolder.py#L39) of supported image extensions. +- [`AudioFolder`] uses the [`~datasets.Audio`] feature to decode an audio file. Extensions such as wav, mp3, and even mp4 are supported, and you can check the complete [list](https://ffmpeg.org/ffmpeg-formats.html) of supported audio extensions. Decoding is done via ffmpeg. -The dataset splits are generated from the repository structure, and the label names are automatically inferred from the directory name. +The dataset splits are generated from the repository structure, and the label names are automatically inferred from the directory name. For example, if your image dataset (it is the same for an audio dataset) is stored like this: @@ -44,7 +44,7 @@ pokemon/test/water/wartortle.png Then this is how the folder-based builder generates an example:
- +
Create the image dataset by specifying `imagefolder` in [`load_dataset`]: diff --git a/docs/source/installation.md b/docs/source/installation.md index a6027b2ee5d..c52b72cfc12 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -30,7 +30,7 @@ You should install 🤗 Datasets in a [virtual environment](https://docs.python. ```bash # Activate the virtual environment source .env/bin/activate - + # Deactivate the virtual environment source .env/bin/deactivate ``` @@ -65,18 +65,6 @@ To work with audio datasets, you need to install the [`Audio`] feature as an ext pip install datasets[audio] ``` - - -To decode mp3 files, you need to have at least version 1.1.0 of the `libsndfile` system library. Usually, it's bundled with the python [`soundfile`](https://github.com/bastibe/python-soundfile) package, which is installed as an extra audio dependency for 🤗 Datasets. -For Linux, the required version of `libsndfile` is bundled with `soundfile` starting from version 0.12.0. You can run the following command to determine which version of `libsndfile` is being used by `soundfile`: - -```bash -python -c "import soundfile; print(soundfile.__libsndfile_version__)" -``` - - - - ## Vision To work with image datasets, you need to install the [`Image`] feature as an extra dependency: diff --git a/docs/source/process.mdx b/docs/source/process.mdx index e5aba32f30d..bdc7e33caf5 100644 --- a/docs/source/process.mdx +++ b/docs/source/process.mdx @@ -289,7 +289,7 @@ Notice how the subfields are now their own independent columns: `answers.text` a Some of the more powerful applications of 🤗 Datasets come from using the [`~Dataset.map`] function. The primary purpose of [`~Dataset.map`] is to speed up processing functions. It allows you to apply a processing function to each example in a dataset, independently or in batches. This function can even create new rows and columns. -In the following example, prefix each `sentence1` value in the dataset with `'My sentence: '`. 
+In the following example, prefix each `sentence1` value in the dataset with `'My sentence: '`. Start by creating a function that adds `'My sentence: '` to the beginning of each sentence. The function needs to accept and output a `dict`: @@ -348,18 +348,18 @@ Multiprocessing significantly speeds up processing by parallelizing processes on >>> updated_dataset = dataset.map(lambda example, idx: {"sentence2": f"{idx}: " + example["sentence2"]}, with_indices=True, num_proc=4) ``` -The [`~Dataset.map`] also works with the rank of the process if you set `with_rank=True`. This is analogous to the `with_indices` parameter. The `with_rank` parameter in the mapped function goes after the `index` one if it is already present. +The [`~Dataset.map`] also works with the rank of the process if you set `with_rank=True`. This is analogous to the `with_indices` parameter. The `with_rank` parameter in the mapped function goes after the `index` one if it is already present. ```py >>> import torch >>> from multiprocess import set_start_method ->>> from transformers import AutoTokenizer, AutoModelForCausalLM +>>> from transformers import AutoTokenizer, AutoModelForCausalLM >>> from datasets import load_dataset ->>> +>>> >>> # Get an example dataset >>> dataset = load_dataset("fka/awesome-chatgpt-prompts", split="train") ->>> ->>> # Get an example model and its tokenizer +>>> +>>> # Get an example model and its tokenizer >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-0.5B-Chat").eval() >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B-Chat") >>> @@ -367,7 +367,7 @@ The [`~Dataset.map`] also works with the rank of the process if you set `with_ra ... # Move the model on the right GPU if it's not there already ... device = f"cuda:{(rank or 0) % torch.cuda.device_count()}" ... model.to(device) -... +... ... # Your big GPU call goes here, for example: ... chats = [[ ... 
{"role": "system", "content": "You are a helpful assistant."}, @@ -395,7 +395,7 @@ The [`~Dataset.map`] also works with the rank of the process if you set `with_ra ... ) ``` -The main use-case for rank is to parallelize computation across several GPUs. This requires setting `multiprocess.set_start_method("spawn")`. If you don't you'll receive the following CUDA error: +The main use-case for rank is to parallelize computation across several GPUs. This requires setting `multiprocess.set_start_method("spawn")`. If you don't you'll receive the following CUDA error: ```bash RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method. @@ -528,7 +528,7 @@ Note the presence of a `Semaphore`: it sets the maximum number of queries that c Let's use it to call the [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) model and ask it to return the main topic of each math problem in the [Maxwell-Jia/AIME_2024](https://huggingface.co/Maxwell-Jia/AIME_2024) dataset: -```python +````python >>> from datasets import load_dataset >>> ds = load_dataset("Maxwell-Jia/AIME_2024", split="train") >>> model = "microsoft/Phi-3-mini-4k-instruct" @@ -542,7 +542,7 @@ Let's use it to call the [microsoft/Phi-3-mini-4k-instruct](https://huggingface. 'Solution': 'Denote $\\log_2(x) = a$, $\\log_2(y) = b$, and..., 'Answer': 33, 'Output': 'The main topic is Logarithms.'} -``` +```` Here, [`Dataset.map`] runs many `get_topic` function asynchronously so it doesn't have to wait for every single model response which would take a lot of time to do sequentially. @@ -644,7 +644,7 @@ You can also concatenate two datasets horizontally by setting `axis=1` as long a ### Interleave -You can also mix several datasets together by taking alternating examples from each one to create a new dataset. This is known as *interleaving*, which is enabled by the [`interleave_datasets`] function. 
Both [`interleave_datasets`] and [`concatenate_datasets`] work with regular [`Dataset`] and [`IterableDataset`] objects. +You can also mix several datasets together by taking alternating examples from each one to create a new dataset. This is known as _interleaving_, which is enabled by the [`interleave_datasets`] function. Both [`interleave_datasets`] and [`concatenate_datasets`] work with regular [`Dataset`] and [`IterableDataset`] objects. Refer to the [Stream](./stream#interleave) guide for an example of how to interleave [`IterableDataset`] objects. You can define sampling probabilities for each of the original datasets to specify how to interleave the datasets. @@ -779,9 +779,9 @@ The [`~Dataset.with_transform`] function applies a custom formatting transform o There is also [`~Dataset.set_transform`] which does the same but runs in-place. -You can also use the [`~Dataset.with_transform`] function to decode formats not supported by [`Features`]. For example, the [`Audio`] feature uses [`soundfile`](https://python-soundfile.readthedocs.io/en/0.11.0/) - a fast and simple library to install - but it does not provide support for less common audio formats. Here is where you can use [`~Dataset.set_transform`] to apply a custom decoding transform on the fly. You're free to use any library you like to decode the audio files. +You can also use the [`~Dataset.with_transform`] function for custom decoding on [`Features`]. -The example below uses the [`pydub`](http://pydub.com/) package to open an audio format not supported by `soundfile`: +The example below uses the [`pydub`](http://pydub.com/) package as an alternative to `torchcodec` decoding: ```py >>> import numpy as np @@ -844,12 +844,12 @@ Use the [`~Dataset.save_to_disk`] and [`load_from_disk`] function to reload the 🤗 Datasets supports exporting as well so you can work with your dataset in other applications. 
The following table shows currently supported file formats you can export to: -| File type | Export method | -|-------------------------|----------------------------------------------------------------| -| CSV | [`Dataset.to_csv`] | -| JSON | [`Dataset.to_json`] | -| Parquet | [`Dataset.to_parquet`] | -| SQL | [`Dataset.to_sql`] | +| File type | Export method | +| ----------------------- | ------------------------------------------------------------------- | +| CSV | [`Dataset.to_csv`] | +| JSON | [`Dataset.to_json`] | +| Parquet | [`Dataset.to_parquet`] | +| SQL | [`Dataset.to_sql`] | | In-memory Python object | [`Dataset.to_pandas`], [`Dataset.to_polars`] or [`Dataset.to_dict`] | For example, export your dataset to a CSV file like this: diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx index d001614641f..092940de95c 100644 --- a/docs/source/quickstart.mdx +++ b/docs/source/quickstart.mdx @@ -19,20 +19,44 @@ This quickstart is intended for developers who are ready to dive into the code a Each dataset is unique, and depending on the task, some datasets may require additional steps to prepare it for training. But you can always use 🤗 Datasets tools to load and process a dataset. The fastest and easiest way to get started is by loading an existing dataset from the [Hugging Face Hub](https://huggingface.co/datasets). There are thousands of datasets to choose from, spanning many tasks. Choose the type of dataset you want to work with, and let's get started! 
@@ -49,31 +73,23 @@ pip install datasets 🤗 Datasets also support audio and image data formats: -* To work with audio datasets, install the [`Audio`] feature: +- To work with audio datasets, install the [`Audio`] feature: - ```bash - pip install datasets[audio] - ``` + ```bash + pip install datasets[audio] + ``` -* To work with image datasets, install the [`Image`] feature: +- To work with image datasets, install the [`Image`] feature: - ```bash - pip install datasets[vision] - ``` + ```bash + pip install datasets[vision] + ``` Besides 🤗 Datasets, make sure your preferred machine learning framework is installed: - -```bash -pip install torch -``` - - -```bash -pip install tensorflow -``` - + ```bash pip install torch ``` + ```bash pip install tensorflow ``` ## Audio @@ -102,10 +118,7 @@ Audio datasets are loaded just like text datasets. However, an audio dataset is ```py >>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) >>> dataset[0]["audio"] -{'array': array([ 2.3443763e-05, 2.1729663e-04, 2.2145823e-04, ..., - 3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 16000} + ``` **4**. Create a function to preprocess the audio `array` with the feature extractor, and truncate and pad the sequences into tidy rectangular tensors. The most important thing to remember is to call the audio `array` in the feature extractor since the `array` - the actual speech signal - is the model input. @@ -114,7 +127,7 @@ Once you have a preprocessing function, use the [`~Dataset.map`] function to spe ```py >>> def preprocess_function(examples): -... audio_arrays = [x["array"] for x in examples["audio"]] +... audio_arrays = [x.get_all_samples().data for x in examples["audio"]] ... inputs = feature_extractor( ... audio_arrays, ... 
sampling_rate=16000, @@ -145,12 +158,13 @@ Use the [`~Dataset.set_format`] function to set the dataset format to `torch` an >>> dataset.set_format(type="torch", columns=["input_values", "labels"]) >>> dataloader = DataLoader(dataset, batch_size=4) ``` + Use the [`~transformers.TFPreTrainedModel.prepare_tf_dataset`] method from 🤗 Transformers to prepare the dataset to be compatible with TensorFlow, and ready to train/fine-tune a model, as it wraps a HuggingFace [`~datasets.Dataset`] as a `tf.data.Dataset` -with collation and batching, so one can pass it directly to Keras methods like `fit()` without further modification. +with collation and batching, so one can pass it directly to Keras methods like `fit()` without further modification. ```py >>> import tensorflow as tf @@ -161,6 +175,7 @@ with collation and batching, so one can pass it directly to Keras methods like ` ... shuffle=True, ... ) ``` + @@ -225,12 +240,13 @@ Wrap the dataset in [`torch.utils.data.DataLoader`](https://alband.github.io/doc ... for example in examples: ... images.append((example["pixel_values"])) ... labels.append(example["labels"]) -... +... ... pixel_values = torch.stack(images) ... labels = torch.tensor(labels) ... return {"pixel_values": pixel_values, "labels": labels} >>> dataloader = DataLoader(dataset, collate_fn=collate_fn, batch_size=4) ``` + @@ -267,6 +283,7 @@ pip install -U albumentations opencv-python ... shuffle=True, ... 
) ``` + @@ -336,12 +353,13 @@ Use the [`~Dataset.with_format`] function to set the dataset format to `torch` a >>> dataset = dataset.with_format(type="torch") >>> dataloader = torch.utils.data.DataLoader(dataset, batch_size=32) ``` + Use the [`~transformers.TFPreTrainedModel.prepare_tf_dataset`] method from 🤗 Transformers to prepare the dataset to be compatible with TensorFlow, and ready to train/fine-tune a model, as it wraps a HuggingFace [`~datasets.Dataset`] as a `tf.data.Dataset` -with collation and batching, so one can pass it directly to Keras methods like `fit()` without further modification. +with collation and batching, so one can pass it directly to Keras methods like `fit()` without further modification. ```py >>> import tensorflow as tf @@ -352,6 +370,7 @@ with collation and batching, so one can pass it directly to Keras methods like ` ... shuffle=True, ... ) ``` + diff --git a/docs/source/use_dataset.mdx b/docs/source/use_dataset.mdx index c508c184dca..1c709054905 100644 --- a/docs/source/use_dataset.mdx +++ b/docs/source/use_dataset.mdx @@ -1,6 +1,6 @@ # Preprocess -In addition to loading datasets, 🤗 Datasets other main goal is to offer a diverse set of preprocessing functions to get a dataset into an appropriate format for training with your machine learning framework. +In addition to loading datasets, 🤗 Datasets other main goal is to offer a diverse set of preprocessing functions to get a dataset into an appropriate format for training with your machine learning framework. There are many possible ways to preprocess a dataset, and it all depends on your specific dataset. Sometimes you may need to rename a column, and other times you might need to unflatten nested fields. 🤗 Datasets provides a way to do most of these things. But in nearly all preprocessing cases, depending on your dataset modality, you'll need to: @@ -20,7 +20,7 @@ Grab a dataset of your choice and follow along! 
## Tokenize text -Models cannot process raw text, so you'll need to convert the text into numbers. Tokenization provides a way to do this by dividing text into individual words called *tokens*. Tokens are finally converted to numbers. +Models cannot process raw text, so you'll need to convert the text into numbers. Tokenization provides a way to do this by dividing text into individual words called _tokens_. Tokens are finally converted to numbers. @@ -42,8 +42,8 @@ Check out the [Tokenizers](https://huggingface.co/course/chapter2/4?fw=pt) secti ```py >>> tokenizer(dataset[0]["text"]) -{'input_ids': [101, 1103, 2067, 1110, 17348, 1106, 1129, 1103, 6880, 1432, 112, 188, 1207, 107, 14255, 1389, 107, 1105, 1115, 1119, 112, 188, 1280, 1106, 1294, 170, 24194, 1256, 3407, 1190, 170, 11791, 5253, 188, 1732, 7200, 10947, 12606, 2895, 117, 179, 7766, 118, 172, 15554, 1181, 3498, 6961, 3263, 1137, 188, 1566, 7912, 14516, 6997, 119, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], +{'input_ids': [101, 1103, 2067, 1110, 17348, 1106, 1129, 1103, 6880, 1432, 112, 188, 1207, 107, 14255, 1389, 107, 1105, 1115, 1119, 112, 188, 1280, 1106, 1294, 170, 24194, 1256, 3407, 1190, 170, 11791, 5253, 188, 1732, 7200, 10947, 12606, 2895, 117, 179, 7766, 118, 172, 15554, 1181, 3498, 6961, 3263, 1137, 188, 1566, 7912, 14516, 6997, 119, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} ``` @@ -75,6 +75,7 @@ Use the [`~Dataset.set_format`] function to set the dataset format to be compati >>> dataset.format['type'] 
'torch' ``` + Use the [`~Dataset.to_tf_dataset`] function to set the dataset format to be compatible with TensorFlow. You'll also need to import a [data collator](https://huggingface.co/docs/transformers/main_classes/data_collator#transformers.DataCollatorWithPadding) from 🤗 Transformers to combine the varying sequence lengths into a single batch of equal lengths: @@ -91,6 +92,7 @@ Use the [`~Dataset.to_tf_dataset`] function to set the dataset format to be comp ... shuffle=True ... ) ``` + @@ -98,7 +100,7 @@ Use the [`~Dataset.to_tf_dataset`] function to set the dataset format to be comp ## Resample audio signals -Audio inputs like text datasets need to be divided into discrete data points. This is known as *sampling*; the sampling rate tells you how much of the speech signal is captured per second. It is important to make sure the sampling rate of your dataset matches the sampling rate of the data used to pretrain the model you're using. If the sampling rates are different, the pretrained model may perform poorly on your dataset because it doesn't recognize the differences in the sampling rate. +Audio inputs like text datasets need to be divided into discrete data points. This is known as _sampling_; the sampling rate tells you how much of the speech signal is captured per second. It is important to make sure the sampling rate of your dataset matches the sampling rate of the data used to pretrain the model you're using. If the sampling rates are different, the pretrained model may perform poorly on your dataset because it doesn't recognize the differences in the sampling rate. **1**. Start by loading the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset, the [`Audio`] feature, and the feature extractor corresponding to a pretrained [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base-960h) model: @@ -113,11 +115,11 @@ Audio inputs like text datasets need to be divided into discrete data points. Th **2**. 
Index into the first row of the dataset. When you call the `audio` column of the dataset, it is automatically decoded and resampled: ```py ->>> dataset[0]["audio"] -{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, - 0. , 0. ], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 8000} +>>> audio = dataset[0]["audio"] +>>> print(audio) + +>>> audio.get_all_samples().sample_rate +8000 ``` **3**. Reading a dataset card is incredibly useful and can give you a lot of information about the dataset. A quick look at the MInDS-14 dataset card tells you the sampling rate is 8kHz. Likewise, you can get many details about a model from its model card. The Wav2Vec2 model card says it was sampled on 16kHz speech audio. This means you'll need to upsample the MInDS-14 dataset to match the sampling rate of the model. @@ -126,18 +128,18 @@ Use the [`~Dataset.cast_column`] function and set the `sampling_rate` parameter ```py >>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000)) ->>> dataset[0]["audio"] -{'array': array([ 2.3443763e-05, 2.1729663e-04, 2.2145823e-04, ..., - 3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 16000} +>>> audio = dataset[0]["audio"] +>>> print(audio) + +>>> audio.get_all_samples().sample_rate +16000 ``` **4**. Use the [`~Dataset.map`] function to resample the entire dataset to 16kHz. This function speeds up resampling by applying the feature extractor to batches of examples instead of individual examples. Set the `batched` parameter to `True`: ```py >>> def preprocess_function(examples): -... audio_arrays = [x["array"] for x in examples["audio"]] +... 
audio_arrays = [x.get_all_samples().data for x in examples["audio"]] ... inputs = feature_extractor( ... audio_arrays, sampling_rate=feature_extractor.sampling_rate, max_length=16000, truncation=True ... ) @@ -150,7 +152,7 @@ Use the [`~Dataset.cast_column`] function and set the `sampling_rate` parameter ## Apply data augmentations -The most common preprocessing you'll do with image datasets is *data augmentation*, a process that introduces random variations to an image without changing the meaning of the data. This can mean changing the color properties of an image or randomly cropping an image. You are free to use any data augmentation library you like, and 🤗 Datasets will help you apply your data augmentations to your dataset. +The most common preprocessing you'll do with image datasets is _data augmentation_, a process that introduces random variations to an image without changing the meaning of the data. This can mean changing the color properties of an image or randomly cropping an image. You are free to use any data augmentation library you like, and 🤗 Datasets will help you apply your data augmentations to your dataset. **1**. Start by loading the [Beans](https://huggingface.co/datasets/beans) dataset, the `Image` feature, and the feature extractor corresponding to a pretrained [ViT](https://huggingface.co/google/vit-base-patch16-224-in21k) model: diff --git a/docs/source/video_load.mdx b/docs/source/video_load.mdx index e636a2e8c7b..5aad7aa0709 100644 --- a/docs/source/video_load.mdx +++ b/docs/source/video_load.mdx @@ -6,7 +6,7 @@ Video support is experimental and is subject to change. -Video datasets have [`Video`] type columns, which contain `torchvision` objects. +Video datasets have [`Video`] type columns, which contain `torchcodec` objects.
@@ -21,7 +21,7 @@ When you load a video dataset and call the video column, the videos are decoded >>> dataset = load_dataset("path/to/video/folder", split="train") >>> dataset[0]["video"] - + ``` @@ -38,43 +38,41 @@ Access frames directly from a video using the `VideoReader` using `next()`: ```python >>> video = dataset[0]["video"] ->>> first_frame = next(video) ->>> first_frame["data"].shape +>>> first_frame = video.get_frame_at(0) +>>> first_frame.data.shape (3, 240, 320) ->>> first_frame["pts"] # timestamp +>>> first_frame.pts_seconds # timestamp 0.0 ``` -To get multiple frames at once, you need to iterate on the `VideoReader`. This is the efficient way to obtain a long list of frames: +To get multiple frames at once, you can call `.get_frames_in_range(start: int, stop: int, step: int)`. This will return a frame batch. +This is the efficient way to obtain a long list of frames. Refer to the [torchcodec docs](https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.VideoDecoder.html) for more functions to efficiently access the data: ```python >>> import torch ->>> import itertools ->>> frames = torch.stack([frame["data"] for frame in islice(video, 5)]) ->>> frames.shape -(5, 3, 240, 320) +>>> frames = video.get_frames_in_range(0, 5, 1) +>>> frames.data.shape +torch.Size([5, 3, 240, 320]) ``` -There is also `.seek()` if you need to set the current timestamp of the video: +There is also `.get_frames_played_in_range(start_seconds: float, stop_seconds: float)` to access all frames played within a certain time range. ```python ->>> video.get_metadata() -{'video': {'fps': [10.0], 'duration': [16.1]}} ->>> video = video.seek(8.0, keyframes_only=True) ->>> frame = next(video) ->>> first_frame["data"].shape -(3, 240, 320) +>>> frames = video.get_frames_played_in_range(.5, 1.2) +>>> frames.data.shape +torch.Size([42, 3, 240, 320]) ``` ## Local files -You can load a dataset from the video path.
Use the [`~Dataset.cast_column`] function to accept a column of video file paths, and decode it into a `torchvision` video with the [`Video`] feature: +You can load a dataset from the video path. Use the [`~Dataset.cast_column`] function to accept a column of video file paths, and decode it into a `torchcodec` video with the [`Video`] feature: + ```py >>> from datasets import Dataset, Video >>> dataset = Dataset.from_dict({"video": ["path/to/video_1", "path/to/video_2", ..., "path/to/video_n"]}).cast_column("video", Video()) >>> dataset[0]["video"] - + ``` If you only want to load the underlying path to the video dataset without decoding the video object, set `decode=False` in the [`Video`] feature: @@ -116,14 +114,14 @@ For local datasets, this is equivalent to passing `videofolder` manually in [`lo >>> dataset = load_dataset("videofolder", data_dir="/path/to/folder") ``` -Then you can access the videos as `torchvision.io.video_reader.VideoReader` objects: +Then you can access the videos as `torchcodec.decoders._video_decoder.VideoDecoder` objects: ``` >>> dataset["train"][0] -{"video": , "label": 0} +{"video": , "label": 0} >>> dataset["train"][-1] -{"video": , "label": 1} +{"video": , "label": 1} ``` To ignore the information in the metadata file, set `drop_metadata=True` in [`load_dataset`]: diff --git a/setup.py b/setup.py index 64f266eee16..b44b3e4deca 100644 --- a/setup.py +++ b/setup.py @@ -137,8 +137,7 @@ AUDIO_REQUIRE = [ "soundfile>=0.12.1", - "librosa", - "soxr>=0.4.0", # Supports numpy-2 + "torchcodec>=0.4.0", ] VISION_REQUIRE = [ @@ -174,7 +173,6 @@ "py7zr", "rarfile>=4.0", "sqlalchemy", - "s3fs>=2021.11.1", # aligned with fsspec[http]>=2021.11.1; test only on python 3.7 for now "protobuf<4.0.0", # 4.0.0 breaks compatibility with tensorflow<2.12 "tensorflow>=2.6.0; python_version<'3.10' and sys_platform != 'win32'", # numpy-2 is not supported for Python < 3.10 "tensorflow>=2.16.0; python_version>='3.10' and sys_platform != 'win32'", # Pins numpy < 2 
@@ -185,17 +183,13 @@ "transformers>=4.42.0", # Pins numpy < 2 "zstandard", "polars[timezone]>=0.20.0", - "torchvision", - "av", + "Pillow>=9.4.0", # When PIL.Image.ExifTags was introduced + "soundfile>=0.12.1", + "torchcodec>=0.4.0; sys_platform != 'win32'", # not available for windows ] - -TESTS_REQUIRE.extend(VISION_REQUIRE) -TESTS_REQUIRE.extend(AUDIO_REQUIRE) - NUMPY2_INCOMPATIBLE_LIBRARIES = [ "faiss-cpu", - "librosa", # librosa -> numba-0.60.0 requires numpy < 2.1 (see GH-7111) "tensorflow", ] TESTS_NUMPY2_REQUIRE = [ @@ -205,8 +199,6 @@ QUALITY_REQUIRE = ["ruff>=0.3.0"] DOCS_REQUIRE = [ - # Might need to add doc-builder and some specific deps in the future - "s3fs", # Following dependencies are required for the Python reference to be built properly "transformers", "torch", @@ -224,7 +216,6 @@ "tensorflow_gpu": ["tensorflow>=2.6.0"], "torch": ["torch"], "jax": ["jax>=0.3.14", "jaxlib>=0.3.14"], - "s3": ["s3fs"], "streaming": [], # for backward compatibility "dev": TESTS_REQUIRE + QUALITY_REQUIRE + DOCS_REQUIRE, "tests": TESTS_REQUIRE, diff --git a/src/datasets/config.py b/src/datasets/config.py index 33d86209287..045ad5f92ec 100644 --- a/src/datasets/config.py +++ b/src/datasets/config.py @@ -140,6 +140,7 @@ IS_MP3_SUPPORTED = importlib.util.find_spec("soundfile") is not None and version.parse( importlib.import_module("soundfile").__libsndfile_version__ ) >= version.parse("1.1.0") +TORCHCODEC_AVAILABLE = importlib.util.find_spec("torchcodec") is not None TORCHVISION_AVAILABLE = importlib.util.find_spec("torchvision") is not None PDFPLUMBER_AVAILABLE = importlib.util.find_spec("pdfplumber") is not None diff --git a/src/datasets/features/_torchcodec.py b/src/datasets/features/_torchcodec.py new file mode 100644 index 00000000000..d71b4156a51 --- /dev/null +++ b/src/datasets/features/_torchcodec.py @@ -0,0 +1,15 @@ +import numpy as np +from torchcodec.decoders import AudioDecoder as _AudioDecoder + + +class AudioDecoder(_AudioDecoder): + def __getitem__(self, 
key: str): + if key == "array": + y = self.get_all_samples().data.cpu().numpy() + return np.mean(y, axis=tuple(range(y.ndim - 1))) if y.ndim > 1 else y + elif key == "sampling_rate": + return self.get_samples_played_in_range(0, 0).sample_rate + elif hasattr(super(), "__getitem__"): + return super().__getitem__(key) + else: + raise TypeError("'torchcodec.decoders.AudioDecoder' object is not subscriptable") diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index 0cfce71dec9..d982e329326 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -9,11 +9,13 @@ from .. import config from ..download.download_config import DownloadConfig from ..table import array_cast -from ..utils.file_utils import xopen, xsplitext +from ..utils.file_utils import is_local_path, xopen from ..utils.py_utils import no_op_if_value_is_null, string_to_dict if TYPE_CHECKING: + from torchcodec.decoders import AudioDecoder + from .features import FeatureType @@ -28,15 +30,19 @@ class Audio: - `path`: String with relative path of the audio file to the archive file. - `bytes`: Bytes content of the audio file. - This is useful for archived files with sequential access. + This is useful for parquet or webdataset files which embed audio files. - A `dict` with the keys: - - `path`: String with relative path of the audio file to the archive file. - `array`: Array containing the audio sample - `sampling_rate`: Integer corresponding to the sampling rate of the audio sample. - This is useful for archived files with sequential access. + - A `torchcodec.decoders.AudioDecoder`: torchcodec audio decoder object. + + Output: The Audio features output data as `torchcodec.decoders.AudioDecoder` objects, with additional keys: + + - `array`: Array containing the audio sample + - `sampling_rate`: Integer corresponding to the sampling rate of the audio sample. 
Args: sampling_rate (`int`, *optional*): @@ -47,24 +53,30 @@ class Audio: decode (`bool`, defaults to `True`): Whether to decode the audio data. If `False`, returns the underlying dictionary in the format `{"path": audio_path, "bytes": audio_bytes}`. + stream_index (`int`, *optional*): + The streaming index to use from the file. If `None` defaults to the "best" index. Example: ```py >>> from datasets import load_dataset, Audio >>> ds = load_dataset("PolyAI/minds14", name="en-US", split="train") - >>> ds = ds.cast_column("audio", Audio(sampling_rate=16000)) + >>> ds = ds.cast_column("audio", Audio(sampling_rate=44100)) >>> ds[0]["audio"] - {'array': array([ 2.3443763e-05, 2.1729663e-04, 2.2145823e-04, ..., - 3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 16000} + + >>> audio = ds[0]["audio"] + >>> audio.get_samples_played_in_range(0, 10) + AudioSamples: + data (shape): torch.Size([2, 110592]) + pts_seconds: 0.0 + duration_seconds: 2.507755102040816 + sample_rate: 44100 ``` """ sampling_rate: Optional[int] = None - mono: bool = True decode: bool = True + stream_index: Optional[int] = None id: Optional[str] = field(default=None, repr=False) # Automatically constructed dtype: ClassVar[str] = "dict" @@ -74,28 +86,40 @@ class Audio: def __call__(self): return self.pa_type - def encode_example(self, value: Union[str, bytes, bytearray, dict]) -> dict: + def encode_example(self, value: Union[str, bytes, bytearray, dict, "AudioDecoder"]) -> dict: """Encode example into a format for Arrow. Args: - value (`str` or `dict`): + value (`str`, `bytes`,`bytearray`,`dict`, `AudioDecoder`): Data passed as input to Audio feature. Returns: `dict` """ try: - import soundfile as sf # soundfile is a dependency of librosa, needed to decode audio files. 
+ import soundfile as sf # needed to write audio files except ImportError as err: raise ImportError("To support encoding audio data, please install 'soundfile'.") from err + + if value is None: + raise ValueError("value must be provided") + + if config.TORCHCODEC_AVAILABLE: + from torchcodec.decoders import AudioDecoder + + else: + AudioDecoder = None + if isinstance(value, str): return {"bytes": None, "path": value} elif isinstance(value, (bytes, bytearray)): return {"bytes": value, "path": None} + elif AudioDecoder is not None and isinstance(value, AudioDecoder): + return encode_torchcodec_audio(value) elif "array" in value: # convert the audio array to wav bytes buffer = BytesIO() - sf.write(buffer, value["array"], value["sampling_rate"], format="wav") + sf.write(buffer, value["array"].T, value["sampling_rate"], format="wav") return {"bytes": buffer.getvalue(), "path": None} elif value.get("path") is not None and os.path.isfile(value["path"]): # we set "bytes": None to not duplicate the data if they're already available locally @@ -125,7 +149,7 @@ def encode_example(self, value: Union[str, bytes, bytearray, dict]) -> dict: def decode_example( self, value: dict, token_per_repo_id: Optional[dict[str, Union[str, bool, None]]] = None - ) -> dict: + ) -> "AudioDecoder": """Decode example audio file into audio data. Args: @@ -140,34 +164,24 @@ def decode_example( a dictionary repo_id (`str`) -> token (`bool` or `str`) Returns: - `dict` + `torchcodec.decoders.AudioDecoder` """ + if config.TORCHCODEC_AVAILABLE: + from ._torchcodec import AudioDecoder + else: + raise ImportError("To support decoding audio data, please install 'torchcodec'.") + if not self.decode: raise RuntimeError("Decoding is disabled for this feature. 
Please use Audio(decode=True) instead.") - path, file = (value["path"], BytesIO(value["bytes"])) if value["bytes"] is not None else (value["path"], None) - if path is None and file is None: + path, bytes = (value["path"], BytesIO(value["bytes"])) if value["bytes"] is not None else (value["path"], None) + if path is None and bytes is None: raise ValueError(f"An audio sample should have one of 'path' or 'bytes' but both are None in {value}.") - try: - import librosa - import soundfile as sf - except ImportError as err: - raise ImportError("To support decoding audio files, please install 'librosa' and 'soundfile'.") from err - - audio_format = xsplitext(path)[1][1:].lower() if path is not None else None - if not config.IS_OPUS_SUPPORTED and audio_format == "opus": - raise RuntimeError( - "Decoding 'opus' files requires system library 'libsndfile'>=1.0.31, " - 'You can try to update `soundfile` python library: `pip install "soundfile>=0.12.1"`. ' - ) - elif not config.IS_MP3_SUPPORTED and audio_format == "mp3": - raise RuntimeError( - "Decoding 'mp3' files requires system library 'libsndfile'>=1.1.0, " - 'You can try to update `soundfile` python library: `pip install "soundfile>=0.12.1"`. 
' - ) + if bytes is None and is_local_path(path): + audio = AudioDecoder(path, stream_index=self.stream_index, sample_rate=self.sampling_rate) - if file is None: + elif bytes is None: token_per_repo_id = token_per_repo_id or {} source_url = path.split("::")[-1] pattern = ( @@ -177,20 +191,14 @@ def decode_example( token = token_per_repo_id.get(source_url_fields["repo_id"]) if source_url_fields is not None else None download_config = DownloadConfig(token=token) - with xopen(path, "rb", download_config=download_config) as f: - array, sampling_rate = sf.read(f) + f = xopen(path, "rb", download_config=download_config) + audio = AudioDecoder(f, stream_index=self.stream_index, sample_rate=self.sampling_rate) else: - array, sampling_rate = sf.read(file) - - array = array.T - if self.mono: - array = librosa.to_mono(array) - if self.sampling_rate and self.sampling_rate != sampling_rate: - array = librosa.resample(array, orig_sr=sampling_rate, target_sr=self.sampling_rate) - sampling_rate = self.sampling_rate - - return {"path": path, "array": array, "sampling_rate": sampling_rate} + audio = AudioDecoder(bytes, stream_index=self.stream_index, sample_rate=self.sampling_rate) + audio._hf_encoded = {"path": path, "bytes": bytes} + audio.metadata.path = path + return audio def flatten(self) -> Union["FeatureType", dict[str, "FeatureType"]]: """If in the decodable state, raise an error, otherwise flatten the feature into a dictionary.""" @@ -280,3 +288,19 @@ def path_to_bytes(path): ) storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=bytes_array.is_null()) return array_cast(storage, self.pa_type) + + +def encode_torchcodec_audio(audio: "AudioDecoder") -> dict: + if hasattr(audio, "_hf_encoded"): + return audio._hf_encoded + else: + try: + import soundfile as sf # needed to write audio files + except ImportError as err: + raise ImportError("To support encoding audio data, please install 'soundfile'.") from err + + samples = 
audio.get_all_samples() + array = samples.data.cpu().numpy() + buffer = BytesIO() + sf.write(buffer, array.T, samples.sample_rate, format="wav") + return {"bytes": buffer.getvalue(), "path": None} diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index 9ccdb3da7a9..e69947fa61b 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -302,6 +302,9 @@ def _cast_to_python_objects(obj: Any, only_1d_for_numpy: bool, optimize_list_cas if config.PDFPLUMBER_AVAILABLE and "pdfplumber" in sys.modules: import pdfplumber + if config.TORCHCODEC_AVAILABLE and "torchcodec" in sys.modules: + from torchcodec.decoders import AudioDecoder, VideoDecoder + if isinstance(obj, np.ndarray): if obj.ndim == 0: return obj[()], True @@ -438,6 +441,12 @@ def _cast_to_python_objects(obj: Any, only_1d_for_numpy: bool, optimize_list_cas return list(obj), True else: return obj, False + elif config.TORCHCODEC_AVAILABLE and "torchcodec" in sys.modules and isinstance(obj, VideoDecoder): + v = Video() + return v.encode_example(obj), True + elif config.TORCHCODEC_AVAILABLE and "torchcodec" in sys.modules and isinstance(obj, AudioDecoder): + a = Audio() + return a.encode_example(obj), True else: return obj, False diff --git a/src/datasets/features/image.py b/src/datasets/features/image.py index 79794beb12f..ad2e6bdfaec 100644 --- a/src/datasets/features/image.py +++ b/src/datasets/features/image.py @@ -53,11 +53,13 @@ class Image: - `path`: String with relative path of the image file to the archive file. - `bytes`: Bytes of the image file. - This is useful for archived files with sequential access. + This is useful for parquet or webdataset files which embed image files. - An `np.ndarray`: NumPy array representing an image. - A `PIL.Image.Image`: PIL image object. + Output: The Image features output data as `PIL.Image.Image` objects. + Args: mode (`str`, *optional*): The mode to convert the image to. 
If `None`, the native mode of the image is used. diff --git a/src/datasets/features/video.py b/src/datasets/features/video.py index 09bd4291890..6b4bdc97a6c 100644 --- a/src/datasets/features/video.py +++ b/src/datasets/features/video.py @@ -1,6 +1,6 @@ import os from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, ClassVar, Optional, TypedDict, Union +from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, TypedDict, Union import numpy as np import pyarrow as pa @@ -13,7 +13,8 @@ if TYPE_CHECKING: - from torchvision.io import VideoReader + import torch + from torchcodec.decoders import VideoDecoder from .features import FeatureType @@ -26,7 +27,7 @@ class Example(TypedDict): @dataclass class Video: """ - **Experimental.** Video [`Feature`] to read video data from a video file. + Video [`Feature`] to read video data from a video file. Input: The Video feature accepts as input: - A `str`: Absolute path to the video file (i.e. random access is allowed). @@ -35,9 +36,11 @@ class Video: - `path`: String with relative path of the video file in a dataset repository. - `bytes`: Bytes of the video file. - This is useful for archived files with sequential access. + This is useful for parquet or webdataset files which embed video files. - - A `torchvision.io.VideoReader`: torchvision video reader object. + - A `torchcodec.decoders.VideoDecoder`: torchcodec video decoder object. + + Output: The Video features output data as `torchcodec.decoders.VideoDecoder` objects. Args: mode (`str`, *optional*): @@ -45,6 +48,21 @@ class Video: decode (`bool`, defaults to `True`): Whether to decode the video data. If `False`, returns the underlying dictionary in the format `{"path": video_path, "bytes": video_bytes}`. + stream_index (`int`, *optional*): + The streaming index to use from the file. If `None` defaults to the "best" index. + dimension_order (`str`, defaults to `NCHW`): + The dimension order of the decoded frames. 
+ where N is the batch size, C is the number of channels, + H is the height, and W is the width of the frames. + num_ffmpeg_threads (`int`, defaults to `1`): + The number of threads to use for decoding the video. (Recommended to keep this at 1) + device (`str` or `torch.device`, defaults to `cpu`): + The device to use for decoding the video. + seek_mode (`str`, defaults to `exact`): + Determines if frame access will be “exact” or “approximate”. + Exact guarantees that requesting frame i will always return frame i, but doing so requires an initial scan of the file. + Approximate is faster as it avoids scanning the file, but less accurate as it uses the file's metadata to calculate where i probably is. + read more [here](https://docs.pytorch.org/torchcodec/stable/generated_examples/approximate_mode.html#sphx-glr-generated-examples-approximate-mode-py) Examples: @@ -54,38 +72,53 @@ class Video: >>> ds.features["video"] Video(decode=True, id=None) >>> ds[0]["video"] - - >>> ds = ds.cast_column('video', Video(decode=False)) + + >>> video = ds[0]["video"] + >>> video.get_frames_in_range(0, 10) + FrameBatch: + data (shape): torch.Size([10, 3, 50, 66]) + pts_seconds: tensor([0.4333, 0.4333, 0.4333, 0.4333, 0.4333, 0.4333, 0.4333, 0.4333, 0.4333, + 0.4333], dtype=torch.float64) + duration_seconds: tensor([0.0167, 0.0167, 0.0167, 0.0167, 0.0167, 0.0167, 0.0167, 0.0167, 0.0167, + 0.0167], dtype=torch.float64) + >>> ds.cast_column('video', Video(decode=False))[0]["video"] {'bytes': None, 'path': 'path/to/Screen Recording.mov'} ``` """ decode: bool = True + stream_index: Optional[int] = None + dimension_order: Literal["NCHW", "NHWC"] = "NCHW" + num_ffmpeg_threads: int = 1 + device: Optional[Union[str, "torch.device"]] = "cpu" + seek_mode: Literal["exact", "approximate"] = "exact" id: Optional[str] = field(default=None, repr=False) # Automatically constructed - dtype: ClassVar[str] = "torchvision.io.VideoReader" + dtype: ClassVar[str] = "torchcodec.decoders.VideoDecoder" pa_type:
ClassVar[Any] = pa.struct({"bytes": pa.binary(), "path": pa.string()}) _type: str = field(default="Video", init=False, repr=False) def __call__(self): return self.pa_type - def encode_example(self, value: Union[str, bytes, bytearray, Example, np.ndarray, "VideoReader"]) -> Example: + def encode_example(self, value: Union[str, bytes, bytearray, Example, np.ndarray, "VideoDecoder"]) -> Example: """Encode example into a format for Arrow. Args: - value (`str`, `np.ndarray`, `VideoReader` or `dict`): + value (`str`, `np.ndarray`, `bytes`, `bytearray`, `VideoDecoder` or `dict`): Data passed as input to Video feature. Returns: `dict` with "path" and "bytes" fields """ - if config.TORCHVISION_AVAILABLE: - from torchvision.io import VideoReader + if value is None: + raise ValueError("value must be provided") + if config.TORCHCODEC_AVAILABLE: + from torchcodec.decoders import VideoDecoder else: - VideoReader = None + VideoDecoder = None if isinstance(value, list): value = np.array(value) @@ -97,9 +130,9 @@ def encode_example(self, value: Union[str, bytes, bytearray, Example, np.ndarray elif isinstance(value, np.ndarray): # convert the video array to bytes return encode_np_array(value) - elif VideoReader is not None and isinstance(value, VideoReader): - # convert the torchvision video reader to bytes - return encode_torchvision_video(value) + elif VideoDecoder is not None and isinstance(value, VideoDecoder): + # convert the torchcodec video decoder to bytes + return encode_torchcodec_video(value) elif isinstance(value, dict): path, bytes_ = value.get("path"), value.get("bytes") if path is not None and os.path.isfile(path): @@ -119,7 +152,7 @@ def decode_example( self, value: Union[str, Example], token_per_repo_id: Optional[dict[str, Union[bool, str]]] = None, - ) -> "VideoReader": + ) -> "VideoDecoder": """Decode example video file into video data. Args: @@ -135,16 +168,16 @@ def decode_example( a dictionary repo_id (`str`) -> token (`bool` or `str`). 
Returns: - `torchvision.io.VideoReader` + `torchcodec.decoders.VideoDecoder` """ if not self.decode: raise RuntimeError("Decoding is disabled for this feature. Please use Video(decode=True) instead.") - if config.TORCHVISION_AVAILABLE: - from torchvision.io import VideoReader + if config.TORCHCODEC_AVAILABLE: + from torchcodec.decoders import VideoDecoder else: - raise ImportError("To support decoding videos, please install 'torchvision'.") + raise ImportError("To support decoding videos, please install 'torchcodec'.") if token_per_repo_id is None: token_per_repo_id = {} @@ -158,12 +191,34 @@ def decode_example( if path is None: raise ValueError(f"A video should have one of 'path' or 'bytes' but both are None in {value}.") elif is_local_path(path): - video = VideoReader(path) + video = VideoDecoder( + path, + stream_index=self.stream_index, + dimension_order=self.dimension_order, + num_ffmpeg_threads=self.num_ffmpeg_threads, + device=self.device, + seek_mode=self.seek_mode, + ) else: - video = hf_video_reader(path, token_per_repo_id=token_per_repo_id) + video = hf_video_reader( + path, + token_per_repo_id=token_per_repo_id, + dimension_order=self.dimension_order, + num_ffmpeg_threads=self.num_ffmpeg_threads, + device=self.device, + seek_mode=self.seek_mode, + ) else: - video = VideoReader(bytes_) + video = VideoDecoder( + bytes_, + stream_index=self.stream_index, + dimension_order=self.dimension_order, + num_ffmpeg_threads=self.num_ffmpeg_threads, + device=self.device, + seek_mode=self.seek_mode, + ) video._hf_encoded = {"path": path, "bytes": bytes_} + video.metadata.path = path return video def flatten(self) -> Union["FeatureType", dict[str, "FeatureType"]]: @@ -226,17 +281,17 @@ def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.ListArr return array_cast(storage, self.pa_type) -def video_to_bytes(video: "VideoReader") -> bytes: - """Convert a torchvision Video object to bytes using native compression if possible""" +def 
video_to_bytes(video: "VideoDecoder") -> bytes: + """Convert a torchcodec Video object to bytes using native compression if possible""" raise NotImplementedError() -def encode_torchvision_video(video: "VideoReader") -> Example: +def encode_torchcodec_video(video: "VideoDecoder") -> Example: if hasattr(video, "_hf_encoded"): return video._hf_encoded else: raise NotImplementedError( - "Encoding a VideoReader that doesn't come from datasets.Video.decode() is not implemented" + "Encoding a VideoDecoder that doesn't come from datasets.Video.decode() is not implemented" ) @@ -244,18 +299,21 @@ def encode_np_array(array: np.ndarray) -> Example: raise NotImplementedError() -# Patching torchvision a little bit to: +# No monkey patch needed! # 1. store the encoded video data {"path": ..., "bytes": ...} in `video._hf_encoded`` # 2. add support for hf:// files -# This doesn't affect the normal usage of torchvision. def hf_video_reader( - path: str, token_per_repo_id: Optional[dict[str, Union[bool, str]]] = None, stream: str = "video" -) -> "VideoReader": - import av - from torchvision import get_video_backend - from torchvision.io import VideoReader + path: str, + token_per_repo_id: Optional[dict[str, Union[bool, str]]] = None, + stream: str = "video", + dimension_order: Literal["NCHW", "NHWC"] = "NCHW", + num_ffmpeg_threads: int = 1, + device: Optional[Union[str, "torch.device"]] = "cpu", + seek_mode: Literal["exact", "approximate"] = "exact", +) -> "VideoDecoder": + from torchcodec.decoders import VideoDecoder # Load the file from HF if token_per_repo_id is None: @@ -267,14 +325,14 @@ def hf_video_reader( download_config = DownloadConfig(token=token) f = xopen(path, "rb", download_config=download_config) - # Instantiate the VideoReader - vr = object.__new__(VideoReader) - vr.backend = get_video_backend() - if vr.backend != "pyav": - raise RuntimeError(f"Unsupported video backend for VideoReader from HF files: {vr.backend}") - vr.container = av.open(f, 
metadata_errors="ignore") - stream_type = stream.split(":")[0] + # Instantiate the VideoDecoder stream_id = 0 if len(stream.split(":")) == 1 else int(stream.split(":")[1]) - vr.pyav_stream = {stream_type: stream_id} - vr._c = vr.container.decode(**vr.pyav_stream) - return vr + vd = VideoDecoder( + f, + stream_index=stream_id, + dimension_order=dimension_order, + num_ffmpeg_threads=num_ffmpeg_threads, + device=device, + seek_mode=seek_mode, + ) + return vd diff --git a/src/datasets/formatting/jax_formatter.py b/src/datasets/formatting/jax_formatter.py index ad414279ac8..c52ef7a4d59 100644 --- a/src/datasets/formatting/jax_formatter.py +++ b/src/datasets/formatting/jax_formatter.py @@ -111,6 +111,11 @@ def _tensorize(self, value): if isinstance(value, VideoReader): return value # TODO(QL): set output to jax arrays ? + if config.TORCHCODEC_AVAILABLE and "torchcodec" in sys.modules: + from torchcodec.decoders import AudioDecoder, VideoDecoder + + if isinstance(value, (VideoDecoder, AudioDecoder)): + return value # TODO(QL): set output to jax arrays ? # using global variable since `jaxlib.xla_extension.Device` is not serializable neither # with `pickle` nor with `dill`, so we need to use a global variable instead diff --git a/src/datasets/formatting/np_formatter.py b/src/datasets/formatting/np_formatter.py index c12ecd4f386..062d199c6f6 100644 --- a/src/datasets/formatting/np_formatter.py +++ b/src/datasets/formatting/np_formatter.py @@ -68,6 +68,11 @@ def _tensorize(self, value): if isinstance(value, VideoReader): return value # TODO(QL): set output to np arrays ? + if config.TORCHCODEC_AVAILABLE and "torchcodec" in sys.modules: + from torchcodec.decoders import AudioDecoder, VideoDecoder + + if isinstance(value, (VideoDecoder, AudioDecoder)): + return value # TODO(QL): set output to np arrays ? 
return np.asarray(value, **{**default_dtype, **self.np_array_kwargs}) diff --git a/src/datasets/formatting/tf_formatter.py b/src/datasets/formatting/tf_formatter.py index 964bc4b5589..1a20eb31d1d 100644 --- a/src/datasets/formatting/tf_formatter.py +++ b/src/datasets/formatting/tf_formatter.py @@ -75,6 +75,11 @@ def _tensorize(self, value): if isinstance(value, VideoReader): return value # TODO(QL): set output to tf tensors ? + if config.TORCHCODEC_AVAILABLE and "torchcodec" in sys.modules: + from torchcodec.decoders import AudioDecoder, VideoDecoder + + if isinstance(value, (VideoDecoder, AudioDecoder)): + return value # TODO(QL): set output to jax arrays ? return tf.convert_to_tensor(value, **{**default_dtype, **self.tf_tensor_kwargs}) diff --git a/src/datasets/formatting/torch_formatter.py b/src/datasets/formatting/torch_formatter.py index cfc3fd9abe8..3501f9368be 100644 --- a/src/datasets/formatting/torch_formatter.py +++ b/src/datasets/formatting/torch_formatter.py @@ -81,6 +81,11 @@ def _tensorize(self, value): if isinstance(value, VideoReader): return value # TODO(QL): set output to torch tensors ? + if config.TORCHCODEC_AVAILABLE and "torchcodec" in sys.modules: + from torchcodec.decoders import AudioDecoder, VideoDecoder + + if isinstance(value, (VideoDecoder, AudioDecoder)): + return value # TODO(QL): set output to jax arrays ? 
return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs}) diff --git a/src/datasets/packaged_modules/audiofolder/audiofolder.py b/src/datasets/packaged_modules/audiofolder/audiofolder.py index 12ed9efdf04..96f5e9d3c8a 100644 --- a/src/datasets/packaged_modules/audiofolder/audiofolder.py +++ b/src/datasets/packaged_modules/audiofolder/audiofolder.py @@ -63,5 +63,23 @@ class AudioFolder(folder_based_builder.FolderBasedBuilder): ".xi", ".mp3", ".opus", + ".3gp", + ".3g2", + ".avi", + ".asf", + ".flv", + ".mp4", + ".mov", + ".m4v", + ".mkv", + ".mpg", + ".webm", + ".f4v", + ".wmv", + ".wma", + ".ogg", + ".ogm", + ".mxf", + ".nut", ] AudioFolder.EXTENSIONS = AUDIO_EXTENSIONS diff --git a/src/datasets/packaged_modules/json/README.md b/src/datasets/packaged_modules/json/README.md deleted file mode 100644 index a07cb902a4f..00000000000 --- a/src/datasets/packaged_modules/json/README.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -dataset_info: - features: - - name: tokens - list: string - - name: ner_tags - list: - class_label: - names: - '0': O - '1': B-PER - '2': I-PER - '3': B-ORG - '4': I-ORG - '5': B-LOC - '6': I-LOC - - name: langs - list: string - - name: spans - list: string - splits: - - name: train - num_bytes: 2351563 - num_examples: 10000 - - name: validation - num_bytes: 238418 - num_examples: 1000 - download_size: 3940680 - dataset_size: 2589981 ---- diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index 547c16e27e1..38999e64b4e 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -1,16 +1,15 @@ import os import tarfile +from itertools import product +import numpy as np import pyarrow as pa import pytest from datasets import Column, Dataset, concatenate_datasets, load_dataset from datasets.features import Audio, Features, Sequence, Value -from ..utils import ( - require_librosa, - require_sndfile, -) +from ..utils import require_sndfile, require_torchcodec @pytest.fixture() @@ -42,8 +41,9 @@ def 
iter_archive(archive_path): def test_audio_instantiation(): audio = Audio() assert audio.sampling_rate is None - assert audio.mono is True assert audio.id is None + assert audio.stream_index is None + assert audio.dtype == "dict" assert audio.pa_type == pa.struct({"bytes": pa.binary(), "path": pa.string()}) assert audio._type == "Audio" @@ -58,7 +58,8 @@ def test_audio_feature_type_to_arrow(): assert features.arrow_schema == pa.schema({"sequence_of_audios": pa.list_(Audio().pa_type)}) -@require_librosa +@require_torchcodec +@require_sndfile @pytest.mark.parametrize( "build_example", [ @@ -69,10 +70,12 @@ def test_audio_feature_type_to_arrow(): lambda audio_path: {"path": audio_path, "bytes": open(audio_path, "rb").read()}, lambda audio_path: {"path": None, "bytes": open(audio_path, "rb").read()}, lambda audio_path: {"bytes": open(audio_path, "rb").read()}, - lambda audio_path: {"array": [0.1, 0.2, 0.3], "sampling_rate": 16_000}, + lambda audio_path: {"array": np.array([0.1, 0.2, 0.3]), "sampling_rate": 16_000}, ], ) def test_audio_feature_encode_example(shared_datadir, build_example): + from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.wav") audio = Audio() encoded_example = audio.encode_example(build_example(audio_path)) @@ -80,20 +83,23 @@ def test_audio_feature_encode_example(shared_datadir, build_example): assert encoded_example.keys() == {"bytes", "path"} assert encoded_example["bytes"] is not None or encoded_example["path"] is not None decoded_example = audio.decode_example(encoded_example) - assert decoded_example.keys() == {"path", "array", "sampling_rate"} + assert isinstance(decoded_example, AudioDecoder) -@require_librosa +@require_torchcodec +@require_sndfile @pytest.mark.parametrize( "build_example", [ lambda audio_path: {"path": audio_path, "sampling_rate": 16_000}, lambda audio_path: {"path": audio_path, "bytes": None, "sampling_rate": 16_000}, lambda audio_path: {"path": audio_path, "bytes": 
open(audio_path, "rb").read(), "sampling_rate": 16_000}, - lambda audio_path: {"array": [0.1, 0.2, 0.3], "sampling_rate": 16_000}, + lambda audio_path: {"array": np.array([0.1, 0.2, 0.3]), "sampling_rate": 16_000}, ], ) def test_audio_feature_encode_example_pcm(shared_datadir, build_example): + from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_16000.pcm") audio = Audio(sampling_rate=16_000) encoded_example = audio.encode_example(build_example(audio_path)) @@ -101,124 +107,189 @@ def test_audio_feature_encode_example_pcm(shared_datadir, build_example): assert encoded_example.keys() == {"bytes", "path"} assert encoded_example["bytes"] is not None or encoded_example["path"] is not None decoded_example = audio.decode_example(encoded_example) - assert decoded_example.keys() == {"path", "array", "sampling_rate"} + assert isinstance(decoded_example, AudioDecoder) + +sample_rates = [16_000, 48_000] -@require_librosa + +@require_torchcodec +@require_sndfile +@pytest.mark.parametrize( + "in_sample_rate,out_sample_rate", + list(product(sample_rates, sample_rates)), +) +def test_audio_feature_encode_example_audiodecoder(shared_datadir, in_sample_rate, out_sample_rate): + from torchcodec.decoders import AudioDecoder + + audio_path = str(shared_datadir / "test_audio_44100.wav") + audio = Audio(sampling_rate=out_sample_rate) + example = AudioDecoder(audio_path, sample_rate=in_sample_rate) + encoded_example = audio.encode_example(example) + assert isinstance(encoded_example, dict) + assert encoded_example.keys() == {"bytes", "path"} + assert encoded_example["bytes"] is not None or encoded_example["path"] is not None + decoded_example = audio.decode_example(encoded_example) + assert isinstance(decoded_example, AudioDecoder) + + +@require_torchcodec @require_sndfile def test_audio_decode_example(shared_datadir): + from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.wav") audio = Audio() 
decoded_example = audio.decode_example(audio.encode_example(audio_path)) - assert decoded_example.keys() == {"path", "array", "sampling_rate"} - assert decoded_example["path"] == audio_path - assert decoded_example["array"].shape == (202311,) - assert decoded_example["sampling_rate"] == 44100 + assert isinstance(decoded_example, AudioDecoder) + samples = decoded_example.get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (2, 202311) with pytest.raises(RuntimeError): Audio(decode=False).decode_example(audio_path) -@require_librosa +@require_torchcodec @require_sndfile def test_audio_resampling(shared_datadir): + from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.wav") audio = Audio(sampling_rate=16000) decoded_example = audio.decode_example(audio.encode_example(audio_path)) - assert decoded_example.keys() == {"path", "array", "sampling_rate"} - assert decoded_example["path"] == audio_path - assert decoded_example["array"].shape == (73401,) - assert decoded_example["sampling_rate"] == 16000 + assert isinstance(decoded_example, AudioDecoder) + samples = decoded_example.get_all_samples() + assert samples.sample_rate == 16000 + assert samples.data.shape == (2, 73401) -@require_librosa +@require_torchcodec @require_sndfile def test_audio_decode_example_mp3(shared_datadir): + from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.mp3") audio = Audio() decoded_example = audio.decode_example(audio.encode_example(audio_path)) - assert decoded_example.keys() == {"path", "array", "sampling_rate"} - assert decoded_example["path"] == audio_path - assert decoded_example["array"].shape == (110592,) - assert decoded_example["sampling_rate"] == 44100 + print("decoded_example", decoded_example) + assert isinstance(decoded_example, AudioDecoder) + samples = decoded_example.get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == 
(2, 110592) -@require_librosa +@require_torchcodec @require_sndfile def test_audio_decode_example_opus(shared_datadir): + from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_48000.opus") audio = Audio() decoded_example = audio.decode_example(audio.encode_example(audio_path)) - assert decoded_example.keys() == {"path", "array", "sampling_rate"} - assert decoded_example["path"] == audio_path - assert decoded_example["array"].shape == (48000,) - assert decoded_example["sampling_rate"] == 48000 + assert isinstance(decoded_example, AudioDecoder) + samples = decoded_example.get_all_samples() + assert samples.sample_rate == 48000 + assert samples.data.shape == (1, 48000) -@require_librosa +@require_torchcodec +@require_sndfile @pytest.mark.parametrize("sampling_rate", [16_000, 48_000]) def test_audio_decode_example_pcm(shared_datadir, sampling_rate): + from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_16000.pcm") audio_input = {"path": audio_path, "sampling_rate": 16_000} audio = Audio(sampling_rate=sampling_rate) decoded_example = audio.decode_example(audio.encode_example(audio_input)) - assert decoded_example.keys() == {"path", "array", "sampling_rate"} - assert decoded_example["path"] is None - assert decoded_example["array"].shape == (16208 * sampling_rate // 16_000,) - assert decoded_example["sampling_rate"] == sampling_rate + assert isinstance(decoded_example, AudioDecoder) + samples = decoded_example.get_all_samples() + assert samples.sample_rate == sampling_rate + assert samples.data.shape == (1, 16208 * sampling_rate // 16_000) -@require_librosa +@require_torchcodec @require_sndfile def test_audio_resampling_mp3_different_sampling_rates(shared_datadir): + from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.mp3") audio_path2 = str(shared_datadir / "test_audio_16000.mp3") audio = Audio(sampling_rate=48000) decoded_example = 
audio.decode_example(audio.encode_example(audio_path)) - assert decoded_example.keys() == {"path", "array", "sampling_rate"} - assert decoded_example["path"] == audio_path - assert decoded_example["array"].shape == (120373,) - assert decoded_example["sampling_rate"] == 48000 + assert isinstance(decoded_example, AudioDecoder) + samples = decoded_example.get_all_samples() + assert samples.sample_rate == 48000 + assert samples.data.shape == (2, 120373) decoded_example = audio.decode_example(audio.encode_example(audio_path2)) - assert decoded_example.keys() == {"path", "array", "sampling_rate"} - assert decoded_example["path"] == audio_path2 - assert decoded_example["array"].shape == (122688,) - assert decoded_example["sampling_rate"] == 48000 + assert isinstance(decoded_example, AudioDecoder) + samples = decoded_example.get_all_samples() + assert samples.sample_rate == 48000 + assert samples.data.shape == (2, 122688) -@require_librosa +@require_torchcodec +@require_sndfile +def test_backwards_compatibility(shared_datadir): + from torchcodec.decoders import AudioDecoder + + audio_path = str(shared_datadir / "test_audio_44100.mp3") + audio_path2 = str(shared_datadir / "test_audio_16000.mp3") + audio = Audio(sampling_rate=48000) + + decoded_example = audio.decode_example(audio.encode_example(audio_path)) + assert isinstance(decoded_example, AudioDecoder) + samples = decoded_example.get_all_samples() + assert decoded_example["sampling_rate"] == samples.sample_rate + assert decoded_example["array"].ndim == 1 # mono + assert abs(decoded_example["array"].shape[0] - samples.data.shape[1]) < 2 # can have off by one error + + decoded_example = audio.decode_example(audio.encode_example(audio_path2)) + assert isinstance(decoded_example, AudioDecoder) + samples = decoded_example.get_all_samples() + assert decoded_example["sampling_rate"] == samples.sample_rate + assert decoded_example["array"].ndim == 1 # mono + assert abs(decoded_example["array"].shape[0] - samples.data.shape[1]) 
< 2 # can have off by one error + + +@require_torchcodec @require_sndfile def test_dataset_with_audio_feature(shared_datadir): + from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.wav") data = {"audio": [audio_path]} features = Features({"audio": Audio()}) dset = Dataset.from_dict(data, features=features) item = dset[0] assert item.keys() == {"audio"} - assert item["audio"].keys() == {"path", "array", "sampling_rate"} - assert item["audio"]["path"] == audio_path - assert item["audio"]["array"].shape == (202311,) - assert item["audio"]["sampling_rate"] == 44100 + assert isinstance(item["audio"], AudioDecoder) + samples = item["audio"].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (2, 202311) batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 - assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"} - assert batch["audio"][0]["path"] == audio_path - assert batch["audio"][0]["array"].shape == (202311,) - assert batch["audio"][0]["sampling_rate"] == 44100 + assert isinstance(batch["audio"][0], AudioDecoder) + samples = batch["audio"][0].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (2, 202311) column = dset["audio"] assert len(column) == 1 - assert column[0].keys() == {"path", "array", "sampling_rate"} - assert column[0]["path"] == audio_path - assert column[0]["array"].shape == (202311,) - assert column[0]["sampling_rate"] == 44100 + assert isinstance(column[0], AudioDecoder) + samples = column[0].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (2, 202311) -@require_librosa +@require_torchcodec @require_sndfile def test_dataset_with_audio_feature_tar_wav(tar_wav_path): + from torchcodec.decoders import AudioDecoder + audio_filename = "test_audio_44100.wav" data = {"audio": []} for file_path, file_obj in iter_archive(tar_wav_path): @@ -228,28 +299,32 @@ def 
test_dataset_with_audio_feature_tar_wav(tar_wav_path): dset = Dataset.from_dict(data, features=features) item = dset[0] assert item.keys() == {"audio"} - assert item["audio"].keys() == {"path", "array", "sampling_rate"} - assert item["audio"]["path"] == audio_filename - assert item["audio"]["array"].shape == (202311,) - assert item["audio"]["sampling_rate"] == 44100 + assert isinstance(item["audio"], AudioDecoder) + samples = item["audio"].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (2, 202311) + assert item["audio"].metadata.path == audio_filename batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 - assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"} - assert batch["audio"][0]["path"] == audio_filename - assert batch["audio"][0]["array"].shape == (202311,) - assert batch["audio"][0]["sampling_rate"] == 44100 + assert isinstance(batch["audio"][0], AudioDecoder) + samples = batch["audio"][0].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (2, 202311) + assert batch["audio"][0].metadata.path == audio_filename column = dset["audio"] assert len(column) == 1 - assert column[0].keys() == {"path", "array", "sampling_rate"} - assert column[0]["path"] == audio_filename - assert column[0]["array"].shape == (202311,) - assert column[0]["sampling_rate"] == 44100 + assert isinstance(column[0], AudioDecoder) + samples = column[0].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (2, 202311) -@require_librosa +@require_torchcodec @require_sndfile def test_dataset_with_audio_feature_tar_mp3(tar_mp3_path): + from torchcodec.decoders import AudioDecoder + audio_filename = "test_audio_44100.mp3" data = {"audio": []} for file_path, file_obj in iter_archive(tar_mp3_path): @@ -259,25 +334,28 @@ def test_dataset_with_audio_feature_tar_mp3(tar_mp3_path): dset = Dataset.from_dict(data, features=features) item = dset[0] 
assert item.keys() == {"audio"} - assert item["audio"].keys() == {"path", "array", "sampling_rate"} - assert item["audio"]["path"] == audio_filename - assert item["audio"]["array"].shape == (110592,) - assert item["audio"]["sampling_rate"] == 44100 + assert isinstance(item["audio"], AudioDecoder) + samples = item["audio"].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (2, 110592) + assert item["audio"].metadata.path == audio_filename batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 - assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"} - assert batch["audio"][0]["path"] == audio_filename - assert batch["audio"][0]["array"].shape == (110592,) - assert batch["audio"][0]["sampling_rate"] == 44100 + assert isinstance(batch["audio"][0], AudioDecoder) + samples = batch["audio"][0].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (2, 110592) + assert batch["audio"][0].metadata.path == audio_filename column = dset["audio"] assert len(column) == 1 - assert column[0].keys() == {"path", "array", "sampling_rate"} - assert column[0]["path"] == audio_filename - assert column[0]["array"].shape == (110592,) - assert column[0]["sampling_rate"] == 44100 + assert isinstance(column[0], AudioDecoder) + samples = column[0].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (2, 110592) +@require_torchcodec @require_sndfile def test_dataset_with_audio_feature_with_none(): data = {"audio": [None]} @@ -312,125 +390,135 @@ def test_dataset_with_audio_feature_with_none(): assert item["nested"]["audio"] is None -@require_librosa +@require_torchcodec @require_sndfile def test_resampling_at_loading_dataset_with_audio_feature(shared_datadir): + from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.wav") data = {"audio": [audio_path]} features = Features({"audio": Audio(sampling_rate=16000)}) 
dset = Dataset.from_dict(data, features=features) item = dset[0] assert item.keys() == {"audio"} - assert item["audio"].keys() == {"path", "array", "sampling_rate"} - assert item["audio"]["path"] == audio_path - assert item["audio"]["array"].shape == (73401,) - assert item["audio"]["sampling_rate"] == 16000 + assert isinstance(item["audio"], AudioDecoder) + samples = item["audio"].get_all_samples() + assert samples.sample_rate == 16000 + assert samples.data.shape == (2, 73401) batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 - assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"} - assert batch["audio"][0]["path"] == audio_path - assert batch["audio"][0]["array"].shape == (73401,) - assert batch["audio"][0]["sampling_rate"] == 16000 + assert isinstance(batch["audio"][0], AudioDecoder) + samples = batch["audio"][0].get_all_samples() + assert samples.sample_rate == 16000 + assert samples.data.shape == (2, 73401) column = dset["audio"] assert len(column) == 1 - assert column[0].keys() == {"path", "array", "sampling_rate"} - assert column[0]["path"] == audio_path - assert column[0]["array"].shape == (73401,) - assert column[0]["sampling_rate"] == 16000 + assert isinstance(column[0], AudioDecoder) + samples = column[0].get_all_samples() + assert samples.sample_rate == 16000 + assert samples.data.shape == (2, 73401) -@require_librosa +@require_torchcodec @require_sndfile def test_resampling_at_loading_dataset_with_audio_feature_mp3(shared_datadir): + from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.mp3") data = {"audio": [audio_path]} features = Features({"audio": Audio(sampling_rate=16000)}) dset = Dataset.from_dict(data, features=features) item = dset[0] assert item.keys() == {"audio"} - assert item["audio"].keys() == {"path", "array", "sampling_rate"} - assert item["audio"]["path"] == audio_path - assert item["audio"]["array"].shape == (40125,) - assert 
item["audio"]["sampling_rate"] == 16000 + assert isinstance(item["audio"], AudioDecoder) + samples = item["audio"].get_all_samples() + assert samples.sample_rate == 16000 + assert samples.data.shape == (2, 40124) batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 - assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"} - assert batch["audio"][0]["path"] == audio_path - assert batch["audio"][0]["array"].shape == (40125,) - assert batch["audio"][0]["sampling_rate"] == 16000 + assert isinstance(batch["audio"][0], AudioDecoder) + samples = batch["audio"][0].get_all_samples() + assert samples.sample_rate == 16000 + assert samples.data.shape == (2, 40124) column = dset["audio"] assert len(column) == 1 - assert column[0].keys() == {"path", "array", "sampling_rate"} - assert column[0]["path"] == audio_path - assert column[0]["array"].shape == (40125,) - assert column[0]["sampling_rate"] == 16000 + assert isinstance(column[0], AudioDecoder) + samples = column[0].get_all_samples() + assert samples.sample_rate == 16000 + assert samples.data.shape == (2, 40124) -@require_librosa +@require_torchcodec @require_sndfile def test_resampling_after_loading_dataset_with_audio_feature(shared_datadir): + from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.wav") data = {"audio": [audio_path]} features = Features({"audio": Audio()}) dset = Dataset.from_dict(data, features=features) item = dset[0] - assert item["audio"]["sampling_rate"] == 44100 + samples = item["audio"].get_all_samples() + assert samples.sample_rate == 44100 dset = dset.cast_column("audio", Audio(sampling_rate=16000)) item = dset[0] assert item.keys() == {"audio"} - assert item["audio"].keys() == {"path", "array", "sampling_rate"} - assert item["audio"]["path"] == audio_path - assert item["audio"]["array"].shape == (73401,) - assert item["audio"]["sampling_rate"] == 16000 + assert isinstance(item["audio"], AudioDecoder) + samples 
= item["audio"].get_all_samples() + assert samples.sample_rate == 16000 + assert samples.data.shape == (2, 73401) batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 - assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"} - assert batch["audio"][0]["path"] == audio_path - assert batch["audio"][0]["array"].shape == (73401,) - assert batch["audio"][0]["sampling_rate"] == 16000 + assert isinstance(batch["audio"][0], AudioDecoder) + samples = batch["audio"][0].get_all_samples() + assert samples.sample_rate == 16000 + assert samples.data.shape == (2, 73401) column = dset["audio"] assert len(column) == 1 - assert column[0].keys() == {"path", "array", "sampling_rate"} - assert column[0]["path"] == audio_path - assert column[0]["array"].shape == (73401,) - assert column[0]["sampling_rate"] == 16000 + assert isinstance(column[0], AudioDecoder) + samples = column[0].get_all_samples() + assert samples.sample_rate == 16000 + assert samples.data.shape == (2, 73401) -@require_librosa +@require_torchcodec @require_sndfile def test_resampling_after_loading_dataset_with_audio_feature_mp3(shared_datadir): + from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.mp3") data = {"audio": [audio_path]} features = Features({"audio": Audio()}) dset = Dataset.from_dict(data, features=features) item = dset[0] - assert item["audio"]["sampling_rate"] == 44100 + samples = item["audio"].get_all_samples() + assert samples.sample_rate == 44100 dset = dset.cast_column("audio", Audio(sampling_rate=16000)) item = dset[0] assert item.keys() == {"audio"} - assert item["audio"].keys() == {"path", "array", "sampling_rate"} - assert item["audio"]["path"] == audio_path - assert item["audio"]["array"].shape == (40125,) - assert item["audio"]["sampling_rate"] == 16000 + assert isinstance(item["audio"], AudioDecoder) + samples = item["audio"].get_all_samples() + assert samples.sample_rate == 16000 + assert 
samples.data.shape == (2, 40124) batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 - assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"} - assert batch["audio"][0]["path"] == audio_path - assert batch["audio"][0]["array"].shape == (40125,) - assert batch["audio"][0]["sampling_rate"] == 16000 + assert isinstance(batch["audio"][0], AudioDecoder) + samples = batch["audio"][0].get_all_samples() + assert samples.sample_rate == 16000 + assert samples.data.shape == (2, 40124) column = dset["audio"] assert len(column) == 1 - assert column[0].keys() == {"path", "array", "sampling_rate"} - assert column[0]["path"] == audio_path - assert column[0]["array"].shape == (40125,) - assert column[0]["sampling_rate"] == 16000 + assert isinstance(column[0], AudioDecoder) + samples = column[0].get_all_samples() + assert samples.sample_rate == 16000 + assert samples.data.shape == (2, 40124) -@require_librosa +@require_torchcodec @pytest.mark.parametrize( "build_data", [ @@ -444,18 +532,21 @@ def test_resampling_after_loading_dataset_with_audio_feature_mp3(shared_datadir) ], ) def test_dataset_cast_to_audio_features(shared_datadir, build_data): + from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.wav") data = build_data(audio_path) dset = Dataset.from_dict(data) item = dset.cast(Features({"audio": Audio()}))[0] assert item.keys() == {"audio"} - assert item["audio"].keys() == {"path", "array", "sampling_rate"} + assert isinstance(item["audio"], AudioDecoder) item = dset.cast_column("audio", Audio())[0] assert item.keys() == {"audio"} - assert item["audio"].keys() == {"path", "array", "sampling_rate"} + assert isinstance(item["audio"], AudioDecoder) -@require_librosa +@require_torchcodec +@require_sndfile def test_dataset_concatenate_audio_features(shared_datadir): # we use a different data structure between 1 and 2 to make sure they are compatible with each other audio_path = 
str(shared_datadir / "test_audio_44100.wav") @@ -465,11 +556,16 @@ def test_dataset_concatenate_audio_features(shared_datadir): dset2 = Dataset.from_dict(data2, features=Features({"audio": Audio()})) concatenated_dataset = concatenate_datasets([dset1, dset2]) assert len(concatenated_dataset) == len(dset1) + len(dset2) - assert concatenated_dataset[0]["audio"]["array"].shape == dset1[0]["audio"]["array"].shape - assert concatenated_dataset[1]["audio"]["array"].shape == dset2[0]["audio"]["array"].shape + assert ( + concatenated_dataset[0]["audio"].get_all_samples().data.shape == dset1[0]["audio"].get_all_samples().data.shape + ) + assert ( + concatenated_dataset[1]["audio"].get_all_samples().data.shape == dset2[0]["audio"].get_all_samples().data.shape + ) -@require_librosa +@require_torchcodec +@require_sndfile def test_dataset_concatenate_nested_audio_features(shared_datadir): # we use a different data structure between 1 and 2 to make sure they are compatible with each other audio_path = str(shared_datadir / "test_audio_44100.wav") @@ -481,16 +577,17 @@ def test_dataset_concatenate_nested_audio_features(shared_datadir): concatenated_dataset = concatenate_datasets([dset1, dset2]) assert len(concatenated_dataset) == len(dset1) + len(dset2) assert ( - concatenated_dataset[0]["list_of_structs_of_audios"][0]["audio"]["array"].shape - == dset1[0]["list_of_structs_of_audios"][0]["audio"]["array"].shape + concatenated_dataset[0]["list_of_structs_of_audios"][0]["audio"].get_all_samples().data.shape + == dset1[0]["list_of_structs_of_audios"][0]["audio"].get_all_samples().data.shape ) assert ( - concatenated_dataset[1]["list_of_structs_of_audios"][0]["audio"]["array"].shape - == dset2[0]["list_of_structs_of_audios"][0]["audio"]["array"].shape + concatenated_dataset[1]["list_of_structs_of_audios"][0]["audio"].get_all_samples().data.shape + == dset2[0]["list_of_structs_of_audios"][0]["audio"].get_all_samples().data.shape ) @require_sndfile +@require_torchcodec def 
test_dataset_with_audio_feature_map_is_not_decoded(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.wav") data = {"audio": [audio_path], "text": ["Hello"]} @@ -512,8 +609,8 @@ def process_text(example): assert item == {"audio": expected_audio, "text": "Hello World!"} -@require_librosa @require_sndfile +@require_torchcodec def test_dataset_with_audio_feature_map_is_decoded(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.wav") data = {"audio": [audio_path], "text": ["Hello"]} @@ -521,7 +618,8 @@ def test_dataset_with_audio_feature_map_is_decoded(shared_datadir): dset = Dataset.from_dict(data, features=features) def process_audio_sampling_rate_by_example(example): - example["double_sampling_rate"] = 2 * example["audio"]["sampling_rate"] + sample_rate = example["audio"].get_all_samples().sample_rate + example["double_sampling_rate"] = 2 * sample_rate return example decoded_dset = dset.map(process_audio_sampling_rate_by_example) @@ -532,7 +630,7 @@ def process_audio_sampling_rate_by_example(example): def process_audio_sampling_rate_by_batch(batch): double_sampling_rates = [] for audio in batch["audio"]: - double_sampling_rates.append(2 * audio["sampling_rate"]) + double_sampling_rates.append(2 * audio.get_all_samples().sample_rate) batch["double_sampling_rate"] = double_sampling_rates return batch @@ -542,9 +640,11 @@ def process_audio_sampling_rate_by_batch(batch): assert item["double_sampling_rate"] == 88200 -@require_librosa +@require_torchcodec @require_sndfile def test_formatted_dataset_with_audio_feature(shared_datadir): + from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.wav") data = {"audio": [audio_path, audio_path]} features = Features({"audio": Audio()}) @@ -552,45 +652,45 @@ def test_formatted_dataset_with_audio_feature(shared_datadir): with dset.formatted_as("numpy"): item = dset[0] assert item.keys() == {"audio"} - assert item["audio"].keys() == {"path", "array", 
"sampling_rate"} - assert item["audio"]["path"] == audio_path - assert item["audio"]["array"].shape == (202311,) - assert item["audio"]["sampling_rate"] == 44100 + assert isinstance(item["audio"], AudioDecoder) + samples = item["audio"].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (2, 202311) batch = dset[:1] assert batch.keys() == {"audio"} assert len(batch["audio"]) == 1 - assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"} - assert batch["audio"][0]["path"] == audio_path - assert batch["audio"][0]["array"].shape == (202311,) - assert batch["audio"][0]["sampling_rate"] == 44100 + assert isinstance(batch["audio"][0], AudioDecoder) + samples = batch["audio"][0].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (2, 202311) column = dset["audio"] assert len(column) == 2 - assert column[0].keys() == {"path", "array", "sampling_rate"} - assert column[0]["path"] == audio_path - assert column[0]["array"].shape == (202311,) - assert column[0]["sampling_rate"] == 44100 + assert isinstance(column[0], AudioDecoder) + samples = column[0].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (2, 202311) with dset.formatted_as("pandas"): item = dset[0] assert item.shape == (1, 1) assert item.columns == ["audio"] - assert item["audio"][0].keys() == {"path", "array", "sampling_rate"} - assert item["audio"][0]["path"] == audio_path - assert item["audio"][0]["array"].shape == (202311,) - assert item["audio"][0]["sampling_rate"] == 44100 + assert isinstance(item["audio"][0], AudioDecoder) + samples = item["audio"][0].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (2, 202311) batch = dset[:1] assert batch.shape == (1, 1) assert batch.columns == ["audio"] - assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"} - assert batch["audio"][0]["path"] == audio_path - assert batch["audio"][0]["array"].shape == 
(202311,) - assert batch["audio"][0]["sampling_rate"] == 44100 + assert isinstance(batch["audio"][0], AudioDecoder) + samples = batch["audio"][0].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (2, 202311) column = dset["audio"] assert len(column) == 2 - assert column[0].keys() == {"path", "array", "sampling_rate"} - assert column[0]["path"] == audio_path - assert column[0]["array"].shape == (202311,) - assert column[0]["sampling_rate"] == 44100 + assert isinstance(column[0], AudioDecoder) + samples = column[0].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (2, 202311) @pytest.fixture @@ -606,20 +706,23 @@ def jsonl_audio_dataset_path(shared_datadir, tmp_path_factory): return path -@require_librosa +@require_torchcodec @require_sndfile @pytest.mark.parametrize("streaming", [False, True]) def test_load_dataset_with_audio_feature(streaming, jsonl_audio_dataset_path, shared_datadir): + from torchcodec.decoders import AudioDecoder + audio_path = str(shared_datadir / "test_audio_44100.wav") data_files = jsonl_audio_dataset_path features = Features({"audio": Audio(), "text": Value("string")}) dset = load_dataset("json", split="train", data_files=data_files, features=features, streaming=streaming) item = dset[0] if not streaming else next(iter(dset)) assert item.keys() == {"audio", "text"} - assert item["audio"].keys() == {"path", "array", "sampling_rate"} - assert item["audio"]["path"] == audio_path - assert item["audio"]["array"].shape == (202311,) - assert item["audio"]["sampling_rate"] == 44100 + assert isinstance(item["audio"], AudioDecoder) + samples = item["audio"].get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (2, 202311) + assert item["audio"].metadata.path == audio_path @require_sndfile diff --git a/tests/features/test_video.py b/tests/features/test_video.py index b8185f955df..64c1441227c 100644 --- a/tests/features/test_video.py +++ 
b/tests/features/test_video.py @@ -1,11 +1,11 @@ import pytest -from datasets import Column, Dataset, Features, Video +from datasets import Column, Dataset, Features, Value, Video, load_dataset -from ..utils import require_torchvision +from ..utils import require_torchcodec -@require_torchvision +@require_torchcodec @pytest.mark.parametrize( "build_example", [ @@ -19,7 +19,7 @@ ], ) def test_video_feature_encode_example(shared_datadir, build_example): - from torchvision.io import VideoReader + from torchcodec.decoders import VideoDecoder video_path = str(shared_datadir / "test_video_66x50.mov") video = Video() @@ -28,13 +28,13 @@ def test_video_feature_encode_example(shared_datadir, build_example): assert encoded_example.keys() == {"bytes", "path"} assert encoded_example["bytes"] is not None or encoded_example["path"] is not None decoded_example = video.decode_example(encoded_example) - assert isinstance(decoded_example, VideoReader) + assert isinstance(decoded_example, VideoDecoder) -@require_torchvision +@require_torchcodec def test_dataset_with_video_feature(shared_datadir): import torch - from torchvision.io import VideoReader + from torchcodec.decoders import VideoDecoder video_path = str(shared_datadir / "test_video_66x50.mov") data = {"video": [video_path]} @@ -42,20 +42,21 @@ def test_dataset_with_video_feature(shared_datadir): dset = Dataset.from_dict(data, features=features) item = dset[0] assert item.keys() == {"video"} - assert isinstance(item["video"], VideoReader) - assert next(item["video"])["data"].shape == (3, 50, 66) - assert isinstance(next(item["video"])["data"], torch.Tensor) + assert isinstance(item["video"], VideoDecoder) + assert item["video"].get_frame_at(0).data.shape == (3, 50, 66) + assert isinstance(item["video"].get_frame_at(0).data, torch.Tensor) batch = dset[:1] assert len(batch) == 1 assert batch.keys() == {"video"} - assert isinstance(batch["video"], list) and all(isinstance(item, VideoReader) for item in batch["video"]) - assert 
next(batch["video"][0])["data"].shape == (3, 50, 66) - assert isinstance(next(batch["video"][0])["data"], torch.Tensor) + assert isinstance(batch["video"], list) and all(isinstance(item, VideoDecoder) for item in batch["video"]) + assert batch["video"][0].get_frame_at(0).data.shape == (3, 50, 66) + assert isinstance(batch["video"][0].get_frame_at(0).data, torch.Tensor) column = dset["video"] assert len(column) == 1 - assert isinstance(column, Column) and all(isinstance(item, VideoReader) for item in column) - assert next(column[0])["data"].shape == (3, 50, 66) - assert isinstance(next(column[0])["data"], torch.Tensor) + + assert isinstance(column, Column) and all(isinstance(item, VideoDecoder) for item in column) + assert next(iter(column)).get_frame_at(0).data.shape == (3, 50, 66) + assert isinstance(next(iter(column)).get_frame_at(0).data, torch.Tensor) # from bytes with open(video_path, "rb") as f: @@ -63,14 +64,14 @@ def test_dataset_with_video_feature(shared_datadir): dset = Dataset.from_dict(data, features=features) item = dset[0] assert item.keys() == {"video"} - assert isinstance(item["video"], VideoReader) - assert next(item["video"])["data"].shape == (3, 50, 66) - assert isinstance(next(item["video"])["data"], torch.Tensor) + assert isinstance(item["video"], VideoDecoder) + assert item["video"].get_frame_at(0).data.shape == (3, 50, 66) + assert isinstance(item["video"].get_frame_at(0).data, torch.Tensor) -@require_torchvision +@require_torchcodec def test_dataset_with_video_map_and_formatted(shared_datadir): - from torchvision.io import VideoReader + from torchcodec.decoders import VideoDecoder video_path = str(shared_datadir / "test_video_66x50.mov") data = {"video": [video_path]} @@ -78,7 +79,7 @@ def test_dataset_with_video_map_and_formatted(shared_datadir): dset = Dataset.from_dict(data, features=features) dset = dset.map(lambda x: x).with_format("numpy") example = dset[0] - assert isinstance(example["video"], VideoReader) + assert 
isinstance(example["video"], VideoDecoder) # assert isinstance(example["video"][0], np.ndarray) # from bytes @@ -87,5 +88,65 @@ def test_dataset_with_video_map_and_formatted(shared_datadir): dset = Dataset.from_dict(data, features=features) dset = dset.map(lambda x: x).with_format("numpy") example = dset[0] - assert isinstance(example["video"], VideoReader) + assert isinstance(example["video"], VideoDecoder) # assert isinstance(example["video"][0], np.ndarray) + + +# Dataset casting and mapping +@require_torchcodec +def test_dataset_with_video_feature_map_is_decoded(shared_datadir): + video_path = str(shared_datadir / "test_video_66x50.mov") + data = {"video": [video_path], "text": ["Hello"]} + features = Features({"video": Video(), "text": Value("string")}) + dset = Dataset.from_dict(data, features=features) + + def process_audio_sampling_rate_by_example(example): + begin_stream_seconds = example["video"].metadata.begin_stream_seconds + example["double_begin_stream_seconds"] = 2 * begin_stream_seconds + return example + + decoded_dset = dset.map(process_audio_sampling_rate_by_example) + for item in decoded_dset.cast_column("video", Video(decode=False)): + assert item.keys() == {"video", "text", "double_begin_stream_seconds"} + assert item["double_begin_stream_seconds"] == 0.0 + + def process_audio_sampling_rate_by_batch(batch): + double_fps = [] + for video in batch["video"]: + double_fps.append(2 * video.metadata.begin_stream_seconds) + batch["double_begin_stream_seconds"] = double_fps + return batch + + decoded_dset = dset.map(process_audio_sampling_rate_by_batch, batched=True) + for item in decoded_dset.cast_column("video", Video(decode=False)): + assert item.keys() == {"video", "text", "double_begin_stream_seconds"} + assert item["double_begin_stream_seconds"] == 0.0 + + +@pytest.fixture +def jsonl_video_dataset_path(shared_datadir, tmp_path_factory): + import json + + video_path = str(shared_datadir / "test_video_66x50.mov") + data = [{"video": video_path, 
"text": "Hello world!"}] + path = str(tmp_path_factory.mktemp("data") / "video_dataset.jsonl") + with open(path, "w") as f: + for item in data: + f.write(json.dumps(item) + "\n") + return path + + +@require_torchcodec +@pytest.mark.parametrize("streaming", [False, True]) +def test_load_dataset_with_video_feature(streaming, jsonl_video_dataset_path, shared_datadir): + from torchcodec.decoders import VideoDecoder + + video_path = str(shared_datadir / "test_video_66x50.mov") + data_files = jsonl_video_dataset_path + features = Features({"video": Video(), "text": Value("string")}) + dset = load_dataset("json", split="train", data_files=data_files, features=features, streaming=streaming) + item = dset[0] if not streaming else next(iter(dset)) + assert item.keys() == {"video", "text"} + assert isinstance(item["video"], VideoDecoder) + assert item["video"].get_frame_at(0).data.shape == (3, 50, 66) + assert item["video"].metadata.path == video_path diff --git a/tests/fixtures/files.py b/tests/fixtures/files.py index fe9549c30b4..25b1448ae46 100644 --- a/tests/fixtures/files.py +++ b/tests/fixtures/files.py @@ -551,6 +551,16 @@ def audio_file(): return os.path.join("tests", "features", "data", "test_audio_44100.wav") +@pytest.fixture(scope="session") +def audio_file_44100(): + return os.path.join("tests", "features", "data", "test_audio_44100.mp3") + + +@pytest.fixture(scope="session") +def audio_file_16000(): + return os.path.join("tests", "features", "data", "test_audio_16000.mp3") + + @pytest.fixture(scope="session") def tensor_file(tmp_path_factory): import torch diff --git a/tests/packaged_modules/test_audiofolder.py b/tests/packaged_modules/test_audiofolder.py index a7aabb8fdf3..1dc87c1f6f0 100644 --- a/tests/packaged_modules/test_audiofolder.py +++ b/tests/packaged_modules/test_audiofolder.py @@ -10,7 +10,7 @@ from datasets.download.streaming_download_manager import StreamingDownloadManager from datasets.packaged_modules.audiofolder.audiofolder import AudioFolder, 
AudioFolderConfig -from ..utils import require_librosa, require_sndfile +from ..utils import require_sndfile, require_torchcodec @pytest.fixture @@ -148,10 +148,7 @@ def data_files_with_two_splits_and_metadata(request, tmp_path, audio_file): @pytest.fixture -def data_files_with_zip_archives(tmp_path, audio_file): - import librosa - import soundfile as sf - +def data_files_with_zip_archives(tmp_path, audio_file_44100, audio_file_16000): data_dir = tmp_path / "audiofolder_data_dir_with_zip_archives" data_dir.mkdir(parents=True, exist_ok=True) archive_dir = data_dir / "archive" @@ -159,19 +156,16 @@ def data_files_with_zip_archives(tmp_path, audio_file): subdir = archive_dir / "subdir" subdir.mkdir(parents=True, exist_ok=True) - audio_filename = archive_dir / "audio_file.wav" - shutil.copyfile(audio_file, audio_filename) - audio_filename2 = subdir / "audio_file2.wav" # in subdir - # make sure they're two different audios - # Indeed we won't be able to compare the audio filenames, since the archive is not extracted in streaming mode - array, sampling_rate = librosa.load(str(audio_filename), sr=16000) # original sampling rate is 44100 - sf.write(str(audio_filename2), array, samplerate=16000) + audio_filename = archive_dir / "audio_file.mp3" + shutil.copyfile(audio_file_44100, audio_filename) + audio_filename2 = subdir / "audio_file2.mp3" # in subdir + shutil.copyfile(audio_file_16000, audio_filename2) audio_metadata_filename = archive_dir / "metadata.jsonl" audio_metadata = textwrap.dedent( """\ - {"file_name": "audio_file.wav", "text": "First audio transcription"} - {"file_name": "subdir/audio_file2.wav", "text": "Second audio transcription (in subdir)"} + {"file_name": "audio_file.mp3", "text": "First audio transcription"} + {"file_name": "subdir/audio_file2.mp3", "text": "Second audio transcription (in subdir)"} """ ) @@ -199,7 +193,7 @@ def test_config_raises_when_invalid_data_files(data_files) -> None: _ = AudioFolderConfig(name="name", data_files=data_files) 
-@require_librosa +@require_torchcodec @require_sndfile # check that labels are inferred correctly from dir names def test_generate_examples_with_labels(data_files_with_labels_no_metadata, cache_dir): @@ -265,7 +259,7 @@ def test_generate_examples_drop_metadata(audio_file_with_metadata, drop_metadata assert example[column] is not None -@require_librosa +@require_torchcodec @require_sndfile @pytest.mark.parametrize("streaming", [False, True]) def test_data_files_with_metadata_and_single_split(streaming, cache_dir, data_files_with_one_split_and_metadata): @@ -279,12 +273,12 @@ def test_data_files_with_metadata_and_single_split(streaming, cache_dir, data_fi dataset = list(datasets[split]) assert len(dataset) == expected_num_of_audios # make sure each sample has its own audio and metadata - assert len({example["audio"]["path"] for example in dataset}) == expected_num_of_audios + assert len({example["audio"].metadata.path for example in dataset}) == expected_num_of_audios assert len({example["text"] for example in dataset}) == expected_num_of_audios assert all(example["text"] is not None for example in dataset) -@require_librosa +@require_torchcodec @require_sndfile @pytest.mark.parametrize("streaming", [False, True]) def test_data_files_with_metadata_and_multiple_splits(streaming, cache_dir, data_files_with_two_splits_and_metadata): @@ -298,12 +292,12 @@ def test_data_files_with_metadata_and_multiple_splits(streaming, cache_dir, data dataset = list(datasets[split]) assert len(dataset) == expected_num_of_audios # make sure each sample has its own audio and metadata - assert len({example["audio"]["path"] for example in dataset}) == expected_num_of_audios + assert len({example["audio"].metadata.path for example in dataset}) == expected_num_of_audios assert len({example["text"] for example in dataset}) == expected_num_of_audios assert all(example["text"] is not None for example in dataset) -@require_librosa +@require_torchcodec @require_sndfile 
@pytest.mark.parametrize("streaming", [False, True]) def test_data_files_with_metadata_and_archives(streaming, cache_dir, data_files_with_zip_archives): @@ -318,7 +312,12 @@ def test_data_files_with_metadata_and_archives(streaming, cache_dir, data_files_ assert len(dataset) == expected_num_of_audios # make sure each sample has its own audio (all arrays are different) and metadata assert ( - sum(np.array_equal(dataset[0]["audio"]["array"], example["audio"]["array"]) for example in dataset[1:]) + sum( + np.array_equal( + dataset[0]["audio"].get_all_samples().data.numpy(), example["audio"].get_all_samples().data.numpy() + ) + for example in dataset[1:] + ) == 0 ) assert len({example["text"] for example in dataset}) == expected_num_of_audios diff --git a/tests/packaged_modules/test_webdataset.py b/tests/packaged_modules/test_webdataset.py index 128f13022fc..12aa6275382 100644 --- a/tests/packaged_modules/test_webdataset.py +++ b/tests/packaged_modules/test_webdataset.py @@ -1,13 +1,18 @@ import json import tarfile -import numpy as np import pytest from datasets import Audio, DownloadManager, Features, Image, Sequence, Value from datasets.packaged_modules.webdataset.webdataset import WebDataset -from ..utils import require_librosa, require_numpy1_on_windows, require_pil, require_sndfile, require_torch +from ..utils import ( + require_numpy1_on_windows, + require_pil, + require_sndfile, + require_torch, + require_torchcodec, +) @pytest.fixture @@ -159,9 +164,11 @@ def test_image_webdataset_missing_keys(image_wds_file): assert decoded["txt"] is None -@require_librosa +@require_torchcodec @require_sndfile def test_audio_webdataset(audio_wds_file): + from torchcodec.decoders import AudioDecoder + data_files = {"train": [audio_wds_file]} webdataset = WebDataset(data_files=data_files) split_generators = webdataset._split_generators(DownloadManager()) @@ -187,8 +194,7 @@ def test_audio_webdataset(audio_wds_file): decoded = webdataset.info.features.decode_example(encoded) 
assert isinstance(decoded["json"], dict) assert isinstance(decoded["json"]["transcript"], str) - assert isinstance(decoded["wav"], dict) - assert isinstance(decoded["wav"]["array"], np.ndarray) + assert isinstance(decoded["wav"], AudioDecoder) def test_webdataset_errors_on_bad_file(bad_wds_file): diff --git a/tests/test_formatting.py b/tests/test_formatting.py index 147822fa8a1..9b4d9f235d4 100644 --- a/tests/test_formatting.py +++ b/tests/test_formatting.py @@ -20,13 +20,13 @@ from .utils import ( require_jax, - require_librosa, require_numpy1_on_windows, require_pil, require_polars, require_sndfile, require_tf, require_torch, + require_torchcodec, ) @@ -309,17 +309,17 @@ def test_numpy_formatter_image(self): self.assertEqual(batch["image"][0].dtype, np.uint8) self.assertEqual(batch["image"][0].shape, (480, 640, 3)) - @require_librosa + @require_torchcodec @require_sndfile def test_numpy_formatter_audio(self): pa_table = pa.table({"audio": [{"bytes": None, "path": str(AUDIO_PATH_1)}]}) formatter = NumpyFormatter(features=Features({"audio": Audio()})) row = formatter.format_row(pa_table) - self.assertEqual(row["audio"]["array"].dtype, np.dtype(np.float32)) + self.assertEqual(row["audio"].get_all_samples().data.cpu().numpy().dtype, np.dtype(np.float32)) col = formatter.format_column(pa_table) - self.assertEqual(col[0]["array"].dtype, np.float32) + self.assertEqual(col[0].get_all_samples().data.cpu().numpy().dtype, np.float32) batch = formatter.format_batch(pa_table) - self.assertEqual(batch["audio"][0]["array"].dtype, np.dtype(np.float32)) + self.assertEqual(batch["audio"][0].get_all_samples().data.cpu().numpy().dtype, np.dtype(np.float32)) def test_pandas_formatter(self): pa_table = self._create_dummy_table() @@ -432,7 +432,7 @@ def test_torch_formatter_image(self): self.assertEqual(batch["image"][0].shape, (3, 480, 640)) @require_torch - @require_librosa + @require_torchcodec @require_sndfile def test_torch_formatter_audio(self): import torch @@ -442,11 +442,11 @@ 
def test_torch_formatter_audio(self): pa_table = pa.table({"audio": [{"bytes": None, "path": str(AUDIO_PATH_1)}]}) formatter = TorchFormatter(features=Features({"audio": Audio()})) row = formatter.format_row(pa_table) - self.assertEqual(row["audio"]["array"].dtype, torch.float32) + self.assertEqual(row["audio"].get_all_samples().data.dtype, torch.float32) col = formatter.format_column(pa_table) - self.assertEqual(col[0]["array"].dtype, torch.float32) + self.assertEqual(col[0].get_all_samples().data.dtype, torch.float32) batch = formatter.format_batch(pa_table) - self.assertEqual(batch["audio"][0]["array"].dtype, torch.float32) + self.assertEqual(batch["audio"][0].get_all_samples().data.dtype, torch.float32) @require_tf def test_tf_formatter(self): @@ -535,11 +535,14 @@ def test_tf_formatter_audio(self): pa_table = pa.table({"audio": [{"bytes": None, "path": str(AUDIO_PATH_1)}]}) formatter = TFFormatter(features=Features({"audio": Audio()})) row = formatter.format_row(pa_table) - self.assertEqual(row["audio"]["array"].dtype, tf.float32) + tf_row = tf.convert_to_tensor(row["audio"].get_all_samples().data.cpu().numpy()) + self.assertEqual(tf_row.dtype, tf.float32) col = formatter.format_column(pa_table) - self.assertEqual(col[0]["array"].dtype, tf.float32) + tf_col_0 = tf.convert_to_tensor(col[0].get_all_samples().data.cpu().numpy()) + self.assertEqual(tf_col_0.dtype, tf.float32) batch = formatter.format_batch(pa_table) - self.assertEqual(batch["audio"][0]["array"].dtype, tf.float32) + tf_batch_0 = tf.convert_to_tensor(batch["audio"][0].get_all_samples().data.cpu().numpy()) + self.assertEqual(tf_batch_0.dtype, tf.float32) @require_jax def test_jax_formatter(self): @@ -616,7 +619,7 @@ def test_jax_formatter_image(self): self.assertEqual(batch["image"][0].shape, (480, 640, 3)) @require_jax - @require_librosa + @require_torchcodec @require_sndfile def test_jax_formatter_audio(self): import jax.numpy as jnp diff --git a/tests/test_hub.py b/tests/test_hub.py index 
e773745e579..d0394c34441 100644 --- a/tests/test_hub.py +++ b/tests/test_hub.py @@ -39,7 +39,8 @@ def test_delete_from_hub(temporary_repo, hf_api, hf_token, csv_path, ci_hub_conf ) hf_api.upload_file( token=hf_token, - path_or_fileobj=dedent(f"""\ + path_or_fileobj=dedent( + f"""\ --- {METADATA_CONFIGS_FIELD}: - config_name: cats @@ -51,7 +52,8 @@ def test_delete_from_hub(temporary_repo, hf_api, hf_token, csv_path, ci_hub_conf - split: train path: dogs/train/* --- - """).encode(), + """ + ).encode(), path_in_repo="README.md", repo_id=repo_id, repo_type="dataset", @@ -68,7 +70,8 @@ def test_delete_from_hub(temporary_repo, hf_api, hf_token, csv_path, ci_hub_conf CommitOperationDelete(path_in_repo="dogs/train/0000.csv", is_folder=False), CommitOperationAdd( path_in_repo="README.md", - path_or_fileobj=dedent(f"""\ + path_or_fileobj=dedent( + f"""\ --- {METADATA_CONFIGS_FIELD}: - config_name: cats @@ -76,7 +79,8 @@ def test_delete_from_hub(temporary_repo, hf_api, hf_token, csv_path, ci_hub_conf - split: train path: cats/train/* --- - """).encode(), + """ + ).encode(), ), ] assert mock_method.call_args.kwargs.get("operations") == expected_operations diff --git a/tests/test_load.py b/tests/test_load.py index c2f17b7b1b7..a532452eb4c 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -43,7 +43,6 @@ assert_arrow_memory_doesnt_increase, assert_arrow_memory_increases, offline, - require_moto, require_pil, require_sndfile, set_current_working_directory_to_temp_dir, @@ -1213,51 +1212,6 @@ def test_load_dataset_local_with_default_in_memory(max_in_memory_dataset_size, d assert (dataset["train"].dataset_size < max_in_memory_dataset_size) is expected_in_memory -@pytest.fixture -def moto_server(monkeypatch): - from moto.server import ThreadedMotoServer - - monkeypatch.setattr( - "os.environ", - { - "AWS_ENDPOINT_URL": "http://localhost:5000", - "AWS_DEFAULT_REGION": "us-east-1", - "AWS_ACCESS_KEY_ID": "FOO", - "AWS_SECRET_ACCESS_KEY": "BAR", - }, - ) - server = 
ThreadedMotoServer() - server.start() - try: - yield - finally: - server.stop() - - -@require_moto -def test_load_file_from_s3(moto_server): - # we need server mode here because of an aiobotocore incompatibility with moto.mock_aws - # (https://github.com/getmoto/moto/issues/6836) - import boto3 - - # Create a mock S3 bucket - bucket_name = "test-bucket" - s3 = boto3.client("s3", region_name="us-east-1") - s3.create_bucket(Bucket=bucket_name) - - # Upload a file to the mock bucket - key = "test-file.csv" - csv_data = "Island\nIsabela\nBaltra" - - s3.put_object(Bucket=bucket_name, Key=key, Body=csv_data) - - # Load the file from the mock bucket - ds = datasets.load_dataset("csv", data_files={"train": "s3://test-bucket/test-file.csv"}) - - # Check if the loaded content matches the original content - assert list(ds["train"]) == [{"Island": "Isabela"}, {"Island": "Baltra"}] - - @pytest.mark.integration def test_remote_data_files(): repo_id = "hf-internal-testing/raw_jsonl" diff --git a/tests/test_upstream_hub.py b/tests/test_upstream_hub.py index 2379f0b1679..b118f174264 100644 --- a/tests/test_upstream_hub.py +++ b/tests/test_upstream_hub.py @@ -38,7 +38,7 @@ from datasets.utils.hub import hf_dataset_url from .fixtures.hub import CI_HUB_ENDPOINT, CI_HUB_USER, CI_HUB_USER_TOKEN -from .utils import for_all_test_methods, require_librosa, require_pil, require_sndfile, xfail_if_500_502_http_error +from .utils import for_all_test_methods, require_pil, require_sndfile, require_torchcodec, xfail_if_500_502_http_error pytestmark = pytest.mark.integration @@ -387,7 +387,7 @@ def test_push_dataset_to_hub_custom_features(self, temporary_repo): assert ds.features == hub_ds.features assert ds[:] == hub_ds[:] - @require_librosa + @require_torchcodec @require_sndfile def test_push_dataset_to_hub_custom_features_audio(self, temporary_repo): audio_path = os.path.join(os.path.dirname(__file__), "features", "data", "test_audio_44100.wav") @@ -403,7 +403,10 @@ def 
test_push_dataset_to_hub_custom_features_audio(self, temporary_repo): assert ds.column_names == hub_ds.column_names assert list(ds.features.keys()) == list(hub_ds.features.keys()) assert ds.features == hub_ds.features - np.testing.assert_equal(ds[0]["x"]["array"], hub_ds[0]["x"]["array"]) + np.testing.assert_equal( + ds[0]["x"].get_all_samples().data.cpu().numpy(), + hub_ds[0]["x"].get_all_samples().data.cpu().numpy(), + ) assert ds[1] == hub_ds[1] # don't test hub_ds[0] since audio decoding might be slightly different hub_ds = hub_ds.cast_column("x", Audio(decode=False)) elem = hub_ds[0]["x"] diff --git a/tests/utils.py b/tests/utils.py index 827404fd13d..66341e70220 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -48,7 +48,6 @@ def parse_flag_from_env(key, default=False): require_zstandard = pytest.mark.skipif(not config.ZSTANDARD_AVAILABLE, reason="test requires zstandard") # Audio -require_librosa = pytest.mark.skipif(find_spec("librosa") is None, reason="test requires librosa") require_sndfile = pytest.mark.skipif( # On Windows and OS X, soundfile installs sndfile find_spec("soundfile") is None or version.parse(importlib.metadata.version("soundfile")) < version.parse("0.12.0"), @@ -190,6 +189,18 @@ def require_torchvision(test_case): return test_case +def require_torchcodec(test_case): + """ + Decorator marking a test that requires torchcodec. + + These tests are skipped when torchcodec isn't installed. + + """ + if not config.TORCHCODEC_AVAILABLE: + test_case = unittest.skip("test requires torchcodec")(test_case) + return test_case + + def require_pdfplumber(test_case): """ Decorator marking a test that requires pdfplumber.