From e0d9bbb404372b327a363aedc1f05f1597e76246 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Thu, 25 Sep 2025 15:35:44 +0200 Subject: [PATCH] update tips in docs --- docs/source/about_cache.mdx | 14 ++----- docs/source/about_dataset_features.mdx | 28 ++++---------- docs/source/about_dataset_load.mdx | 7 +--- docs/source/audio_dataset.mdx | 21 +++------- docs/source/audio_load.mdx | 7 +--- docs/source/cache.mdx | 7 +--- docs/source/cli.mdx | 13 +++---- docs/source/dataset_card.mdx | 7 +--- docs/source/document_dataset.mdx | 21 +++------- docs/source/document_load.mdx | 28 ++++---------- docs/source/how_to.md | 7 +--- docs/source/image_classification.mdx | 11 ++---- docs/source/image_dataset.mdx | 21 +++------- docs/source/image_load.mdx | 21 +++------- docs/source/image_process.mdx | 7 +--- docs/source/installation.md | 7 +--- docs/source/loading.mdx | 42 ++++++-------------- docs/source/object_detection.mdx | 11 ++---- docs/source/process.mdx | 28 ++++---------- docs/source/quickstart.mdx | 7 +--- docs/source/repository_structure.mdx | 26 +++++-------- docs/source/semantic_segmentation.mdx | 11 ++---- docs/source/share.mdx | 7 +--- docs/source/stream.mdx | 53 ++++++++------------------ docs/source/tutorial.md | 7 +--- docs/source/use_dataset.mdx | 7 +--- docs/source/use_with_jax.mdx | 34 ++++++----------- docs/source/use_with_numpy.mdx | 25 ++++-------- docs/source/use_with_pytorch.mdx | 25 ++++-------- docs/source/use_with_spark.mdx | 7 +--- docs/source/use_with_tensorflow.mdx | 25 ++++-------- docs/source/video_dataset.mdx | 21 +++------- docs/source/video_load.mdx | 28 ++++---------- src/datasets/arrow_dataset.py | 7 +--- src/datasets/utils/logging.py | 17 ++++----- 35 files changed, 189 insertions(+), 426 deletions(-) diff --git a/docs/source/about_cache.mdx b/docs/source/about_cache.mdx index cac9eb7634e..9e8a561b269 100644 --- a/docs/source/about_cache.mdx +++ b/docs/source/about_cache.mdx @@ -6,11 +6,8 @@ The cache is one of the reasons why 🤗 Datasets 
is so efficient. It stores pre How does the cache keeps track of what transforms are applied to a dataset? Well, 🤗 Datasets assigns a fingerprint to the cache file. A fingerprint keeps track of the current state of a dataset. The initial fingerprint is computed using a hash from the Arrow table, or a hash of the Arrow files if the dataset is on disk. Subsequent fingerprints are computed by combining the fingerprint of the previous state, and a hash of the latest transform applied. - - -Transforms are any of the processing methods from the [How-to Process](./process) guides such as [`Dataset.map`] or [`Dataset.shuffle`]. - - +> [!TIP] +> Transforms are any of the processing methods from the [How-to Process](./process) guides such as [`Dataset.map`] or [`Dataset.shuffle`]. Here are what the actual fingerprints look like: @@ -28,11 +25,8 @@ When you use a non-hashable transform, 🤗 Datasets uses a random fingerprint i An example of when 🤗 Datasets recomputes everything is when caching is disabled. When this happens, the cache files are generated every time and they get written to a temporary directory. Once your Python session ends, the cache files in the temporary directory are deleted. A random hash is assigned to these cache files, instead of a fingerprint. - - -When caching is disabled, use [`Dataset.save_to_disk`] to save your transformed dataset or it will be deleted once the session ends. - - +> [!TIP] +> When caching is disabled, use [`Dataset.save_to_disk`] to save your transformed dataset or it will be deleted once the session ends. ## Hashing diff --git a/docs/source/about_dataset_features.mdx b/docs/source/about_dataset_features.mdx index 5cfac6739e3..b87bd6c8f26 100644 --- a/docs/source/about_dataset_features.mdx +++ b/docs/source/about_dataset_features.mdx @@ -24,11 +24,8 @@ The [`Value`] feature tells 🤗 Datasets: 🤗 Datasets supports many other data types such as `bool`, `float32` and `binary` to name just a few. 
- - -Refer to [`Value`] for a full list of supported data types. - - +> [!TIP] +> Refer to [`Value`] for a full list of supported data types. The [`ClassLabel`] feature informs 🤗 Datasets the `label` column contains two classes. The classes are labeled `not_equivalent` and `equivalent`. Labels are stored as integers in the dataset. When you retrieve the labels, [`ClassLabel.int2str`] and [`ClassLabel.str2int`] carries out the conversion from integer value to label name, and vice versa. @@ -48,11 +45,8 @@ If your data type contains a list of objects, then you want to use the [`List`] The `answers` field is constructed using the dict of features because and contains two subfields, `text` and `answer_start`, which are lists of `string` and `int32`, respectively. - - -See the [flatten](./process#flatten) section to learn how you can extract the nested subfields as their own independent columns. - - +> [!TIP] +> See the [flatten](./process#flatten) section to learn how you can extract the nested subfields as their own independent columns. The array feature type is useful for creating arrays of various sizes. You can create arrays with two dimensions using [`Array2D`], and even arrays with five dimensions using [`Array5D`]. @@ -84,11 +78,8 @@ When you load an audio dataset and call the audio column, the [`Audio`] feature ``` - - -Index into an audio dataset using the row index first and then the `audio` column - `dataset[0]["audio"]` - to avoid decoding and resampling all the audio files in the dataset. Otherwise, this can be a slow and time-consuming process if you have a large dataset. - - +> [!WARNING] +> Index into an audio dataset using the row index first and then the `audio` column - `dataset[0]["audio"]` - to avoid decoding and resampling all the audio files in the dataset. Otherwise, this can be a slow and time-consuming process if you have a large dataset. 
With `decode=False`, the [`Audio`] type simply gives you the path or the bytes of the audio file, without decoding it into an torchcodec `AudioDecoder` object, @@ -118,11 +109,8 @@ When you load an image dataset and call the image column, the [`Image`] feature ``` - - -Index into an image dataset using the row index first and then the `image` column - `dataset[0]["image"]` - to avoid decoding all the image files in the dataset. Otherwise, this can be a slow and time-consuming process if you have a large dataset. - - +> [!WARNING] +> Index into an image dataset using the row index first and then the `image` column - `dataset[0]["image"]` - to avoid decoding all the image files in the dataset. Otherwise, this can be a slow and time-consuming process if you have a large dataset. With `decode=False`, the [`Image`] type simply gives you the path or the bytes of the image file, without decoding it into an `PIL.Image`, diff --git a/docs/source/about_dataset_load.mdx b/docs/source/about_dataset_load.mdx index c49644e1b66..a5ac45077e6 100644 --- a/docs/source/about_dataset_load.mdx +++ b/docs/source/about_dataset_load.mdx @@ -26,11 +26,8 @@ Under the hood, 🤗 Datasets will use an appropriate [`DatasetBuilder`] based o * [`datasets.packaged_modules.imagefolder.ImageFolder`] for image folders * [`datasets.packaged_modules.audiofolder.AudioFolder`] for audio folders - - -Read the [Share](./upload_dataset) section to learn more about how to share a dataset. - - +> [!TIP] +> Read the [Share](./upload_dataset) section to learn more about how to share a dataset. 🤗 Datasets downloads the dataset files from the original URL, generates the dataset and caches it in an Arrow table on your drive. If you've downloaded the dataset before, then 🤗 Datasets will reload it from the cache to save you the trouble of downloading it again. 
diff --git a/docs/source/audio_dataset.mdx b/docs/source/audio_dataset.mdx index 8419a70f698..5a29a27bc47 100644 --- a/docs/source/audio_dataset.mdx +++ b/docs/source/audio_dataset.mdx @@ -14,11 +14,8 @@ There are several methods for creating and sharing an audio dataset: - Create an audio dataset repository with the `AudioFolder` builder. This is a no-code solution for quickly creating an audio dataset with several thousand audio files. - - -You can control access to your dataset by requiring users to share their contact information first. Check out the [Gated datasets](https://huggingface.co/docs/hub/datasets-gated) guide for more information about how to enable this feature on the Hub. - - +> [!TIP] +> You can control access to your dataset by requiring users to share their contact information first. Check out the [Gated datasets](https://huggingface.co/docs/hub/datasets-gated) guide for more information about how to enable this feature on the Hub. ## Local files @@ -49,11 +46,8 @@ my_dataset/ The `AudioFolder` is a dataset builder designed to quickly load an audio dataset with several thousand audio files without requiring you to write any code. - - -💡 Take a look at the [Split pattern hierarchy](repository_structure#split-pattern-hierarchy) to learn more about how `AudioFolder` creates dataset splits based on your dataset repository structure. - - +> [!TIP] +> 💡 Take a look at the [Split pattern hierarchy](repository_structure#split-pattern-hierarchy) to learn more about how `AudioFolder` creates dataset splits based on your dataset repository structure. `AudioFolder` automatically infers the class labels of your dataset based on the directory name. Store your dataset in a directory structure like: @@ -90,11 +84,8 @@ folder/test/dog/german_shepherd.mp3 folder/test/cat/bengal.mp3 ``` - - -If all audio files are contained in a single directory or if they are not on the same level of directory structure, `label` column won't be added automatically. 
If you need it, set `drop_labels=False` explicitly. - - +> [!WARNING] +> If all audio files are contained in a single directory or if they are not on the same level of directory structure, the `label` column won't be added automatically. If you need it, set `drop_labels=False` explicitly. If there is additional information you'd like to include about your dataset, like text captions or bounding boxes, add it as a `metadata.csv` file in your folder. This lets you quickly create datasets for different computer vision tasks like text captioning or object detection. You can also use a JSONL file `metadata.jsonl` or a Parquet file `metadata.parquet`. diff --git a/docs/source/audio_load.mdx b/docs/source/audio_load.mdx index 5703fcb0ba7..1d815548473 100644 --- a/docs/source/audio_load.mdx +++ b/docs/source/audio_load.mdx @@ -85,11 +85,8 @@ Finally the `filters` argument lets you load only a subset of the dataset, based >>> dataset = load_dataset("username/dataset_name", streaming=True, filters=filters) ``` - - -For more information about creating your own `AudioFolder` dataset, take a look at the [Create an audio dataset](./audio_dataset) guide. - - +> [!TIP] +> For more information about creating your own `AudioFolder` dataset, take a look at the [Create an audio dataset](./audio_dataset) guide. For a guide on how to load any type of dataset, take a look at the general loading guide. diff --git a/docs/source/cache.mdx b/docs/source/cache.mdx index a18a3d957e9..338b3733c71 100644 --- a/docs/source/cache.mdx +++ b/docs/source/cache.mdx @@ -96,11 +96,8 @@ Disable caching on a global scale with [`disable_caching`]: When you disable caching, 🤗 Datasets will no longer reload cached files when applying transforms to datasets. Any transform you apply on your dataset will be need to be reapplied. - - -If you want to reuse a dataset from scratch, try setting the `download_mode` parameter in [`load_dataset`] instead. 
- - +> [!TIP] +> If you want to reuse a dataset from scratch, try setting the `download_mode` parameter in [`load_dataset`] instead. diff --git a/docs/source/cli.mdx b/docs/source/cli.mdx index b85feda8ff1..e1c2207d529 100644 --- a/docs/source/cli.mdx +++ b/docs/source/cli.mdx @@ -41,11 +41,8 @@ For example: >>> datasets-cli delete_from_hub USERNAME/DATASET_NAME CONFIG_NAME ``` - - -Do not forget that you need to log in first to your Hugging Face account: -```bash ->>> hf auth login -``` - - +> [!TIP] +> Do not forget that you need to log in first to your Hugging Face account: +> ```bash +> >>> hf auth login +> ``` diff --git a/docs/source/dataset_card.mdx b/docs/source/dataset_card.mdx index 629050cf054..f1067697fb2 100644 --- a/docs/source/dataset_card.mdx +++ b/docs/source/dataset_card.mdx @@ -15,11 +15,8 @@ Creating a dataset card is easy and can be done in just a few steps: - - - For a complete, but not required, set of tag options you can also look at the [Dataset Card specifications](https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1). This'll have a few more tag options like `multilinguality` and `language_creators` which are useful but not absolutely necessary. - - + > [!TIP] + > For a complete, but not required, set of tag options you can also look at the [Dataset Card specifications](https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1). This'll have a few more tag options like `multilinguality` and `language_creators` which are useful but not absolutely necessary. 3. Click on the **Import dataset card template** link to automatically create a template with all the relevant fields to complete. Fill out the template sections to the best of your ability. Take a look at the [Dataset Card Creation Guide](https://github.com/huggingface/datasets/blob/main/templates/README_guide.md) for more detailed information about what to include in each section of the card. 
For fields you are unable to complete, you can write **[More Information Needed]**. diff --git a/docs/source/document_dataset.mdx b/docs/source/document_dataset.mdx index a4bbb5bb88e..30cc1bd3121 100644 --- a/docs/source/document_dataset.mdx +++ b/docs/source/document_dataset.mdx @@ -2,21 +2,15 @@ This guide will show you how to create a document dataset with `PdfFolder` and some metadata. This is a no-code solution for quickly creating a document dataset with several thousand pdfs. - - -You can control access to your dataset by requiring users to share their contact information first. Check out the [Gated datasets](https://huggingface.co/docs/hub/datasets-gated) guide for more information about how to enable this feature on the Hub. - - +> [!TIP] +> You can control access to your dataset by requiring users to share their contact information first. Check out the [Gated datasets](https://huggingface.co/docs/hub/datasets-gated) guide for more information about how to enable this feature on the Hub. ## PdfFolder The `PdfFolder` is a dataset builder designed to quickly load a document dataset with several thousand pdfs without requiring you to write any code. - - -💡 Take a look at the [Split pattern hierarchy](repository_structure#split-pattern-hierarchy) to learn more about how `PdfFolder` creates dataset splits based on your dataset repository structure. - - +> [!TIP] +> 💡 Take a look at the [Split pattern hierarchy](repository_structure#split-pattern-hierarchy) to learn more about how `PdfFolder` creates dataset splits based on your dataset repository structure. `PdfFolder` automatically infers the class labels of your dataset based on the directory name. Store your dataset in a directory structure like: @@ -53,11 +47,8 @@ folder/test/invoice/0001.pdf folder/test/invoice/0002.pdf ``` - - -If all PDF files are contained in a single directory or if they are not on the same level of directory structure, `label` column won't be added automatically. 
If you need it, set `drop_labels=False` explicitly. - - +> [!WARNING] +> If all PDF files are contained in a single directory or if they are not on the same level of directory structure, the `label` column won't be added automatically. If you need it, set `drop_labels=False` explicitly. If there is additional information you'd like to include about your dataset, like text captions or bounding boxes, add it as a `metadata.csv` file in your folder. This lets you quickly create datasets for different computer vision tasks like text captioning or object detection. You can also use a JSONL file `metadata.jsonl` or a Parquet file `metadata.parquet`. diff --git a/docs/source/document_load.mdx b/docs/source/document_load.mdx index 97c684a808d..2d1717fb648 100644 --- a/docs/source/document_load.mdx +++ b/docs/source/document_load.mdx @@ -1,18 +1,12 @@ # Load pdf data - - -Pdf support is experimental and is subject to change. - - +> [!WARNING] +> Pdf support is experimental and is subject to change. Pdf datasets have [`Pdf`] type columns, which contain `pdfplumber` objects. - - -To work with pdf datasets, you need to have the `pdfplumber` package installed. Check out the [installation](https://github.com/jsvine/pdfplumber#installation) guide to learn how to install it. - - +> [!TIP] +> To work with pdf datasets, you need to have the `pdfplumber` package installed. Check out the [installation](https://github.com/jsvine/pdfplumber#installation) guide to learn how to install it. When you load a pdf dataset and call the pdf column, the pdfs are decoded as `pdfplumber` Pdfs: @@ -24,11 +18,8 @@ When you load a pdf dataset and call the pdf column, the pdfs are decoded as `pd ``` - - -Index into a pdf dataset using the row index first and then the `pdf` column - `dataset[0]["pdf"]` - to avoid creating all the pdf objects in the dataset. Otherwise, this can be a slow and time-consuming process if you have a large dataset. 
- - +> [!WARNING] +> Index into a pdf dataset using the row index first and then the `pdf` column - `dataset[0]["pdf"]` - to avoid creating all the pdf objects in the dataset. Otherwise, this can be a slow and time-consuming process if you have a large dataset. For a guide on how to load any type of dataset, take a look at the general loading guide. @@ -183,11 +174,8 @@ Finally the `filters` argument lets you load only a subset of the dataset, based >>> dataset = load_dataset("username/dataset_name", streaming=True, filters=filters) ``` - - -For more information about creating your own `PdfFolder` dataset, take a look at the [Create a pdf dataset](./document_dataset) guide. - - +> [!TIP] +> For more information about creating your own `PdfFolder` dataset, take a look at the [Create a pdf dataset](./document_dataset) guide. ## Pdf decoding diff --git a/docs/source/how_to.md b/docs/source/how_to.md index 223a7c2c4c0..79fd76eacd4 100644 --- a/docs/source/how_to.md +++ b/docs/source/how_to.md @@ -4,11 +4,8 @@ The how-to guides offer a more comprehensive overview of all the tools 🤗 Data The guides assume you are familiar and comfortable with the 🤗 Datasets basics. We recommend newer users check out our [tutorials](tutorial) first. - - -Interested in learning more? Take a look at [Chapter 5](https://huggingface.co/course/chapter5/1?fw=pt) of the Hugging Face course! - - +> [!TIP] +> Interested in learning more? Take a look at [Chapter 5](https://huggingface.co/course/chapter5/1?fw=pt) of the Hugging Face course! 
The guides are organized into six sections: diff --git a/docs/source/image_classification.mdx b/docs/source/image_classification.mdx index 6861bfec866..ea52a444fc2 100644 --- a/docs/source/image_classification.mdx +++ b/docs/source/image_classification.mdx @@ -80,10 +80,7 @@ You can verify the transformation worked by indexing into the `pixel_values` of - - -Now that you know how to process a dataset for image classification, learn -[how to train an image classification model](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb) -and use it for inference. - - \ No newline at end of file +> [!TIP] +> Now that you know how to process a dataset for image classification, learn +> [how to train an image classification model](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb) +> and use it for inference. \ No newline at end of file diff --git a/docs/source/image_dataset.mdx b/docs/source/image_dataset.mdx index 464c9316220..02ecadc37ee 100644 --- a/docs/source/image_dataset.mdx +++ b/docs/source/image_dataset.mdx @@ -6,21 +6,15 @@ There are two methods for creating and sharing an image dataset. This guide will * Create an image dataset with `ImageFolder` and some metadata. This is a no-code solution for quickly creating an image dataset with several thousand images. - - -You can control access to your dataset by requiring users to share their contact information first. Check out the [Gated datasets](https://huggingface.co/docs/hub/datasets-gated) guide for more information about how to enable this feature on the Hub. - - +> [!TIP] +> You can control access to your dataset by requiring users to share their contact information first. Check out the [Gated datasets](https://huggingface.co/docs/hub/datasets-gated) guide for more information about how to enable this feature on the Hub. 
## ImageFolder The `ImageFolder` is a dataset builder designed to quickly load an image dataset with several thousand images without requiring you to write any code. - - -💡 Take a look at the [Split pattern hierarchy](repository_structure#split-pattern-hierarchy) to learn more about how `ImageFolder` creates dataset splits based on your dataset repository structure. - - +> [!TIP] +> 💡 Take a look at the [Split pattern hierarchy](repository_structure#split-pattern-hierarchy) to learn more about how `ImageFolder` creates dataset splits based on your dataset repository structure. `ImageFolder` automatically infers the class labels of your dataset based on the directory name. Store your dataset in a directory structure like: @@ -57,11 +51,8 @@ folder/test/dog/german_shepherd.png folder/test/cat/bengal.png ``` - - -If all image files are contained in a single directory or if they are not on the same level of directory structure, `label` column won't be added automatically. If you need it, set `drop_labels=False` explicitly. - - +> [!WARNING] +> If all image files are contained in a single directory or if they are not on the same level of directory structure, the `label` column won't be added automatically. If you need it, set `drop_labels=False` explicitly. If there is additional information you'd like to include about your dataset, like text captions or bounding boxes, add it as a `metadata.csv` file in your folder. This lets you quickly create datasets for different computer vision tasks like text captioning or object detection. You can also use a JSONL file `metadata.jsonl` or a Parquet file `metadata.parquet`. diff --git a/docs/source/image_load.mdx b/docs/source/image_load.mdx index 33afd521772..676b3f51653 100644 --- a/docs/source/image_load.mdx +++ b/docs/source/image_load.mdx @@ -2,11 +2,8 @@ Image datasets have [`Image`] type columns, which contain PIL objects. - - -To work with image datasets, you need to have the `vision` dependency installed. 
Check out the [installation](./installation#vision) guide to learn how to install it. - - +> [!TIP] +> To work with image datasets, you need to have the `vision` dependency installed. Check out the [installation](./installation#vision) guide to learn how to install it. When you load an image dataset and call the image column, the images are decoded as PIL Images: @@ -17,11 +14,8 @@ When you load an image dataset and call the image column, the images are decoded >>> dataset[0]["image"] ``` - - -Index into an image dataset using the row index first and then the `image` column - `dataset[0]["image"]` - to avoid decoding and resampling all the image objects in the dataset. Otherwise, this can be a slow and time-consuming process if you have a large dataset. - - +> [!WARNING] +> Index into an image dataset using the row index first and then the `image` column - `dataset[0]["image"]` - to avoid decoding and resampling all the image objects in the dataset. Otherwise, this can be a slow and time-consuming process if you have a large dataset. For a guide on how to load any type of dataset, take a look at the general loading guide. @@ -119,11 +113,8 @@ Finally the `filters` argument lets you load only a subset of the dataset, based >>> dataset = load_dataset("username/dataset_name", streaming=True, filters=filters) ``` - - -For more information about creating your own `ImageFolder` dataset, take a look at the [Create an image dataset](./image_dataset) guide. - - +> [!TIP] +> For more information about creating your own `ImageFolder` dataset, take a look at the [Create an image dataset](./image_dataset) guide. 
## WebDataset diff --git a/docs/source/image_process.mdx b/docs/source/image_process.mdx index d07ee37fb2f..c1653a1af5d 100644 --- a/docs/source/image_process.mdx +++ b/docs/source/image_process.mdx @@ -41,11 +41,8 @@ Both parameter values default to 1000, which can be expensive if you are storing 🤗 Datasets applies data augmentations from any library or package to your dataset. Transforms can be applied on-the-fly on batches of data with [`~Dataset.set_transform`], which consumes less disk space. - - -The following example uses [torchvision](https://pytorch.org/vision/stable/index.html), but feel free to use other data augmentation libraries like [Albumentations](https://albumentations.ai/docs/), [Kornia](https://kornia.readthedocs.io/en/latest/), and [imgaug](https://imgaug.readthedocs.io/en/latest/). - - +> [!TIP] +> The following example uses [torchvision](https://pytorch.org/vision/stable/index.html), but feel free to use other data augmentation libraries like [Albumentations](https://albumentations.ai/docs/), [Kornia](https://kornia.readthedocs.io/en/latest/), and [imgaug](https://imgaug.readthedocs.io/en/latest/). For example, if you'd like to change the color properties of an image randomly: diff --git a/docs/source/installation.md b/docs/source/installation.md index c52b72cfc12..6decbac20ef 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -2,11 +2,8 @@ Before you start, you'll need to setup your environment and install the appropriate packages. 🤗 Datasets is tested on **Python 3.9+**. - - -If you want to use 🤗 Datasets with TensorFlow or PyTorch, you'll need to install them separately. Refer to the [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2-packages-are-available) or the [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) for the specific install command for your framework. 
- - +> [!TIP] +> If you want to use 🤗 Datasets with TensorFlow or PyTorch, you'll need to install them separately. Refer to the [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2-packages-are-available) or the [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) for the specific install command for your framework. ## Virtual environment diff --git a/docs/source/loading.mdx b/docs/source/loading.mdx index 443148babed..eb73ab84b5a 100644 --- a/docs/source/loading.mdx +++ b/docs/source/loading.mdx @@ -34,11 +34,8 @@ Some datasets may have more than one version based on Git tags, branches, or com ... ) ``` - - -Refer to the [Upload a dataset to the Hub](./upload_dataset) tutorial for more details on how to create a dataset repository on the Hub, and how to upload your data files. - - +> [!TIP] +> Refer to the [Upload a dataset to the Hub](./upload_dataset) tutorial for more details on how to create a dataset repository on the Hub, and how to upload your data files. A dataset loads by default all the data into the `train` split, or checks for mentions or split names in the data files names (e.g. "train", "test" and "validation"). Use the `data_files` parameter to map data files to splits like `train`, `validation` and `test`: @@ -47,11 +44,8 @@ A dataset loads by default all the data into the `train` split, or checks for me >>> dataset = load_dataset("namespace/your_dataset_name", data_files=data_files) ``` - - -If you don't specify which data files to use, [`load_dataset`] will return all the data files. This can take a long time if you load a large dataset like C4, which is approximately 13TB of data. - - +> [!WARNING] +> If you don't specify which data files to use, [`load_dataset`] will return all the data files. This can take a long time if you load a large dataset like C4, which is approximately 13TB of data. 
You can also load a specific subset of the files with the `data_files` or `data_dir` parameter. These parameters can accept a relative path which resolves to the base path corresponding to where the dataset is loaded from. @@ -85,11 +79,8 @@ Datasets can be loaded from local files stored on your computer and from remote >>> dataset = load_dataset("csv", data_files="my_file.csv") ``` - - -For more details, check out the [how to load tabular datasets from CSV files](tabular_load#csv-files) guide. - - +> [!TIP] +> For more details, check out the [how to load tabular datasets from CSV files](tabular_load#csv-files) guide. ### JSON @@ -201,11 +192,8 @@ Read database contents with [`~datasets.Dataset.from_sql`] by specifying the URI >>> dataset = Dataset.from_sql("SELECT text FROM table WHERE length(text) > 100 LIMIT 10", con="sqlite:///sqlite_file.db") ``` - - -For more details, check out the [how to load tabular datasets from SQL databases](tabular_load#databases) guide. - - +> [!TIP] +> For more details, check out the [how to load tabular datasets from SQL databases](tabular_load#databases) guide. ### WebDataset @@ -312,11 +300,8 @@ Load Pandas DataFrames with [`~Dataset.from_pandas`]: >>> dataset = Dataset.from_pandas(df) ``` - - -For more details, check out the [how to load tabular datasets from Pandas DataFrames](tabular_load#pandas-dataframes) guide. - - +> [!TIP] +> For more details, check out the [how to load tabular datasets from Pandas DataFrames](tabular_load#pandas-dataframes) guide. ## Offline @@ -395,11 +380,8 @@ If you want equal sized splits, use `pct1_dropremainder` rounding instead. This >>> train_52_54pct1_ds = datasets.load_dataset("ajibawa-2023/General-Stories-Collection", split="train[52%:54%](pct1_dropremainder)") ``` - - -`pct1_dropremainder` rounding may truncate the last examples in a dataset if the number of examples in your dataset don't divide evenly by 100. 
- - +> [!WARNING] +> `pct1_dropremainder` rounding may truncate the last examples in a dataset if the number of examples in your dataset doesn't divide evenly by 100. diff --git a/docs/source/object_detection.mdx b/docs/source/object_detection.mdx index 75d9dbb61f7..f612de28fdc 100644 --- a/docs/source/object_detection.mdx +++ b/docs/source/object_detection.mdx @@ -155,10 +155,7 @@ You can verify the transform works by visualizing the 10th example: - - -Now that you know how to process a dataset for object detection, learn -[how to train an object detection model](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/YOLOS/Fine_tuning_YOLOS_for_object_detection_on_custom_dataset_(balloon).ipynb) -and use it for inference. - - +> [!TIP] +> Now that you know how to process a dataset for object detection, learn +> [how to train an object detection model](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/YOLOS/Fine_tuning_YOLOS_for_object_detection_on_custom_dataset_(balloon).ipynb) +> and use it for inference. diff --git a/docs/source/process.mdx b/docs/source/process.mdx index 39b9fd94077..652fc7b4a4d 100644 --- a/docs/source/process.mdx +++ b/docs/source/process.mdx @@ -20,11 +20,8 @@ The examples in this guide use the MRPC dataset, but feel free to load any datas >>> dataset = load_dataset("nyu-mll/glue", "mrpc", split="train") ``` - - -All processing methods in this guide return a new [`Dataset`] object. Modification is not done in-place. Be careful about overriding your previous dataset! - - +> [!WARNING] +> All processing methods in this guide return a new [`Dataset`] object. Modification is not done in-place. Be careful about overriding your previous dataset! 
## Sort, shuffle, select, split, and shard @@ -240,11 +237,8 @@ The [`~Dataset.cast`] function transforms the feature type of one or more column 'idx': Value('int64')} ``` - - -Casting only works if the original feature type and new feature type are compatible. For example, you can cast a column with the feature type `Value("int32")` to `Value("bool")` if the original column only contains ones and zeros. - - +> [!TIP] +> Casting only works if the original feature type and new feature type are compatible. For example, you can cast a column with the feature type `Value("int32")` to `Value("bool")` if the original column only contains ones and zeros. Use the [`~Dataset.cast_column`] function to change the feature type of a single column. Pass the column name and its new feature type as arguments: @@ -322,11 +316,8 @@ Specify the column to remove with the `remove_columns` parameter in [`~Dataset.m ['sentence2', 'label', 'idx', 'new_sentence'] ``` - - -🤗 Datasets also has a [`~Dataset.remove_columns`] function which is faster because it doesn't copy the data of the remaining columns. - - +> [!TIP] +> 🤗 Datasets also has a [`~Dataset.remove_columns`] function which is faster because it doesn't copy the data of the remaining columns. You can also use [`~Dataset.map`] with indices if you set `with_indices=True`. The example below adds the index to the beginning of each sentence: @@ -713,11 +704,8 @@ Here is the list of supported tensors or arrays formats: - TensorFlow: format name is "tensorflow", for more information see [Using Datasets with TensorFlow](use_with_tensorflow) - JAX: format name is "jax", for more information see [Using Datasets with JAX](use_with_jax) - - -Check out the [Using Datasets with TensorFlow](use_with_tensorflow#using-totfdataset) guide for more details on how to efficiently create a TensorFlow dataset. 
- - +> [!TIP] +> Check out the [Using Datasets with TensorFlow](use_with_tensorflow#using-totfdataset) guide for more details on how to efficiently create a TensorFlow dataset. When a dataset is formatted in a tensor or array format, all the data are formatted as tensors or arrays (except unsupported types like strings for example for PyTorch): diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx index 092940de95c..6be8bee907c 100644 --- a/docs/source/quickstart.mdx +++ b/docs/source/quickstart.mdx @@ -59,11 +59,8 @@ Each dataset is unique, and depending on the task, some datasets may require add - - -Check out [Chapter 5](https://huggingface.co/course/chapter5/1?fw=pt) of the Hugging Face course to learn more about other important topics such as loading remote or local datasets, tools for cleaning up a dataset, and creating your own dataset. - - +> [!TIP] +> Check out [Chapter 5](https://huggingface.co/course/chapter5/1?fw=pt) of the Hugging Face course to learn more about other important topics such as loading remote or local datasets, tools for cleaning up a dataset, and creating your own dataset. Start by installing 🤗 Datasets: diff --git a/docs/source/repository_structure.mdx b/docs/source/repository_structure.mdx index c527a3ca730..acb2bb1ed3d 100644 --- a/docs/source/repository_structure.mdx +++ b/docs/source/repository_structure.mdx @@ -90,11 +90,8 @@ configs: --- ``` - - -Note that `config_name` field is required even if you have a single configuration. - - +> [!WARNING] +> Note that `config_name` field is required even if you have a single configuration. ## Configurations @@ -144,17 +141,14 @@ configs: Refer to [specific builders' documentation](./package_reference/builder_classes) to see what configuration parameters they have. - - -You can set a default configuration using `default: true`, e.g. 
you can run `main_data = load_dataset("my_dataset_repository")` if you set - -```yaml -- config_name: main_data - data_files: "main_data.csv" - default: true -``` - - +> [!TIP] +> You can set a default configuration using `default: true`, e.g. you can run `main_data = load_dataset("my_dataset_repository")` if you set +> +> ```yaml +> - config_name: main_data +> data_files: "main_data.csv" +> default: true +> ``` ## Automatic splits detection diff --git a/docs/source/semantic_segmentation.mdx b/docs/source/semantic_segmentation.mdx index b7ee935b79a..44ceb2a13f1 100644 --- a/docs/source/semantic_segmentation.mdx +++ b/docs/source/semantic_segmentation.mdx @@ -168,10 +168,7 @@ In this guide, you have used `albumentations` for augmenting the dataset. It's a - - -Now that you know how to process a dataset for semantic segmentation, learn -[how to train a semantic segmentation model](https://huggingface.co/docs/transformers/tasks/semantic_segmentation) -and use it for inference. - - \ No newline at end of file +> [!TIP] +> Now that you know how to process a dataset for semantic segmentation, learn +> [how to train a semantic segmentation model](https://huggingface.co/docs/transformers/tasks/semantic_segmentation) +> and use it for inference. \ No newline at end of file diff --git a/docs/source/share.mdx b/docs/source/share.mdx index 9aa6af4e4b8..bbaaa56c144 100644 --- a/docs/source/share.mdx +++ b/docs/source/share.mdx @@ -86,11 +86,8 @@ To upload the current directory at the root of the repo, use: https://huggingface.co/datasets/Wauplin/my-cool-dataset/tree/main/ ``` - - -If the repo doesn't exist yet, it will be created automatically. - - +> [!TIP] +> If the repo doesn't exist yet, it will be created automatically. 
You can also upload a specific folder: diff --git a/docs/source/stream.mdx b/docs/source/stream.mdx index 914dffd69f6..8375e0057a8 100644 --- a/docs/source/stream.mdx +++ b/docs/source/stream.mdx @@ -43,13 +43,10 @@ For example, you can stream a local dataset of hundreds of compressed JSONL file Loading a dataset in streaming mode creates a new dataset type instance (instead of the classic [`Dataset`] object), known as an [`IterableDataset`]. This special type of dataset has its own set of processing methods shown below. - - -An [`IterableDataset`] is useful for iterative jobs like training a model. -You shouldn't use a [`IterableDataset`] for jobs that require random access to examples because you have to iterate all over it using a for loop. Getting the last example in an iterable dataset would require you to iterate over all the previous examples. -You can find more details in the [Dataset vs. IterableDataset guide](./about_mapstyle_vs_iterable). - - +> [!TIP] +> An [`IterableDataset`] is useful for iterative jobs like training a model. +> You shouldn't use an [`IterableDataset`] for jobs that require random access to examples because you have to iterate all over it using a for loop. Getting the last example in an iterable dataset would require you to iterate over all the previous examples. +> You can find more details in the [Dataset vs. IterableDataset guide](./about_mapstyle_vs_iterable). ## Column indexing @@ -101,11 +98,8 @@ The `buffer_size` argument controls the size of the buffer to randomly sample ex >>> shuffled_dataset = dataset.shuffle(seed=42, buffer_size=10_000) ``` - - -[`IterableDataset.shuffle`] will also shuffle the order of the shards if the dataset is sharded into multiple files. - - +> [!TIP] +> [`IterableDataset.shuffle`] will also shuffle the order of the shards if the dataset is sharded into multiple files.
## Reshuffle @@ -140,11 +134,8 @@ You can split your dataset one of two ways: >>> train_dataset = shuffled_dataset.skip(1000) ``` - - -`take` and `skip` prevent future calls to `shuffle` because they lock in the order of the shards. You should `shuffle` your dataset before splitting it. - - +> [!WARNING] +> `take` and `skip` prevent future calls to `shuffle` because they lock in the order of the shards. You should `shuffle` your dataset before splitting it. @@ -258,11 +249,8 @@ When you need to remove one or more columns, give [`IterableDataset.remove_colum 'idx': Value('int64')} ``` - - -Casting only works if the original feature type and new feature type are compatible. For example, you can cast a column with the feature type `Value('int32')` to `Value('bool')` if the original column only contains ones and zeros. - - +> [!TIP] +> Casting only works if the original feature type and new feature type are compatible. For example, you can cast a column with the feature type `Value('int32')` to `Value('bool')` if the original column only contains ones and zeros. Use [`IterableDataset.cast_column`] to change the feature type of just one column. Pass the column name and its new feature type as arguments: @@ -339,11 +327,8 @@ Specify the column to remove with the `remove_columns` argument in [`IterableDat 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..., 0, 0]} ``` - - -See other examples of batch processing in the [batched map processing](./process#batch-processing) documentation. They work the same for iterable datasets. - - +> [!TIP] +> See other examples of batch processing in the [batched map processing](./process#batch-processing) documentation. They work the same for iterable datasets. ### Filter @@ -371,11 +356,8 @@ You can filter rows in the dataset based on a predicate function using [`Dataset The `batch` method transforms your `IterableDataset` into an iterable of batches. 
This is particularly useful when you want to work with batches in your training loop or when using frameworks that expect batched inputs. - - -There is also a "Batch Processing" option when using the `map` function to apply a function to batches of data, which is discussed in the [Map section](#map) above. The `batch` method described here is different and provides a more direct way to create batches from your dataset. - - +> [!TIP] +> There is also a "Batch Processing" option when using the `map` function to apply a function to batches of data, which is discussed in the [Map section](#map) above. The `batch` method described here is different and provides a more direct way to create batches from your dataset. You can use the `batch` method like this: @@ -500,11 +482,8 @@ This can be used with the `StatefulDataLoader` from `torchdata`: >>> dataloader.load_state_dict(state_dict) # uses iterable_dataset.load_state_dict() under the hood ``` - - -Resuming returns exactly where the checkpoint was saved except if `.shuffle()` is used: examples from shuffle buffers are lost when resuming and the buffers are refilled with new data. - - +> [!TIP] +> Resuming returns exactly where the checkpoint was saved except if `.shuffle()` is used: examples from shuffle buffers are lost when resuming and the buffers are refilled with new data. ## Save diff --git a/docs/source/tutorial.md b/docs/source/tutorial.md index 559575df8dd..2f5b5acf0e0 100644 --- a/docs/source/tutorial.md +++ b/docs/source/tutorial.md @@ -4,11 +4,8 @@ Welcome to the 🤗 Datasets tutorials! These beginner-friendly tutorials will g The tutorials assume some basic knowledge of Python and a machine learning framework like PyTorch or TensorFlow. If you're already familiar with these, feel free to check out the [quickstart](./quickstart) to see what you can do with 🤗 Datasets. - - -The tutorials only cover the basic skills you need to use 🤗 Datasets. 
There are many other useful functionalities and applications that aren't discussed here. If you're interested in learning more, take a look at [Chapter 5](https://huggingface.co/course/chapter5/1?fw=pt) of the Hugging Face course. - - +> [!TIP] +> The tutorials only cover the basic skills you need to use 🤗 Datasets. There are many other useful functionalities and applications that aren't discussed here. If you're interested in learning more, take a look at [Chapter 5](https://huggingface.co/course/chapter5/1?fw=pt) of the Hugging Face course. If you have any questions about 🤗 Datasets, feel free to join and ask the community on our [forum](https://discuss.huggingface.co/c/datasets/10). diff --git a/docs/source/use_dataset.mdx b/docs/source/use_dataset.mdx index 849f3f1e09d..29f8364f9f3 100644 --- a/docs/source/use_dataset.mdx +++ b/docs/source/use_dataset.mdx @@ -22,11 +22,8 @@ Grab a dataset of your choice and follow along! Models cannot process raw text, so you'll need to convert the text into numbers. Tokenization provides a way to do this by dividing text into individual words called _tokens_. Tokens are finally converted to numbers. - - -Check out the [Tokenizers](https://huggingface.co/course/chapter2/4?fw=pt) section in Chapter 2 of the Hugging Face course to learn more about tokenization and different tokenization algorithms. - - +> [!TIP] +> Check out the [Tokenizers](https://huggingface.co/course/chapter2/4?fw=pt) section in Chapter 2 of the Hugging Face course to learn more about tokenization and different tokenization algorithms. **1**. Start by loading the [rotten_tomatoes](https://huggingface.co/datasets/rotten_tomatoes) dataset and the tokenizer corresponding to a pretrained [BERT](https://huggingface.co/bert-base-uncased) model. Using the same tokenizer as the pretrained model is important because you want to make sure the text is split in the same way. 
diff --git a/docs/source/use_with_jax.mdx b/docs/source/use_with_jax.mdx index 89d1628df06..a38dc7928ad 100644 --- a/docs/source/use_with_jax.mdx +++ b/docs/source/use_with_jax.mdx @@ -3,12 +3,9 @@ This document is a quick introduction to using `datasets` with JAX, with a particular focus on how to get `jax.Array` objects out of our datasets, and how to use them to train JAX models. - - -`jax` and `jaxlib` are required to reproduce to code above, so please make sure you -install them as `pip install datasets[jax]`. - - +> [!TIP] +> `jax` and `jaxlib` are required to reproduce the code above, so please make sure you +> install them as `pip install datasets[jax]`. ## Dataset format @@ -30,11 +27,8 @@ To get JAX arrays (numpy-like) instead, you can set the format of the dataset to [3, 4]], dtype=int32)} ``` - - -A [`Dataset`] object is a wrapper of an Arrow table, which allows fast reads from arrays in the dataset to JAX arrays. - - +> [!TIP] +> A [`Dataset`] object is a wrapper of an Arrow table, which allows fast reads from arrays in the dataset to JAX arrays. Note that the exact same procedure applies to `DatasetDict` objects, so that when setting the format of a `DatasetDict` to `jax`, all the `Dataset`s there @@ -138,12 +132,9 @@ String and binary objects are unchanged, since JAX only supports numbers. The [`Image`] and [`Audio`] feature types are also supported. - - -To use the [`Image`] feature type, you'll need to install the `vision` extra as -`pip install datasets[vision]`. - - +> [!TIP] +> To use the [`Image`] feature type, you'll need to install the `vision` extra as +> `pip install datasets[vision]`. ```py >>> from datasets import Dataset, Features, Image @@ -169,12 +160,9 @@ To use the [`Image`] feature type, you'll need to install the `vision` extra as [ 255, 255, 255]]]], dtype=uint8)} ``` - - -To use the [`Audio`] feature type, you'll need to install the `audio` extra as -`pip install datasets[audio]`.
- - +> [!TIP] +> To use the [`Audio`] feature type, you'll need to install the `audio` extra as +> `pip install datasets[audio]`. ```py >>> from datasets import Dataset, Features, Audio diff --git a/docs/source/use_with_numpy.mdx b/docs/source/use_with_numpy.mdx index a8084655915..bd0cd6877b7 100644 --- a/docs/source/use_with_numpy.mdx +++ b/docs/source/use_with_numpy.mdx @@ -23,11 +23,8 @@ To get NumPy arrays instead, you can set the format of the dataset to `numpy`: [3, 4]])} ``` - - -A [`Dataset`] object is a wrapper of an Arrow table, which allows fast reads from arrays in the dataset to NumPy arrays. - - +> [!TIP] +> A [`Dataset`] object is a wrapper of an Arrow table, which allows fast reads from arrays in the dataset to NumPy arrays. Note that the exact same procedure applies to `DatasetDict` objects, so that when setting the format of a `DatasetDict` to `numpy`, all the `Dataset`s there @@ -106,12 +103,9 @@ String and binary objects are unchanged, since NumPy only supports numbers. The [`Image`] and [`Audio`] feature types are also supported. - - -To use the [`Image`] feature type, you'll need to install the `vision` extra as -`pip install datasets[vision]`. - - +> [!TIP] +> To use the [`Image`] feature type, you'll need to install the `vision` extra as +> `pip install datasets[vision]`. ```py >>> from datasets import Dataset, Features, Image @@ -137,12 +131,9 @@ To use the [`Image`] feature type, you'll need to install the `vision` extra as [ 255, 255, 255]]]], dtype=uint8)} ``` - - -To use the [`Audio`] feature type, you'll need to install the `audio` extra as -`pip install datasets[audio]`. - - +> [!TIP] +> To use the [`Audio`] feature type, you'll need to install the `audio` extra as +> `pip install datasets[audio]`. 
```py >>> from datasets import Dataset, Features, Audio diff --git a/docs/source/use_with_pytorch.mdx b/docs/source/use_with_pytorch.mdx index 375d6facc3e..8612f35bd27 100644 --- a/docs/source/use_with_pytorch.mdx +++ b/docs/source/use_with_pytorch.mdx @@ -22,11 +22,8 @@ To get PyTorch tensors instead, you can set the format of the dataset to `pytorc [3, 4]])} ``` - - -A [`Dataset`] object is a wrapper of an Arrow table, which allows fast zero-copy reads from arrays in the dataset to PyTorch tensors. - - +> [!TIP] +> A [`Dataset`] object is a wrapper of an Arrow table, which allows fast zero-copy reads from arrays in the dataset to PyTorch tensors. To load the data as tensors on a GPU, specify the `device` argument: @@ -100,12 +97,9 @@ String and binary objects are unchanged, since PyTorch only supports numbers. The [`Image`] and [`Audio`] feature types are also supported. - - -To use the [`Image`] feature type, you'll need to install the `vision` extra as -`pip install datasets[vision]`. - - +> [!TIP] +> To use the [`Image`] feature type, you'll need to install the `vision` extra as +> `pip install datasets[vision]`. ```py >>> from datasets import Dataset, Features, Audio, Image @@ -131,12 +125,9 @@ torch.Size([2, 512, 512, 4]) [255, 255, 255, 255]]]], dtype=torch.uint8)} ``` - - -To use the [`Audio`] feature type, you'll need to install the `audio` extra as -`pip install datasets[audio]`. - - +> [!TIP] +> To use the [`Audio`] feature type, you'll need to install the `audio` extra as +> `pip install datasets[audio]`. ```py >>> from datasets import Dataset, Features, Audio, Image diff --git a/docs/source/use_with_spark.mdx b/docs/source/use_with_spark.mdx index 07767ca447f..cfd7d531115 100644 --- a/docs/source/use_with_spark.mdx +++ b/docs/source/use_with_spark.mdx @@ -43,11 +43,8 @@ times on the same DataFrame it won't re-run the Spark job that writes the datase You can set the cache location by passing `cache_dir=` to [`Dataset.from_spark`]. 
Make sure to use a disk that is available to both your workers and your current machine (the driver). - - -In a different session, a Spark DataFrame doesn't have the same [semantic hash](https://spark.apache.org/docs/3.2.0/api/python/reference/api/pyspark.sql.DataFrame.semanticHash.html), and it will rerun a Spark job and store it in a new cache. - - +> [!WARNING] +> In a different session, a Spark DataFrame doesn't have the same [semantic hash](https://spark.apache.org/docs/3.2.0/api/python/reference/api/pyspark.sql.DataFrame.semanticHash.html), and it will rerun a Spark job and store it in a new cache. ### Feature types diff --git a/docs/source/use_with_tensorflow.mdx b/docs/source/use_with_tensorflow.mdx index 14e0ac2b33c..b022f67edbd 100644 --- a/docs/source/use_with_tensorflow.mdx +++ b/docs/source/use_with_tensorflow.mdx @@ -23,11 +23,8 @@ array([[1, 2], [3, 4]])>} ``` - - -A [`Dataset`] object is a wrapper of an Arrow table, which allows fast reads from arrays in the dataset to TensorFlow tensors. - - +> [!TIP] +> A [`Dataset`] object is a wrapper of an Arrow table, which allows fast reads from arrays in the dataset to TensorFlow tensors. This can be useful for converting your dataset to a dict of `Tensor` objects, or for writing a generator to load TF samples from it. If you wish to convert the entire dataset to `Tensor`, simply query the full dataset: @@ -128,12 +125,9 @@ String and binary objects are unchanged, since PyTorch only supports numbers. The [`Image`] and [`Audio`] feature types are also supported. - - -To use the [`Image`] feature type, you'll need to install the `vision` extra as -`pip install datasets[vision]`. - - +> [!TIP] +> To use the [`Image`] feature type, you'll need to install the `vision` extra as +> `pip install datasets[vision]`. 
```py >>> from datasets import Dataset, Features, Audio, Image @@ -157,12 +151,9 @@ To use the [`Image`] feature type, you'll need to install the `vision` extra as [255, 255, 255, 255]]]], dtype=uint8)>} ``` - - -To use the [`Audio`] feature type, you'll need to install the `audio` extra as -`pip install datasets[audio]`. - - +> [!TIP] +> To use the [`Audio`] feature type, you'll need to install the `audio` extra as +> `pip install datasets[audio]`. ```py >>> from datasets import Dataset, Features, Audio, Image diff --git a/docs/source/video_dataset.mdx b/docs/source/video_dataset.mdx index dcc514611ce..28f02a9d751 100644 --- a/docs/source/video_dataset.mdx +++ b/docs/source/video_dataset.mdx @@ -2,21 +2,15 @@ This guide will show you how to create a video dataset with `VideoFolder` and some metadata. This is a no-code solution for quickly creating a video dataset with several thousand videos. - - -You can control access to your dataset by requiring users to share their contact information first. Check out the [Gated datasets](https://huggingface.co/docs/hub/datasets-gated) guide for more information about how to enable this feature on the Hub. - - +> [!TIP] +> You can control access to your dataset by requiring users to share their contact information first. Check out the [Gated datasets](https://huggingface.co/docs/hub/datasets-gated) guide for more information about how to enable this feature on the Hub. ## VideoFolder The `VideoFolder` is a dataset builder designed to quickly load a video dataset with several thousand videos without requiring you to write any code. - - -💡 Take a look at the [Split pattern hierarchy](repository_structure#split-pattern-hierarchy) to learn more about how `VideoFolder` creates dataset splits based on your dataset repository structure. 
- - +> [!TIP] +> 💡 Take a look at the [Split pattern hierarchy](repository_structure#split-pattern-hierarchy) to learn more about how `VideoFolder` creates dataset splits based on your dataset repository structure. `VideoFolder` automatically infers the class labels of your dataset based on the directory name. Store your dataset in a directory structure like: @@ -53,11 +47,8 @@ folder/test/dog/german_shepherd.mp4 folder/test/cat/bengal.mp4 ``` - - -If all video files are contained in a single directory or if they are not on the same level of directory structure, `label` column won't be added automatically. If you need it, set `drop_labels=False` explicitly. - - +> [!WARNING] +> If all video files are contained in a single directory or if they are not on the same level of directory structure, `label` column won't be added automatically. If you need it, set `drop_labels=False` explicitly. If there is additional information you'd like to include about your dataset, like text captions or bounding boxes, add it as a `metadata.csv` file in your folder. This lets you quickly create datasets for different computer vision tasks like text captioning or object detection. You can also use a JSONL file `metadata.jsonl` or a Parquet file `metadata.parquet`. diff --git a/docs/source/video_load.mdx b/docs/source/video_load.mdx index 5aad7aa0709..86330841f6f 100644 --- a/docs/source/video_load.mdx +++ b/docs/source/video_load.mdx @@ -1,18 +1,12 @@ # Load video data - - -Video support is experimental and is subject to change. - - +> [!WARNING] +> Video support is experimental and is subject to change. Video datasets have [`Video`] type columns, which contain `torchvision` objects. - - -To work with video datasets, you need to have the `torchvision` and `av` packages installed. Check out the [installation](https://github.com/pytorch/vision#installation) guide to learn how to install them. 
- - +> [!TIP] +> To work with video datasets, you need to have the `torchvision` and `av` packages installed. Check out the [installation](https://github.com/pytorch/vision#installation) guide to learn how to install them. When you load a video dataset and call the video column, the videos are decoded as `torchvision` Videos: @@ -24,11 +18,8 @@ When you load a video dataset and call the video column, the videos are decoded ``` - - -Index into a video dataset using the row index first and then the `video` column - `dataset[0]["video"]` - to avoid creating all the video objects in the dataset. Otherwise, this can be a slow and time-consuming process if you have a large dataset. - - +> [!WARNING] +> Index into a video dataset using the row index first and then the `video` column - `dataset[0]["video"]` - to avoid creating all the video objects in the dataset. Otherwise, this can be a slow and time-consuming process if you have a large dataset. For a guide on how to load any type of dataset, take a look at the general loading guide. @@ -149,11 +140,8 @@ Finally the `filters` argument lets you load only a subset of the dataset, based >>> dataset = load_dataset("username/dataset_name", streaming=True, filters=filters) ``` - - -For more information about creating your own `VideoFolder` dataset, take a look at the [Create a video dataset](./video_dataset) guide. - - +> [!TIP] +> For more information about creating your own `VideoFolder` dataset, take a look at the [Create a video dataset](./video_dataset) guide. ## WebDataset diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 36740e458b7..695bf310562 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -1458,11 +1458,8 @@ def from_sql( >>> ds = Dataset.from_sql(stmt, "postgres:///db_name") ``` - - - The returned dataset can only be cached if `con` is specified as URI string. 
- - + > [!TIP] + > The returned dataset can only be cached if `con` is specified as a URI string. """ from .io.sql import SqlDatasetReader diff --git a/src/datasets/utils/logging.py b/src/datasets/utils/logging.py index 0d673db4683..851c1a2b73d 100644 --- a/src/datasets/utils/logging.py +++ b/src/datasets/utils/logging.py @@ -96,16 +96,13 @@ def get_verbosity() -> int: Returns: Logging level, e.g., `datasets.logging.DEBUG` and `datasets.logging.INFO`. - - - HuggingFace datasets library has following logging levels: - - `datasets.logging.CRITICAL`, `datasets.logging.FATAL` - - `datasets.logging.ERROR` - - `datasets.logging.WARNING`, `datasets.logging.WARN` - - `datasets.logging.INFO` - - `datasets.logging.DEBUG` - - + > [!TIP] + > HuggingFace datasets library has the following logging levels: + > - `datasets.logging.CRITICAL`, `datasets.logging.FATAL` + > - `datasets.logging.ERROR` + > - `datasets.logging.WARNING`, `datasets.logging.WARN` + > - `datasets.logging.INFO` + > - `datasets.logging.DEBUG` """ return _get_library_root_logger().getEffectiveLevel()