2 changes: 1 addition & 1 deletion docs/source/about_mapstyle_vs_iterable.mdx
@@ -138,7 +138,7 @@ But using a shuffle buffer is not enough to provide a satisfactory shuffling for

```python
# Stream from the internet
-my_iterable_dataset = load_dataset("c4", "en", split="train", streaming=True)
+my_iterable_dataset = load_dataset("c4", "en", split="train", streaming=True, trust_remote_code=True)
my_iterable_dataset.n_shards # 1024

# Stream from local files
9 changes: 8 additions & 1 deletion docs/source/dataset_script.mdx
@@ -3,12 +3,19 @@

<Tip>

-The dataset script is likely not needed if your dataset is in one of the following formats: CSV, JSON, JSON lines, text or Parquet.
+The dataset loading script is likely not needed if your dataset is in one of the following formats: CSV, JSON, JSON lines, text, images, audio or Parquet.
With those formats, you should be able to load your dataset automatically with [`~datasets.load_dataset`],
as long as your dataset repository has a [required structure](./repository_structure).

</Tip>


<Tip warning={true}>

In the next major release, the new safety features of 🤗 Datasets will disable running dataset loading scripts by default, and you will have to pass `trust_remote_code=True` to load datasets that require running a dataset script.

</Tip>
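Until then, the current default can be controlled through the `HF_DATASETS_TRUST_REMOTE_CODE_DEFAULT` environment variable introduced in `config.py`. A sketch of opting in to the stricter behavior early (the variable name comes from this change; the exact semantics may evolve):

```shell
# Require an explicit trust_remote_code=True for any script-based dataset
# by turning the permissive default off (assumed semantics: "0" means False).
export HF_DATASETS_TRUST_REMOTE_CODE_DEFAULT=0
```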

Write a dataset script to load and share datasets that consist of data files in unsupported formats or require more complex data preparation.
This is a more advanced way to define a dataset than using [YAML metadata in the dataset card](./repository_structure#define-your-splits-in-yaml).
A dataset script is a Python file that defines the different configurations and splits of your dataset, as well as how to download and process the data.
24 changes: 24 additions & 0 deletions docs/source/load_hub.mdx
@@ -99,3 +99,27 @@ Then load the configuration you want:

>>> mindsFR = load_dataset("PolyAI/minds14", "fr-FR", split="train")
```

## Remote code

Certain dataset repositories contain a loading script with the Python code used to generate the dataset.
Those datasets are generally exported to Parquet by Hugging Face, so that 🤗 Datasets can load them quickly without running a loading script.

Even if a Parquet export is not available, you can still use any dataset with Python code in its repository with `load_dataset`.
All files and code uploaded to the Hub are scanned for malware (refer to the Hub security documentation for more information), but you should still review the dataset loading scripts and their authors to avoid executing malicious code on your machine. Set `trust_remote_code=True` to use a dataset with a loading script, or you will get a warning:

```py
>>> from datasets import get_dataset_config_names, get_dataset_split_names, load_dataset

>>> c4 = load_dataset("c4", "en", split="train", trust_remote_code=True)
>>> get_dataset_config_names("c4", trust_remote_code=True)
['en', 'realnewslike', 'en.noblocklist', 'en.noclean']
>>> get_dataset_split_names("c4", "en", trust_remote_code=True)
['train', 'validation']
```

<Tip warning={true}>

In the next major release, the new safety features of 🤗 Datasets will disable running dataset loading scripts by default, and you will have to pass `trust_remote_code=True` to load datasets that require running a dataset script.

</Tip>
8 changes: 5 additions & 3 deletions docs/source/loading.mdx
@@ -80,9 +80,11 @@ You may have a 🤗 Datasets loading script locally on your computer. In this ca
- The local path to the loading script file.
- The local path to the directory containing the loading script file (only if the script file has the same name as the directory).

Pass `trust_remote_code=True` to allow 🤗 Datasets to execute the loading script:

```py
->>> dataset = load_dataset("path/to/local/loading_script/loading_script.py", split="train")
->>> dataset = load_dataset("path/to/local/loading_script", split="train") # equivalent because the file has the same name as the directory
+>>> dataset = load_dataset("path/to/local/loading_script/loading_script.py", split="train", trust_remote_code=True)
+>>> dataset = load_dataset("path/to/local/loading_script", split="train", trust_remote_code=True) # equivalent because the file has the same name as the directory
```

### Edit loading script
@@ -234,7 +236,7 @@ In this case, each process is given a subset of shards to prepare:
```python
from datasets import load_dataset

-oscar_afrikaans = load_dataset("oscar-corpus/OSCAR-2201", "af", num_proc=8)
+oscar_afrikaans = load_dataset("oscar-corpus/OSCAR-2201", "af", num_proc=8, trust_remote_code=True)
imagenet = load_dataset("imagenet-1k", num_proc=8)
ml_librispeech_spanish = load_dataset("facebook/multilingual_librispeech", "spanish", num_proc=8)
```
16 changes: 8 additions & 8 deletions docs/source/stream.mdx
@@ -17,7 +17,7 @@ For example, the English split of the [oscar-corpus/OSCAR-2201](https://huggingf

```py
>>> from datasets import load_dataset
->>> dataset = load_dataset('oscar-corpus/OSCAR-2201', 'en', split='train', streaming=True)
+>>> dataset = load_dataset('oscar-corpus/OSCAR-2201', 'en', split='train', streaming=True, trust_remote_code=True)
>>> print(next(iter(dataset)))
{'id': 0, 'text': 'Founded in 2015, Golden Bees is a leading programmatic recruitment platform dedicated to employers, HR agencies and job boards. The company has developed unique HR-custom technologies and predictive algorithms to identify and attract the best candidates for a job opportunity.', ...
```
@@ -142,8 +142,8 @@ You can split your dataset one of two ways:

```py
>>> from datasets import interleave_datasets
->>> en_dataset = load_dataset('oscar', "unshuffled_deduplicated_en", split='train', streaming=True)
->>> fr_dataset = load_dataset('oscar', "unshuffled_deduplicated_fr", split='train', streaming=True)
+>>> en_dataset = load_dataset('oscar', "unshuffled_deduplicated_en", split='train', streaming=True, trust_remote_code=True)
+>>> fr_dataset = load_dataset('oscar', "unshuffled_deduplicated_fr", split='train', streaming=True, trust_remote_code=True)

>>> multilingual_dataset = interleave_datasets([en_dataset, fr_dataset])
>>> list(multilingual_dataset.take(2))
@@ -176,7 +176,7 @@ Provide [`IterableDataset.rename_column`] with the name of the original column,

```py
>>> from datasets import load_dataset
->>> dataset = load_dataset('mc4', 'en', streaming=True, split='train')
+>>> dataset = load_dataset('mc4', 'en', streaming=True, split='train', trust_remote_code=True)
>>> dataset = dataset.rename_column("text", "content")
```

@@ -186,7 +186,7 @@ When you need to remove one or more columns, give [`IterableDataset.remove_colum

```py
>>> from datasets import load_dataset
->>> dataset = load_dataset('mc4', 'en', streaming=True, split='train')
+>>> dataset = load_dataset('mc4', 'en', streaming=True, split='train', trust_remote_code=True)
>>> dataset = dataset.remove_columns('timestamp')
```

@@ -251,7 +251,7 @@ Next, apply this function to the dataset with [`IterableDataset.map`]:

```py
>>> from datasets import load_dataset
->>> dataset = load_dataset('oscar', 'unshuffled_deduplicated_en', streaming=True, split='train')
+>>> dataset = load_dataset('oscar', 'unshuffled_deduplicated_en', streaming=True, split='train', trust_remote_code=True)
>>> updated_dataset = dataset.map(add_prefix)
>>> list(updated_dataset.take(3))
[{'id': 0, 'text': 'My text: Mtendere Village was inspired by...'},
@@ -280,7 +280,7 @@ Specify the column to remove with the `remove_columns` argument in [`IterableDat
```py
>>> from datasets import load_dataset
>>> from transformers import AutoTokenizer
->>> dataset = load_dataset("mc4", "en", streaming=True, split="train")
+>>> dataset = load_dataset("mc4", "en", streaming=True, split="train", trust_remote_code=True)
>>> tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
>>> def encode(examples):
... return tokenizer(examples['text'], truncation=True, padding='max_length')
@@ -302,7 +302,7 @@ You can filter rows in the dataset based on a predicate function using [`Dataset

```py
>>> from datasets import load_dataset
->>> dataset = load_dataset('oscar', 'unshuffled_deduplicated_en', streaming=True, split='train')
+>>> dataset = load_dataset('oscar', 'unshuffled_deduplicated_en', streaming=True, split='train', trust_remote_code=True)
>>> start_with_ar = dataset.filter(lambda example: example['text'].startswith('Ar'))
>>> next(iter(start_with_ar))
{'id': 4, 'text': 'Are you looking for Number the Stars (Essential Modern Classics)?...'}
2 changes: 1 addition & 1 deletion docs/source/use_with_pytorch.mdx
@@ -205,7 +205,7 @@ An iterable dataset from `datasets` inherits from `torch.utils.data.IterableData
If the dataset is split in several shards (i.e. if the dataset consists of multiple data files), then you can stream in parallel using `num_workers`:

```py
->>> my_iterable_dataset = load_dataset("c4", "en", streaming=True, split="train")
+>>> my_iterable_dataset = load_dataset("c4", "en", streaming=True, split="train", trust_remote_code=True)
>>> my_iterable_dataset.n_shards
1024
>>> dataloader = DataLoader(my_iterable_dataset, batch_size=32, num_workers=4)
14 changes: 14 additions & 0 deletions src/datasets/config.py
@@ -3,6 +3,7 @@
import os
import platform
from pathlib import Path
from typing import Optional

from packaging import version

@@ -31,7 +32,9 @@

# General environment variables accepted values for booleans
ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
ENV_VARS_FALSE_VALUES = {"0", "OFF", "NO", "FALSE"}
ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
ENV_VARS_FALSE_AND_AUTO_VALUES = ENV_VARS_FALSE_VALUES.union({"AUTO"})


# Imports
@@ -170,6 +173,17 @@
os.environ.get("HF_UPDATE_DOWNLOAD_COUNTS", "AUTO").upper() in ENV_VARS_TRUE_AND_AUTO_VALUES
)

# Remote dataset scripts support
__HF_DATASETS_TRUST_REMOTE_CODE_DEFAULT = os.environ.get("HF_DATASETS_TRUST_REMOTE_CODE_DEFAULT", "1")
HF_DATASETS_TRUST_REMOTE_CODE_DEFAULT: Optional[bool] = (
True
if __HF_DATASETS_TRUST_REMOTE_CODE_DEFAULT.upper() in ENV_VARS_TRUE_VALUES
else False
if __HF_DATASETS_TRUST_REMOTE_CODE_DEFAULT.upper() in ENV_VARS_FALSE_VALUES
else None
)
TIME_OUT_REMOTE_CODE = 15

# Batch size constants. For more info, see:
# https://github.com/apache/arrow/blob/master/docs/source/cpp/arrays.rst#size-limitations-and-recommendations)
DEFAULT_MAX_BATCH_SIZE = 1000
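The three-valued parse of `HF_DATASETS_TRUST_REMOTE_CODE_DEFAULT` above (`True`, `False`, or `None` for unrecognized values) can be sketched as a standalone helper; the function name here is hypothetical, but the logic mirrors the chained conditional in `config.py`:

```python
ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
ENV_VARS_FALSE_VALUES = {"0", "OFF", "NO", "FALSE"}

def parse_trust_remote_code_default(raw: str):
    """Map a raw env var string to True, False, or None (undecided)."""
    value = raw.upper()
    if value in ENV_VARS_TRUE_VALUES:
        return True
    if value in ENV_VARS_FALSE_VALUES:
        return False
    return None  # unrecognized value: leave the decision to the caller

print(parse_trust_remote_code_default("yes"))   # True
print(parse_trust_remote_code_default("OFF"))   # False
print(parse_trust_remote_code_default("auto"))  # None
```

Returning `None` rather than raising lets downstream code distinguish "explicitly allowed", "explicitly forbidden", and "no preference expressed".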