From 993bddd436859e7a51c3919fe414b3c5dd38cd5b Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 20 May 2021 15:50:05 +0200 Subject: [PATCH 1/2] Add args description to DatasetInfo --- src/datasets/info.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/src/datasets/info.py b/src/datasets/info.py index 8e323bd4c33..a93dbb4bd22 100644 --- a/src/datasets/info.py +++ b/src/datasets/info.py @@ -93,23 +93,24 @@ class DatasetInfo: Note: Not all fields are known on construction and may be updated later. Attributes: - description (str): - citation (str): - homepage (str): - license (str): - features (Features, optional): + description (str): A description of the dataset. + citation (str): A BibTeX citation of the dataset. + homepage (str): A URL to the official homepage for the dataset. + license (str): The dataset's license. + features (Features, optional): The features used to specify the dataset's columns, types and conversion methods. post_processed (PostProcessedInfo, optional): - supervised_keys (SupervisedKeysData, optional): - builder_name (str, optional) - config_name (str, optional) - version (str or Version, optional): - splits (dict, optional): - download_checksums (dict, optional): - download_size (int, optional): + supervised_keys (SupervisedKeysData, optional): Specifies the input feature and the label for supervised learning if applicable for the dataset. + builder_name (str, optional): The name of the :class:`GeneratorBasedBuilder` subclass used to create the dataset. Usually matched to the corresponding script name, but with CamelCase instead of snake_case. + config_name (str, optional): The name of the configuration derived from :class:`BuilderConfig` + version (str or Version, optional): The version of the dataset. + splits (dict, optional): The mapping between split name and metadata. 
+ download_checksums (dict, optional): The mapping between the URL to download the dataset's checksums and corresponding metadata. + download_size (int, optional): The size of the compressed dataset in bytes. post_processing_size (int, optional): - dataset_size (int, optional): - size_in_bytes (int, optional): - task_templates (List[TaskTemplate], optional): + dataset_size (int, optional): The combined size of the Apache Arrow tables for all splits in bytes. + size_in_bytes (int, optional): The combined size of all files associated with the dataset. + task_templates (List[TaskTemplate], optional): The task templates to prepare the dataset for during training and evaluation. Each template casts the dataset's :class:`Features` to standardized column names and types as detailed in :py:mod:`datasets.tasks`. + **config_kwargs: Keyword arguments to be passed to the :class:`BuilderConfig` and used in the :class:`DatasetBuilder`. """ # Set in the dataset scripts From c70ab243a1acebe52d33e29a5848671df8dc8106 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Fri, 21 May 2021 15:38:23 +0200 Subject: [PATCH 2/2] Add missing attributes descriptions --- src/datasets/info.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/datasets/info.py b/src/datasets/info.py index a93dbb4bd22..6b65f294716 100644 --- a/src/datasets/info.py +++ b/src/datasets/info.py @@ -96,19 +96,19 @@ class DatasetInfo: description (str): A description of the dataset. citation (str): A BibTeX citation of the dataset. homepage (str): A URL to the official homepage for the dataset. - license (str): The dataset's license. - features (Features, optional): The features used to specify the dataset's columns, types and conversion methods. - post_processed (PostProcessedInfo, optional): - supervised_keys (SupervisedKeysData, optional): Specifies the input feature and the label for supervised learning if applicable for the dataset. 
- builder_name (str, optional): The name of the :class:`GeneratorBasedBuilder` subclass used to create the dataset. Usually matched to the corresponding script name, but with CamelCase instead of snake_case. + license (str): The dataset's license. It can be the name of the license or a paragraph containing the terms of the license. + features (Features, optional): The features used to specify the dataset's column types. + post_processed (PostProcessedInfo, optional): Information regarding the resources of a possible post-processing of a dataset. For example, it can contain information about an index. + supervised_keys (SupervisedKeysData, optional): Specifies the input feature and the label for supervised learning if applicable for the dataset (legacy from TFDS). + builder_name (str, optional): The name of the :class:`GeneratorBasedBuilder` subclass used to create the dataset. Usually matched to the corresponding script name. It is also the snake_case version of the dataset builder class name. config_name (str, optional): The name of the configuration derived from :class:`BuilderConfig` version (str or Version, optional): The version of the dataset. splits (dict, optional): The mapping between split name and metadata. download_checksums (dict, optional): The mapping between the URL to download the dataset's checksums and corresponding metadata. - download_size (int, optional): The size of the compressed dataset in bytes. - post_processing_size (int, optional): - dataset_size (int, optional): The combined size of the Apache Arrow tables for all splits in bytes. - size_in_bytes (int, optional): The combined size of all files associated with the dataset. + download_size (int, optional): The size of the files to download to generate the dataset, in bytes. + post_processing_size (int, optional): Size of the dataset in bytes after post-processing, if any. + dataset_size (int, optional): The combined size in bytes of the Arrow tables for all splits. 
+ size_in_bytes (int, optional): The combined size in bytes of all files associated with the dataset (downloaded files + Arrow files). task_templates (List[TaskTemplate], optional): The task templates to prepare the dataset for during training and evaluation. Each template casts the dataset's :class:`Features` to standardized column names and types as detailed in :py:mod:`datasets.tasks`. **config_kwargs: Keyword arguments to be passed to the :class:`BuilderConfig` and used in the :class:`DatasetBuilder`. """