diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md index b8febfe0595..3a00b42073a 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.md +++ b/.github/ISSUE_TEMPLATE/bug-report.md @@ -21,16 +21,9 @@ A clear and concise description of the expected results. ## Actual results Specify the actual results or traceback. -## Versions -Paste the output of the following code: -```python -import datasets -import sys -import platform - -print(f""" -- Datasets: {datasets.__version__} -- Python: {sys.version} -- Platform: {platform.platform()} -""") -``` +## Environment info +<!-- You can run the command `datasets-cli env` and copy-and-paste its output below. --> +- `datasets` version: +- Platform: +- Python version: +- PyArrow version: diff --git a/src/datasets/commands/__init__.py b/src/datasets/commands/__init__.py index 13171f42853..905e753955a 100644 --- a/src/datasets/commands/__init__.py +++ b/src/datasets/commands/__init__.py @@ -2,7 +2,7 @@ from argparse import ArgumentParser -class BaseTransformersCLICommand(ABC): +class BaseDatasetsCLICommand(ABC): @staticmethod @abstractmethod def register_subcommand(parser: ArgumentParser): diff --git a/src/datasets/commands/convert.py b/src/datasets/commands/convert.py index 3f30cfc36c2..b3de12e4695 100644 --- a/src/datasets/commands/convert.py +++ b/src/datasets/commands/convert.py @@ -3,7 +3,7 @@ import shutil from argparse import ArgumentParser, Namespace -from datasets.commands import BaseTransformersCLICommand +from datasets.commands import BaseDatasetsCLICommand from datasets.utils.logging import get_logger @@ -42,22 +42,24 @@ def convert_command_factory(args: Namespace): """ Factory function used to convert a model TF 1.0 checkpoint in a PyTorch checkpoint. 
- :return: ServeCommand + + Returns: ConvertCommand """ return ConvertCommand(args.tfds_path, args.datasets_directory) -class ConvertCommand(BaseTransformersCLICommand): +class ConvertCommand(BaseDatasetsCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): """ - Register this command to argparse so it's available for the transformer-cli - :param parser: Root parser to register command-specific arguments - :return: + Register this command to argparse so it's available for the datasets-cli + + Args: + parser: Root parser to register command-specific arguments """ train_parser = parser.add_parser( "convert", - help="CLI tool to convert a (nlp) TensorFlow-Dataset in a HuggingFace-NLP dataset.", + help="Convert a TensorFlow Datasets dataset to a HuggingFace Datasets dataset.", ) train_parser.add_argument( "--tfds_path", @@ -66,7 +68,7 @@ def register_subcommand(parser: ArgumentParser): help="Path to a TensorFlow Datasets folder to convert or a single tfds file to convert.", ) train_parser.add_argument( - "--datasets_directory", type=str, required=True, help="Path to the HuggingFace NLP folder." + "--datasets_directory", type=str, required=True, help="Path to the HuggingFace Datasets folder." 
) train_parser.set_defaults(func=convert_command_factory) diff --git a/src/datasets/commands/datasets_cli.py b/src/datasets/commands/datasets_cli.py index ff4d28704a8..7adc2de4200 100644 --- a/src/datasets/commands/datasets_cli.py +++ b/src/datasets/commands/datasets_cli.py @@ -2,7 +2,6 @@ from argparse import ArgumentParser from datasets.commands.convert import ConvertCommand -from datasets.commands.download import DownloadCommand from datasets.commands.dummy_data import DummyDataCommand from datasets.commands.env import EnvironmentCommand from datasets.commands.run_beam import RunBeamCommand @@ -17,7 +16,6 @@ def main(): # Register commands ConvertCommand.register_subcommand(commands_parser) - DownloadCommand.register_subcommand(commands_parser) EnvironmentCommand.register_subcommand(commands_parser) TestCommand.register_subcommand(commands_parser) RunBeamCommand.register_subcommand(commands_parser) diff --git a/src/datasets/commands/download.py b/src/datasets/commands/download.py deleted file mode 100644 index 6fd223490b2..00000000000 --- a/src/datasets/commands/download.py +++ /dev/null @@ -1,32 +0,0 @@ -from argparse import ArgumentParser - -from datasets.commands import BaseTransformersCLICommand - - -def download_command_factory(args): - return DownloadCommand(args.model, args.cache_dir, args.force) - - -class DownloadCommand(BaseTransformersCLICommand): - @staticmethod - def register_subcommand(parser: ArgumentParser): - download_parser = parser.add_parser("download") - download_parser.add_argument( - "--cache-dir", type=str, default=None, help="Path to location to store the models" - ) - download_parser.add_argument( - "--force", action="store_true", help="Force the model to be download even if already in cache-dir" - ) - download_parser.add_argument("model", type=str, help="Name of the model to download") - download_parser.set_defaults(func=download_command_factory) - - def __init__(self, model: str, cache: str, force: bool): - self._model = model - 
self._cache = cache - self._force = force - - def run(self): - from transformers import AutoModel, AutoTokenizer - - AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) - AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) diff --git a/src/datasets/commands/dummy_data.py b/src/datasets/commands/dummy_data.py index efca1b60820..43626362b3a 100644 --- a/src/datasets/commands/dummy_data.py +++ b/src/datasets/commands/dummy_data.py @@ -9,7 +9,7 @@ from typing import Optional from datasets import config -from datasets.commands import BaseTransformersCLICommand +from datasets.commands import BaseDatasetsCLICommand from datasets.load import import_main_class, prepare_module from datasets.utils import MockDownloadManager from datasets.utils.download_manager import DownloadManager @@ -212,10 +212,10 @@ def compress_autogenerated_dummy_data(self, path_to_dataset): shutil.rmtree(base_name) -class DummyDataCommand(BaseTransformersCLICommand): +class DummyDataCommand(BaseDatasetsCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): - test_parser = parser.add_parser("dummy_data") + test_parser = parser.add_parser("dummy_data", help="Generate dummy data.") test_parser.add_argument("--auto_generate", action="store_true", help="Automatically generate dummy data") test_parser.add_argument( "--n_lines", type=int, default=5, help="Number of lines or samples to keep when auto-generating dummy data" diff --git a/src/datasets/commands/env.py b/src/datasets/commands/env.py index baf3434e76f..95eef550501 100644 --- a/src/datasets/commands/env.py +++ b/src/datasets/commands/env.py @@ -1,54 +1,31 @@ import platform from argparse import ArgumentParser +import pyarrow + from datasets import __version__ as version -from datasets import config -from datasets.commands import BaseTransformersCLICommand +from datasets.commands import BaseDatasetsCLICommand def info_command_factory(_): return 
EnvironmentCommand() -class EnvironmentCommand(BaseTransformersCLICommand): +class EnvironmentCommand(BaseDatasetsCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): - download_parser = parser.add_parser("env") + download_parser = parser.add_parser("env", help="Print relevant system environment info.") download_parser.set_defaults(func=info_command_factory) def run(self): - pt_version = "not installed" - pt_cuda_available = "NA" - if config.TORCH_AVAILABLE: - import torch - - pt_version = torch.__version__ - pt_cuda_available = torch.cuda.is_available() - - tf_version = "not installed" - tf_cuda_available = "NA" - if config.TF_AVAILABLE: - import tensorflow as tf - - tf_version = tf.__version__ - try: - # deprecated in v2.1 - tf_cuda_available = tf.test.is_gpu_available() - except AttributeError: - # returns list of devices, convert to bool - tf_cuda_available = bool(tf.config.list_physical_devices("GPU")) - info = { "`datasets` version": version, "Platform": platform.platform(), "Python version": platform.python_version(), - "PyTorch version (GPU?)": "{} ({})".format(pt_version, pt_cuda_available), - "Tensorflow version (GPU?)": "{} ({})".format(tf_version, tf_cuda_available), - "Using GPU in script?": "", - "Using distributed or parallel set-up in script?": "", + "PyArrow version": pyarrow.__version__, } - print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n") + print("\nCopy-and-paste the text below in your GitHub issue.\n") print(self.format_dict(info)) return info diff --git a/src/datasets/commands/run_beam.py b/src/datasets/commands/run_beam.py index 7b671722efa..c8a884f33f3 100644 --- a/src/datasets/commands/run_beam.py +++ b/src/datasets/commands/run_beam.py @@ -6,7 +6,7 @@ from datasets import config from datasets.builder import DatasetBuilder -from datasets.commands import BaseTransformersCLICommand +from datasets.commands import BaseDatasetsCLICommand from datasets.load import 
import_main_class, prepare_module from datasets.utils.download_manager import DownloadConfig, GenerateMode @@ -25,10 +25,10 @@ def run_beam_command_factory(args): ) -class RunBeamCommand(BaseTransformersCLICommand): +class RunBeamCommand(BaseDatasetsCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): - run_beam_parser = parser.add_parser("run_beam") + run_beam_parser = parser.add_parser("run_beam", help="Run a Beam dataset processing pipeline.") run_beam_parser.add_argument("--name", type=str, default=None, help="Dataset processing name") run_beam_parser.add_argument( "--cache_dir", diff --git a/src/datasets/commands/test.py b/src/datasets/commands/test.py index 84aac9a6d6c..148e68b6b02 100644 --- a/src/datasets/commands/test.py +++ b/src/datasets/commands/test.py @@ -6,7 +6,7 @@ import datasets.config from datasets.builder import DatasetBuilder -from datasets.commands import BaseTransformersCLICommand +from datasets.commands import BaseDatasetsCLICommand from datasets.load import import_main_class, prepare_module from datasets.utils.download_manager import GenerateMode from datasets.utils.filelock import logger as fl_logger @@ -32,10 +32,10 @@ def test_command_factory(args): ) -class TestCommand(BaseTransformersCLICommand): +class TestCommand(BaseDatasetsCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): - test_parser = parser.add_parser("test") + test_parser = parser.add_parser("test", help="Test dataset implementation.") test_parser.add_argument("--name", type=str, default=None, help="Dataset processing name") test_parser.add_argument( "--cache_dir",