Merged
29 commits
9b97589
Added support for ingestion yolo and tweaked git_ignore to allow fold…
Jan 8, 2023
efa503b
Ran black .
Jan 8, 2023
6cdb6bf
Fixed missing import and linting
Jan 8, 2023
634577a
Fixed more typing issues
Jan 8, 2023
6a53112
Tweaked bugout reporting for dataset.py where token was not user prop…
Jan 9, 2023
af87c68
fixed typing in dataset.py
Jan 10, 2023
649c7c0
Removed unecessary variables from coco
Jan 10, 2023
fcccbf5
Fixed issues related to linked tensors
Jan 10, 2023
a434643
Added tests
Jan 10, 2023
838ac02
Fixed linting
Jan 10, 2023
fd64ae4
Fixed some issues with linked tensors and made minor refactors
Jan 10, 2023
7472a46
Added ingest_yolo to init file
Jan 10, 2023
dcaf0e6
Added fix to links
Jan 10, 2023
70ca93f
Updated docstrings and made tweaks.
Jan 10, 2023
397418c
Ran black
Jan 10, 2023
40d8e9e
Fixed typo in docstring
Jan 10, 2023
4c2c338
Fixed issue with linked test
Jan 10, 2023
6b9b756
Improved ingest_yolo docstring and fixed username to org_id in ingest…
Jan 10, 2023
6af15ea
Fixed formatting problem in docstring
Jan 10, 2023
ffa4f82
Fixed tests
istranic Jan 11, 2023
ca2d806
Added better error checking and more dataset creation to after data v…
istranic Jan 11, 2023
082aabb
Moved dataset creation to after data verification in ingest_coco.
istranic Jan 11, 2023
a3b0dfc
Increased test coverage
istranic Jan 13, 2023
923405f
Fixed tests
istranic Jan 13, 2023
046abbe
Fixed auto-inference of htype for coordinates
istranic Jan 13, 2023
e5374ad
Fixed tests
istranic Jan 13, 2023
a52a971
FIxed test
istranic Jan 14, 2023
5360fcd
Fixed point of dataset creation in ingest functions. Fixed bugout rep…
istranic Jan 15, 2023
c39b879
Fixed bug
istranic Jan 15, 2023
1 change: 0 additions & 1 deletion .gitignore
@@ -190,7 +190,6 @@ hyper.sublime-*
.kube-temp

hyper_search/
yolo/
logs/

# other
1 change: 1 addition & 0 deletions deeplake/__init__.py
@@ -44,6 +44,7 @@
ingest = api_dataset.ingest
connect = api_dataset.connect
ingest_coco = api_dataset.ingest_coco
ingest_yolo = api_dataset.ingest_yolo
ingest_kaggle = api_dataset.ingest_kaggle
ingest_dataframe = api_dataset.ingest_dataframe
ingest_huggingface = huggingface.ingest_huggingface
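With the export above, YOLO ingestion becomes callable from the package root, mirroring ingest_coco. A minimal local-ingestion sketch (the paths below are placeholders, not part of this PR):

import deeplake

# Hypothetical paths; images and YOLO .txt annotations live side by side here.
ds = deeplake.ingest_yolo(
    "./yolo_data",
    dest="./yolo_ds",
    class_names_file="./yolo_data/classes.names",
    progressbar=True,
    num_workers=0,
)
print(ds)  # inspect the tensors created from the images and annotations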
170 changes: 158 additions & 12 deletions deeplake/api/dataset.py
@@ -8,6 +8,7 @@
from deeplake.auto.unstructured.kaggle import download_kaggle_dataset
from deeplake.auto.unstructured.image_classification import ImageClassification
from deeplake.auto.unstructured.coco.coco import CocoDataset
from deeplake.auto.unstructured.yolo.yolo import YoloDataset
from deeplake.client.client import DeepLakeBackendClient
from deeplake.client.log import logger
from deeplake.core.dataset import Dataset, dataset_factory
@@ -1023,7 +1024,7 @@ def ingest_coco(
>>> ds = deeplake.ingest_coco(
>>> "path/to/images/directory",
>>> ["path/to/annotation/file1.json", "path/to/annotation/file2.json"],
>>> dest="hub://username/dataset",
>>> dest="hub://org_id/dataset",
>>> key_to_tensor_mapping={"category_id": "labels", "bbox": "boxes"},
>>> file_to_group_mapping={"file1.json": "group1", "file2.json": "group2"},
>>> ignore_keys=["area", "image_id", "id"],
@@ -1034,7 +1035,7 @@
>>> ds = deeplake.ingest_coco(
>>> "s3://bucket/images/directory",
>>> "s3://bucket/annotation/file1.json",
>>> dest="hub://username/dataset",
>>> dest="hub://org_id/dataset",
>>> ignore_one_group=True,
>>> ignore_keys=["area", "image_id", "id"],
>>> image_settings={"name": "images", "linked": True, "creds_key": "my_managed_creds_key", "sample_compression": "jpeg"},
@@ -1048,7 +1049,7 @@
annotation_files (str, pathlib.Path, List[str]): Path to JSON annotation files in COCO format.
dest (str, pathlib.Path):
- The full path to the dataset. Can be:
- a Deep Lake cloud path of the form ``hub://username/datasetname``. To write to Deep Lake cloud datasets, ensure that you are logged in to Deep Lake (use 'activeloop login' from command line)
- a Deep Lake cloud path of the form ``hub://org_id/datasetname``. To write to Deep Lake cloud datasets, ensure that you are logged in to Deep Lake (use 'activeloop login' from command line), or pass in a token using the 'token' parameter.
- an s3 path of the form ``s3://bucketname/path/to/dataset``. Credentials are required in either the environment or passed to the creds argument.
- a local file system path of the form ``./path/to/dataset`` or ``~/path/to/dataset`` or ``path/to/dataset``.
- a memory path of the form ``mem://path/to/dataset`` which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
@@ -1070,6 +1071,7 @@
Raises:
IngestionError: If either ``key_to_tensor_mapping`` or ``file_to_group_mapping`` are not one-to-one.
"""

dest = convert_pathlib_to_string_if_needed(dest)
images_directory = convert_pathlib_to_string_if_needed(images_directory)
annotation_files = (
@@ -1078,7 +1080,12 @@
else convert_pathlib_to_string_if_needed(annotation_files)
)

ds = deeplake.empty(dest, creds=dest_creds, verbose=False, **dataset_kwargs)
feature_report_path(
dest,
"ingest_coco",
{"num_workers": num_workers},
token=dataset_kwargs.get("token", None),
)

unstructured = CocoDataset(
source=images_directory,
@@ -1091,6 +1098,130 @@
creds=src_creds,
)
structure = unstructured.prepare_structure(inspect_limit)

ds = deeplake.empty(dest, creds=dest_creds, verbose=False, **dataset_kwargs)

structure.create_missing(ds)

unstructured.structure(
ds,
progressbar,
num_workers,
)

return ds

@staticmethod
def ingest_yolo(
data_directory: Union[str, pathlib.Path],
dest: Union[str, pathlib.Path],
class_names_file: Optional[Union[str, pathlib.Path]] = None,
annotations_directory: Optional[Union[str, pathlib.Path]] = None,
allow_no_annotation: bool = False,
image_params: Optional[Dict] = None,
label_params: Optional[Dict] = None,
coordinates_params: Optional[Dict] = None,
src_creds: Optional[Dict] = None,
dest_creds: Optional[Dict] = None,
image_creds_key: Optional[str] = None,
inspect_limit: int = 1000,
progressbar: bool = True,
num_workers: int = 0,
connect_kwargs: Optional[Dict] = None,
**dataset_kwargs,
) -> Dataset:
"""Ingest images and annotations (bounding boxes or polygons) in YOLO format to a Deep Lake Dataset.

Examples:
>>> ds = deeplake.ingest_yolo(
>>> "path/to/data/directory",
>>> dest="hub://org_id/dataset",
>>> allow_no_annotation=True,
>>> token="my_activeloop_token",
>>> num_workers=4,
>>> )
>>> # or ingest data from cloud
>>> ds = deeplake.ingest_yolo(
>>> "s3://bucket/data_directory",
>>> dest="hub://org_id/dataset",
>>> image_params={"name": "image_links", "htype": "link[image]"},
>>> image_creds_key="my_s3_managed_credentials",
>>> src_creds=aws_creds, # Can also be inferred from environment
>>> token="my_activeloop_token",
>>> num_workers=4,
>>> )

Args:
data_directory (str, pathlib.Path): The path to the directory containing the data (image files and, optionally, annotation files; see the 'annotations_directory' input for specifying annotations in a separate directory).
dest (str, pathlib.Path):
- The full path to the dataset. Can be:
- a Deep Lake cloud path of the form ``hub://org_id/datasetname``. To write to Deep Lake cloud datasets, ensure that you are logged in to Deep Lake (use 'activeloop login' from command line), or pass in a token using the 'token' parameter.
- an s3 path of the form ``s3://bucketname/path/to/dataset``. Credentials are required in either the environment or passed to the creds argument.
- a local file system path of the form ``./path/to/dataset`` or ``~/path/to/dataset`` or ``path/to/dataset``.
- a memory path of the form ``mem://path/to/dataset`` which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
class_names_file (Optional[Union[str, pathlib.Path]]): Path to the file containing the class names on separate lines. This is typically a file titled classes.names.
annotations_directory (Optional[Union[str, pathlib.Path]]): Path to directory containing the annotations. If specified, the 'data_directory' will not be examined for annotations.
allow_no_annotation (bool): Flag to determine whether missing annotation files corresponding to an image should be treated as empty annotations. Set to ``False`` by default.
image_params (Optional[Dict]): A dictionary containing parameters for the images tensor.
label_params (Optional[Dict]): A dictionary containing parameters for the labels tensor.
coordinates_params (Optional[Dict]): A dictionary containing parameters for the coordinates tensor. This tensor contains either bounding boxes or polygons.
src_creds (Optional[Dict]): Credentials to access the source path. If not provided, will be inferred from the environment.
dest_creds (Optional[Dict]): A dictionary containing credentials used to access the destination path of the dataset.
image_creds_key (Optional[str]): creds_key for linked tensors, applicable if the htype for the images tensor is specified as 'link[image]' in the 'image_params' input.
inspect_limit (int): The maximum number of annotations to inspect, in order to infer whether they are bounding boxes or polygons. This input is ignored if the htype is specified in the 'coordinates_params'.
progressbar (bool): Enables or disables ingestion progress bar. Set to ``True`` by default.
num_workers (int): The number of workers to use for ingestion. Set to ``0`` by default.
connect_kwargs (Optional[Dict]): If specified, the dataset will be connected to Platform, and connect_kwargs will be passed to :func:`ds.connect`.
**dataset_kwargs: Any arguments passed here will be forwarded to the dataset creator function. See :func:`deeplake.empty`.

Returns:
Dataset: The Dataset created from the images and YOLO annotations.

Raises:
IngestionError: If annotations are not found for all the images and 'allow_no_annotation' is False
"""

dest = convert_pathlib_to_string_if_needed(dest)
data_directory = convert_pathlib_to_string_if_needed(data_directory)

annotations_directory = (
convert_pathlib_to_string_if_needed(annotations_directory)
if annotations_directory is not None
else None
)

class_names_file = (
convert_pathlib_to_string_if_needed(class_names_file)
if class_names_file is not None
else None
)

feature_report_path(
dest,
"ingest_yolo",
{"num_workers": num_workers},
token=dataset_kwargs.get("token", None),
)

unstructured = YoloDataset(
data_directory=data_directory,
class_names_file=class_names_file,
annotations_directory=annotations_directory,
image_params=image_params,
label_params=label_params,
coordinates_params=coordinates_params,
allow_no_annotation=allow_no_annotation,
creds=src_creds,
image_creds_key=image_creds_key,
inspect_limit=inspect_limit,
)

structure = unstructured.prepare_structure()

ds = deeplake.empty(dest, creds=dest_creds, verbose=False, **dataset_kwargs)
if connect_kwargs is not None:
ds.connect(**connect_kwargs, token=dataset_kwargs.get("token", None))

structure.create_missing(ds)

unstructured.structure(
@@ -1185,6 +1316,7 @@ def ingest(
"Progressbar": progressbar,
"Summary": summary,
},
token=dataset_kwargs.get("token", None),
)

src = convert_pathlib_to_string_if_needed(src)
@@ -1212,11 +1344,11 @@
if images_compression is None:
raise InvalidFileExtension(src)

ds = deeplake.dataset(dest, creds=dest_creds, **dataset_kwargs)

# TODO: support more than just image classification (and update docstring)
unstructured = ImageClassification(source=src)

ds = deeplake.dataset(dest, creds=dest_creds, **dataset_kwargs)

# TODO: auto detect compression
unstructured.structure(
ds, # type: ignore
@@ -1278,6 +1410,7 @@ def ingest_kaggle(
"Progressbar": progressbar,
"Summary": summary,
},
token=dataset_kwargs.get("token", None),
)

if os.path.isdir(src) and os.path.isdir(dest):
@@ -1334,36 +1467,49 @@ def ingest_dataframe(
import pandas as pd
from deeplake.auto.structured.dataframe import DataFrame

feature_report_path(
convert_pathlib_to_string_if_needed(dest),
"ingest_dataframe",
{},
token=dataset_kwargs.get("token", None),
)

if not isinstance(src, pd.DataFrame):
raise Exception("Source provided is not a valid pandas dataframe object")

structured = DataFrame(src)

if isinstance(dest, Dataset):
ds = dest
else:
dest = convert_pathlib_to_string_if_needed(dest)
ds = deeplake.dataset(dest, creds=dest_creds, **dataset_kwargs)

structured = DataFrame(src)
structured.fill_dataset(ds, progressbar) # type: ignore
return ds # type: ignore

@staticmethod
@deeplake_reporter.record_call
def list(
workspace: str = "",
org_id: str = "",
token: Optional[str] = None,
) -> None:
"""List all available Deep Lake cloud datasets.

Args:
workspace (str): Specify user/organization name. If not given,
org_id (str): Specify organization id. If not given,
returns a list of all datasets that can be accessed, regardless of what workspace they are in.
Otherwise, lists all datasets in the given workspace.
Otherwise, lists all datasets in the given organization.
token (str, optional): Activeloop token, used for fetching credentials for Deep Lake datasets. This is optional, tokens are normally autogenerated.

Returns:
List: List of dataset names.
"""

deeplake_reporter.feature_report(
feature_name="list",
parameters={"org_id": org_id},
)

client = DeepLakeBackendClient(token=token)
datasets = client.get_datasets(workspace=workspace)
datasets = client.get_datasets(workspace=org_id)
return datasets
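Taken together, a sketch of the cloud-to-cloud flow the new docstring describes, restricted to parameters shown above (bucket names, creds keys, and the token are placeholders; `list` is assumed to remain exposed as deeplake.list):

import deeplake

aws_creds = {"aws_access_key_id": "...", "aws_secret_access_key": "..."}  # placeholder source credentials

ds = deeplake.ingest_yolo(
    "s3://my-bucket/yolo_data",                   # hypothetical source bucket
    dest="hub://org_id/yolo_dataset",
    class_names_file="s3://my-bucket/yolo_data/classes.names",
    image_params={"name": "image_links", "htype": "link[image]"},
    image_creds_key="my_s3_managed_credentials",  # managed creds key registered with the org
    src_creds=aws_creds,                          # can also be inferred from the environment
    token="my_activeloop_token",                  # forwarded through **dataset_kwargs to deeplake.empty
    num_workers=4,
)

# The renamed argument on list(): datasets are now listed per organization.
deeplake.list(org_id="org_id", token="my_activeloop_token")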
6 changes: 2 additions & 4 deletions deeplake/auto/tests/test_coco_template.py
@@ -11,13 +11,12 @@ def test_full_dataset_structure(local_ds):
dataset_structure = DatasetStructure(ignore_one_group=False)

dataset_structure.add_first_level_tensor(
TensorStructure("tensor1", params={"htype": "generic"}, primary=False)
TensorStructure("tensor1", params={"htype": "generic"})
)
dataset_structure.add_first_level_tensor(
TensorStructure(
"images",
params={"htype": "image", "sample_compression": "jpeg"},
primary=True,
)
)

@@ -51,13 +50,12 @@ def test_missing_dataset_structure(local_ds):
local_ds.create_tensor("annotations/masks", htype="binary_mask")

dataset_structure.add_first_level_tensor(
TensorStructure("tensor1", params={"htype": "generic"}, primary=False)
TensorStructure("tensor1", params={"htype": "generic"})
)
dataset_structure.add_first_level_tensor(
TensorStructure(
"images",
params={"htype": "image", "sample_compression": "jpeg"},
primary=True,
)
)

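For reference, a sketch of the structure helpers as the updated tests exercise them: TensorStructure no longer takes a primary flag, so tensors are described by name and params only. The import path below is an assumption (the test file's imports are not visible in this diff):

# Assumed module path; adjust to wherever DatasetStructure/TensorStructure actually live.
from deeplake.auto.unstructured.util import DatasetStructure, TensorStructure

structure = DatasetStructure(ignore_one_group=False)
structure.add_first_level_tensor(
    TensorStructure("tensor1", params={"htype": "generic"})
)
structure.add_first_level_tensor(
    TensorStructure("images", params={"htype": "image", "sample_compression": "jpeg"})
)
# structure.create_missing(ds) would then create whichever of these tensors `ds` lacks,
# matching the call made in deeplake/api/dataset.py above.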