Merged
29 commits
9b97589
Added support for ingestion yolo and tweaked git_ignore to allow fold…
Jan 8, 2023
efa503b
Ran black .
Jan 8, 2023
6cdb6bf
Fixed missing import and linting
Jan 8, 2023
634577a
Fixed more typing issues
Jan 8, 2023
6a53112
Tweaked bugout reporting for dataset.py where token was not user prop…
Jan 9, 2023
af87c68
fixed typing in dataset.py
Jan 10, 2023
649c7c0
Removed unecessary variables from coco
Jan 10, 2023
fcccbf5
Fixed issues related to linked tensors
Jan 10, 2023
a434643
Added tests
Jan 10, 2023
838ac02
Fixed linting
Jan 10, 2023
fd64ae4
Fixed some issues with linked tensors and made minor refactors
Jan 10, 2023
7472a46
Added ingest_yolo to init file
Jan 10, 2023
dcaf0e6
Added fix to links
Jan 10, 2023
70ca93f
Updated docstrings and made tweaks.
Jan 10, 2023
397418c
Ran black
Jan 10, 2023
40d8e9e
Fixed typo in docstring
Jan 10, 2023
4c2c338
Fixed issue with linked test
Jan 10, 2023
6b9b756
Improved ingest_yolo docstring and fixed username to org_id in ingest…
Jan 10, 2023
6af15ea
Fixed formatting problem in docstring
Jan 10, 2023
ffa4f82
Fixed tests
istranic Jan 11, 2023
ca2d806
Added better error checking and more dataset creation to after data v…
istranic Jan 11, 2023
082aabb
Moved dataset creation to after data verification in ingest_coco.
istranic Jan 11, 2023
a3b0dfc
Increased test coverage
istranic Jan 13, 2023
923405f
Fixed tests
istranic Jan 13, 2023
046abbe
Fixed auto-inference of htype for coordinates
istranic Jan 13, 2023
e5374ad
Fixed tests
istranic Jan 13, 2023
a52a971
FIxed test
istranic Jan 14, 2023
5360fcd
Fixed point of dataset creation in ingest functions. Fixed bugout rep…
istranic Jan 15, 2023
c39b879
Fixed bug
istranic Jan 15, 2023
1 change: 0 additions & 1 deletion .gitignore
@@ -190,7 +190,6 @@ hyper.sublime-*
.kube-temp

hyper_search/
yolo/
logs/

# other
1 change: 1 addition & 0 deletions deeplake/__init__.py
@@ -44,6 +44,7 @@
ingest = api_dataset.ingest
connect = api_dataset.connect
ingest_coco = api_dataset.ingest_coco
ingest_yolo = api_dataset.ingest_yolo
ingest_kaggle = api_dataset.ingest_kaggle
ingest_dataframe = api_dataset.ingest_dataframe
ingest_huggingface = huggingface.ingest_huggingface
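With the export above, YOLO ingestion becomes callable from the package root, mirroring ingest_coco. A minimal local-ingestion sketch (the paths below are placeholders, not part of this PR):

import deeplake

# Hypothetical paths; images and YOLO .txt annotations live side by side here.
ds = deeplake.ingest_yolo(
    "./yolo_data",
    dest="./yolo_ds",
    class_names_file="./yolo_data/classes.names",
    progressbar=True,
    num_workers=0,
)
print(ds)  # inspect the tensors created from the images and annotations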
170 changes: 158 additions & 12 deletions deeplake/api/dataset.py
@@ -8,6 +8,7 @@
from deeplake.auto.unstructured.kaggle import download_kaggle_dataset
from deeplake.auto.unstructured.image_classification import ImageClassification
from deeplake.auto.unstructured.coco.coco import CocoDataset
from deeplake.auto.unstructured.yolo.yolo import YoloDataset
from deeplake.client.client import DeepLakeBackendClient
from deeplake.client.log import logger
from deeplake.core.dataset import Dataset, dataset_factory
@@ -1023,7 +1024,7 @@ def ingest_coco(
>>> ds = deeplake.ingest_coco(
>>> "path/to/images/directory",
>>> ["path/to/annotation/file1.json", "path/to/annotation/file2.json"],
>>> dest="hub://username/dataset",
>>> dest="hub://org_id/dataset",
>>> key_to_tensor_mapping={"category_id": "labels", "bbox": "boxes"},
>>> file_to_group_mapping={"file1.json": "group1", "file2.json": "group2"},
>>> ignore_keys=["area", "image_id", "id"],
@@ -1034,7 +1035,7 @@
>>> ds = deeplake.ingest_coco(
>>> "s3://bucket/images/directory",
>>> "s3://bucket/annotation/file1.json",
>>> dest="hub://username/dataset",
>>> dest="hub://org_id/dataset",
>>> ignore_one_group=True,
>>> ignore_keys=["area", "image_id", "id"],
>>> image_settings={"name": "images", "linked": True, "creds_key": "my_managed_creds_key", "sample_compression": "jpeg"},
@@ -1048,7 +1049,7 @@
annotation_files (str, pathlib.Path, List[str]): Path to JSON annotation files in COCO format.
dest (str, pathlib.Path):
- The full path to the dataset. Can be:
- a Deep Lake cloud path of the form ``hub://username/datasetname``. To write to Deep Lake cloud datasets, ensure that you are logged in to Deep Lake (use 'activeloop login' from command line)
- a Deep Lake cloud path of the form ``hub://org_id/datasetname``. To write to Deep Lake cloud datasets, ensure that you are logged in to Deep Lake (use 'activeloop login' from command line), or pass in a token using the 'token' parameter.
- an s3 path of the form ``s3://bucketname/path/to/dataset``. Credentials are required in either the environment or passed to the creds argument.
- a local file system path of the form ``./path/to/dataset`` or ``~/path/to/dataset`` or ``path/to/dataset``.
- a memory path of the form ``mem://path/to/dataset`` which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
@@ -1070,6 +1071,7 @@
Raises:
IngestionError: If either ``key_to_tensor_mapping`` or ``file_to_group_mapping`` are not one-to-one.
"""

dest = convert_pathlib_to_string_if_needed(dest)
images_directory = convert_pathlib_to_string_if_needed(images_directory)
annotation_files = (
@@ -1078,7 +1080,12 @@
else convert_pathlib_to_string_if_needed(annotation_files)
)

ds = deeplake.empty(dest, creds=dest_creds, verbose=False, **dataset_kwargs)
feature_report_path(
dest,
"ingest_coco",
{"num_workers": num_workers},
token=dataset_kwargs.get("token", None),
)

unstructured = CocoDataset(
source=images_directory,
@@ -1091,6 +1098,130 @@
creds=src_creds,
)
structure = unstructured.prepare_structure(inspect_limit)

ds = deeplake.empty(dest, creds=dest_creds, verbose=False, **dataset_kwargs)

structure.create_missing(ds)

unstructured.structure(
ds,
progressbar,
num_workers,
)

return ds

@staticmethod
def ingest_yolo(
data_directory: Union[str, pathlib.Path],
dest: Union[str, pathlib.Path],
class_names_file: Optional[Union[str, pathlib.Path]] = None,
annotations_directory: Optional[Union[str, pathlib.Path]] = None,
allow_no_annotation: bool = False,
image_params: Optional[Dict] = None,
label_params: Optional[Dict] = None,
coordinates_params: Optional[Dict] = None,
src_creds: Optional[Dict] = None,
dest_creds: Optional[Dict] = None,
image_creds_key: Optional[str] = None,
inspect_limit: int = 1000,
progressbar: bool = True,
num_workers: int = 0,
connect_kwargs: Optional[Dict] = None,
**dataset_kwargs,
) -> Dataset:
"""Ingest images and annotations (bounding boxes or polygons) in YOLO format to a Deep Lake Dataset.

Examples:
>>> ds = deeplake.ingest_yolo(
>>> "path/to/data/directory",
>>> dest="hub://org_id/dataset",
>>> allow_no_annotation=True,
>>> token="my_activeloop_token",
>>> num_workers=4,
>>> )
>>> # or ingest data from cloud
>>> ds = deeplake.ingest_yolo(
>>> "s3://bucket/data_directory",
>>> dest="hub://org_id/dataset",
>>> image_params={"name": "image_links", "htype": "link[image]"},
>>> image_creds_key="my_s3_managed_credentials",
>>> src_creds=aws_creds, # Can also be inferred from environment
>>> token="my_activeloop_token",
>>> num_workers=4,
>>> )

Args:
data_directory (str, pathlib.Path): The path to the directory containing the data (image files and, optionally, annotation files; see the 'annotations_directory' input for specifying annotations in a separate directory).
dest (str, pathlib.Path):
- The full path to the dataset. Can be:
- a Deep Lake cloud path of the form ``hub://org_id/datasetname``. To write to Deep Lake cloud datasets, ensure that you are logged in to Deep Lake (use 'activeloop login' from command line), or pass in a token using the 'token' parameter.
- an s3 path of the form ``s3://bucketname/path/to/dataset``. Credentials are required in either the environment or passed to the creds argument.
- a local file system path of the form ``./path/to/dataset`` or ``~/path/to/dataset`` or ``path/to/dataset``.
- a memory path of the form ``mem://path/to/dataset`` which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
class_names_file (Optional[Union[str, pathlib.Path]]): Path to the file containing the class names on separate lines. This is typically a file titled classes.names.
annotations_directory (Optional[Union[str, pathlib.Path]]): Path to directory containing the annotations. If specified, the 'data_directory' will not be examined for annotations.
allow_no_annotation (bool): Flag to determine whether missing annotation files corresponding to an image should be treated as empty annotations. Set to ``False`` by default.
image_params (Optional[Dict]): A dictionary containing parameters for the images tensor.
label_params (Optional[Dict]): A dictionary containing parameters for the labels tensor.
coordinates_params (Optional[Dict]): A dictionary containing parameters for the coordinates tensor. This tensor contains either bounding boxes or polygons.
src_creds (Optional[Dict]): Credentials to access the source path. If not provided, will be inferred from the environment.
dest_creds (Optional[Dict]): A dictionary containing credentials used to access the destination path of the dataset.
image_creds_key (Optional[str]): creds_key for linked tensors, applicable if the htype for the images tensor is specified as 'link[image]' in the 'image_params' input.
inspect_limit (int): The maximum number of annotations to inspect, in order to infer whether they are bounding boxes or polygons. This input is ignored if the htype is specified in the 'coordinates_params'.
progressbar (bool): Enables or disables ingestion progress bar. Set to ``True`` by default.
num_workers (int): The number of workers to use for ingestion. Set to ``0`` by default.
connect_kwargs (Optional[Dict]): If specified, the dataset will be connected to Platform, and connect_kwargs will be passed to :func:`ds.connect`.
**dataset_kwargs: Any arguments passed here will be forwarded to the dataset creator function. See :func:`deeplake.empty`.

Returns:
Dataset: The Dataset created from the images and YOLO annotations.

Raises:
IngestionError: If annotations are not found for all the images and 'allow_no_annotation' is False
"""

dest = convert_pathlib_to_string_if_needed(dest)
data_directory = convert_pathlib_to_string_if_needed(data_directory)

annotations_directory = (
convert_pathlib_to_string_if_needed(annotations_directory)
if annotations_directory is not None
else None
)

class_names_file = (
convert_pathlib_to_string_if_needed(class_names_file)
if class_names_file is not None
else None
)

feature_report_path(
dest,
"ingest_yolo",
{"num_workers": num_workers},
token=dataset_kwargs.get("token", None),
)

unstructured = YoloDataset(
data_directory=data_directory,
class_names_file=class_names_file,
annotations_directory=annotations_directory,
image_params=image_params,
label_params=label_params,
coordinates_params=coordinates_params,
allow_no_annotation=allow_no_annotation,
creds=src_creds,
image_creds_key=image_creds_key,
inspect_limit=inspect_limit,
)

structure = unstructured.prepare_structure()

ds = deeplake.empty(dest, creds=dest_creds, verbose=False, **dataset_kwargs)
if connect_kwargs is not None:
ds.connect(**connect_kwargs, token=dataset_kwargs.get("token", None))

structure.create_missing(ds)

unstructured.structure(
@@ -1185,6 +1316,7 @@ def ingest(
"Progressbar": progressbar,
"Summary": summary,
},
token=dataset_kwargs.get("token", None),
)

src = convert_pathlib_to_string_if_needed(src)
@@ -1212,11 +1344,11 @@
if images_compression is None:
raise InvalidFileExtension(src)

ds = deeplake.dataset(dest, creds=dest_creds, **dataset_kwargs)

# TODO: support more than just image classification (and update docstring)
unstructured = ImageClassification(source=src)

ds = deeplake.dataset(dest, creds=dest_creds, **dataset_kwargs)

# TODO: auto detect compression
unstructured.structure(
ds, # type: ignore
@@ -1278,6 +1410,7 @@ def ingest_kaggle(
"Progressbar": progressbar,
"Summary": summary,
},
token=dataset_kwargs.get("token", None),
)

if os.path.isdir(src) and os.path.isdir(dest):
@@ -1334,36 +1467,49 @@ def ingest_dataframe(
import pandas as pd
from deeplake.auto.structured.dataframe import DataFrame

feature_report_path(
convert_pathlib_to_string_if_needed(dest),
"ingest_dataframe",
{},
token=dataset_kwargs.get("token", None),
)

if not isinstance(src, pd.DataFrame):
raise Exception("Source provided is not a valid pandas dataframe object")

structured = DataFrame(src)

if isinstance(dest, Dataset):
ds = dest
else:
dest = convert_pathlib_to_string_if_needed(dest)
ds = deeplake.dataset(dest, creds=dest_creds, **dataset_kwargs)

structured = DataFrame(src)
structured.fill_dataset(ds, progressbar) # type: ignore
return ds # type: ignore

@staticmethod
@deeplake_reporter.record_call
def list(
workspace: str = "",
org_id: str = "",
token: Optional[str] = None,
) -> None:
"""List all available Deep Lake cloud datasets.

Args:
workspace (str): Specify user/organization name. If not given,
org_id (str): Specify organization id. If not given,
returns a list of all datasets that can be accessed, regardless of what workspace they are in.
Otherwise, lists all datasets in the given workspace.
Otherwise, lists all datasets in the given organization.
token (str, optional): Activeloop token, used for fetching credentials for Deep Lake datasets. This is optional, tokens are normally autogenerated.

Returns:
List: List of dataset names.
"""

deeplake_reporter.feature_report(
feature_name="list",
parameters={"org_id": org_id},
)

client = DeepLakeBackendClient(token=token)
datasets = client.get_datasets(workspace=workspace)
datasets = client.get_datasets(workspace=org_id)
return datasets
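Taken together, a sketch of the cloud-to-cloud flow the new docstring describes, restricted to parameters shown above (bucket names, creds keys, and the token are placeholders; `list` is assumed to remain exposed as deeplake.list):

import deeplake

aws_creds = {"aws_access_key_id": "...", "aws_secret_access_key": "..."}  # placeholder source credentials

ds = deeplake.ingest_yolo(
    "s3://my-bucket/yolo_data",                   # hypothetical source bucket
    dest="hub://org_id/yolo_dataset",
    class_names_file="s3://my-bucket/yolo_data/classes.names",
    image_params={"name": "image_links", "htype": "link[image]"},
    image_creds_key="my_s3_managed_credentials",  # managed creds key registered with the org
    src_creds=aws_creds,                          # can also be inferred from the environment
    token="my_activeloop_token",                  # forwarded through **dataset_kwargs to deeplake.empty
    num_workers=4,
)

# The renamed argument on list(): datasets are now listed per organization.
deeplake.list(org_id="org_id", token="my_activeloop_token")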
6 changes: 2 additions & 4 deletions deeplake/auto/tests/test_coco_template.py
@@ -11,13 +11,12 @@ def test_full_dataset_structure(local_ds):
dataset_structure = DatasetStructure(ignore_one_group=False)

dataset_structure.add_first_level_tensor(
TensorStructure("tensor1", params={"htype": "generic"}, primary=False)
TensorStructure("tensor1", params={"htype": "generic"})
)
dataset_structure.add_first_level_tensor(
TensorStructure(
"images",
params={"htype": "image", "sample_compression": "jpeg"},
primary=True,
)
)

@@ -51,13 +50,12 @@ def test_missing_dataset_structure(local_ds):
local_ds.create_tensor("annotations/masks", htype="binary_mask")

dataset_structure.add_first_level_tensor(
TensorStructure("tensor1", params={"htype": "generic"}, primary=False)
TensorStructure("tensor1", params={"htype": "generic"})
)
dataset_structure.add_first_level_tensor(
TensorStructure(
"images",
params={"htype": "image", "sample_compression": "jpeg"},
primary=True,
)
)

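For reference, a sketch of the structure helpers as the updated tests exercise them: TensorStructure no longer takes a primary flag, so tensors are described by name and params only. The import path below is an assumption (the test file's imports are not visible in this diff):

# Assumed module path; adjust to wherever DatasetStructure/TensorStructure actually live.
from deeplake.auto.unstructured.util import DatasetStructure, TensorStructure

structure = DatasetStructure(ignore_one_group=False)
structure.add_first_level_tensor(
    TensorStructure("tensor1", params={"htype": "generic"})
)
structure.add_first_level_tensor(
    TensorStructure("images", params={"htype": "image", "sample_compression": "jpeg"})
)
# structure.create_missing(ds) would then create whichever of these tensors `ds` lacks,
# matching the call made in deeplake/api/dataset.py above.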