diff --git a/deeplake/api/dataset.py b/deeplake/api/dataset.py
index 36e9eaa291..e45b749fb0 100644
--- a/deeplake/api/dataset.py
+++ b/deeplake/api/dataset.py
@@ -6,6 +6,10 @@
 from typing import Dict, Optional, Union, List
 
 from deeplake.auto.unstructured.kaggle import download_kaggle_dataset
-from deeplake.auto.unstructured.image_classification import ImageClassification
+from deeplake.auto.unstructured.image_classification import (
+    ImageClassification,
+    AudioClassification,
+    VideoClassification,
+)
 from deeplake.auto.unstructured.coco.coco import CocoDataset
 from deeplake.auto.unstructured.yolo.yolo import YoloDataset
@@ -63,6 +67,13 @@
     ReadOnlyModeError,
     LockedException,
 )
+from deeplake.compression import (
+    IMAGE_COMPRESSIONS,
+    VIDEO_COMPRESSIONS,
+    AUDIO_COMPRESSIONS,
+    BYTE_COMPRESSIONS,
+    COMPRESSION_ALIASES,
+)
 from deeplake.util.storage import (
     get_storage_and_cache_chain,
     storage_provider_from_path,
@@ -72,6 +83,13 @@
 from deeplake.util.cache_chain import generate_chain
 from deeplake.core.storage.deeplake_memory_object import DeepLakeMemoryObject
 
+_image_compressions = (
+    IMAGE_COMPRESSIONS[:] + BYTE_COMPRESSIONS + list(COMPRESSION_ALIASES)
+)
+_image_compressions.remove("dcm")
+_video_compressions = VIDEO_COMPRESSIONS
+_audio_compressions = AUDIO_COMPRESSIONS
+
 
 class dataset:
     @staticmethod
@@ -1509,7 +1527,8 @@ def ingest_yolo(
     def ingest_classification(
         src: Union[str, pathlib.Path],
         dest: Union[str, pathlib.Path],
-        image_params: Optional[Dict] = None,
+        sample_compression: str = "auto",
+        primary_params: Optional[Dict] = None,
         label_params: Optional[Dict] = None,
         dest_creds: Optional[Union[str, Dict]] = None,
         progressbar: bool = True,
@@ -1529,6 +1548,7 @@ def ingest_classification(
                 - an s3 path of the form ``s3://bucketname/path/to/dataset``. Credentials are required in either the environment or passed to the creds argument.
                 - a local file system path of the form ``./path/to/dataset`` or ``~/path/to/dataset`` or ``path/to/dataset``.
                 - a memory path of the form ``mem://path/to/dataset`` which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
-            image_params (Optional[Dict]): A dictionary containing parameters for the images tensor.
+            sample_compression (str): Compression used for the primary (``images``/``audios``/``videos``) tensor. If ``sample_compression`` is "auto", the compression will be determined by the most common extension in the directory.
+            primary_params (Optional[Dict]): A dictionary containing parameters for the primary tensor.
             label_params (Optional[Dict]): A dictionary containing parameters for the labels tensor.
             dest_creds (Optional[Union[str, Dict]]): The string ``ENV`` or a dictionary containing credentials used to access the destination path of the dataset.
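As a reviewer aid, a minimal usage sketch of the reworked entry point. The folder layout and destination path here are hypothetical; the expected tensor names follow the dispatch implemented in the hunks below:

    import deeplake

    # Hypothetical layout: ./audio_data/class0/*.mp3, ./audio_data/class1/*.mp3, ...
    # "auto" resolves to "mp3" (the most common extension in the directory),
    # which routes the source to AudioClassification and creates "audios" and
    # "labels" tensors.
    ds = deeplake.ingest_classification(
        src="./audio_data",
        dest="mem://audio_ds",
        sample_compression="auto",
        progressbar=False,
        summary=False,
    )
    print(list(ds.tensors))  # expected: ['audios', 'labels']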
@@ -1595,6 +1615,7 @@ def ingest_classification(
         dest,
         "ingest_classification",
         {
+            "Sample_Compression": sample_compression,
             "Progressbar": progressbar,
             "Summary": summary,
         },
@@ -1626,19 +1646,26 @@ def ingest_classification(
         if not os.path.isdir(src):
             raise InvalidPathException(src)
 
-        if image_params is None:
-            image_params = {}
+        if sample_compression == "auto":
+            sample_compression = get_most_common_extension(src)
+            if sample_compression is None:
+                raise InvalidFileExtension(src)
+        if primary_params is None:
+            primary_params = {}
         if label_params is None:
             label_params = {}
 
-        if not image_params.get("sample_compression", None):
-            images_compression = get_most_common_extension(src)
-            if images_compression is None:
-                raise InvalidFileExtension(src)
-            image_params["sample_compression"] = images_compression
+        if not primary_params.get("sample_compression", None):
+            primary_params["sample_compression"] = sample_compression
 
-        # TODO: support more than just image classification (and update docstring)
-        unstructured = ImageClassification(source=src)
+        if sample_compression in _image_compressions:
+            unstructured = ImageClassification(source=src, htype="image")  # type: ignore
+        elif sample_compression in _audio_compressions:
+            unstructured = AudioClassification(source=src, htype="audio")  # type: ignore
+        elif sample_compression in _video_compressions:
+            unstructured = VideoClassification(source=src, htype="video")  # type: ignore
+        else:
+            raise InvalidFileExtension(src)
 
         ds = deeplake.empty(
             dest, creds=dest_creds, token=token, verbose=False, **dataset_kwargs
@@ -1652,6 +1679,6 @@ def ingest_classification(
             ds,  # type: ignore
             progressbar=progressbar,
             generate_summary=summary,
-            image_tensor_args=image_params,
+            tensor_args=primary_params,
             label_tensor_args=label_params,
             num_workers=num_workers,
@@ -1666,8 +1693,8 @@ def ingest_kaggle(
         src: Union[str, pathlib.Path],
         dest: Union[str, pathlib.Path],
         exist_ok: bool = False,
-        images_compression: str = "auto",
+        sample_compression: str = "auto",
         dest_creds: Optional[Union[str, Dict]] = None,
         kaggle_credentials: Optional[dict] = None,
         progressbar: bool = True,
         summary: bool = True,
@@ -1685,8 +1712,8 @@ def ingest_kaggle(
                 - a local file system path of the form ``./path/to/dataset`` or ``~/path/to/dataset`` or ``path/to/dataset``.
                 - a memory path of the form ``mem://path/to/dataset`` which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
             exist_ok (bool): If the kaggle dataset was already downloaded and ``exist_ok`` is ``True``, ingestion will proceed without error.
-            images_compression (str): For image classification datasets, this compression will be used for the ``images`` tensor. If ``images_compression`` is "auto", compression will be automatically determined by the most common extension in the directory.
+            sample_compression (str): For classification datasets, this compression will be used for the primary tensor. If ``sample_compression`` is "auto", compression will be automatically determined by the most common extension in the directory.
             dest_creds (Optional[Union[str, Dict]]): The string ``ENV`` or a dictionary containing credentials used to access the destination path of the dataset.
             kaggle_credentials (dict): A dictionary containing kaggle credentials {"username":"YOUR_USERNAME", "key": "YOUR_KEY"}. If ``None``, environment variables/the kaggle.json file will be used if available.
             progressbar (bool): Enables or disables ingestion progress bar. Set to ``True`` by default.
             summary (bool): Generates ingestion summary. Set to ``True`` by default.
@@ -1709,7 +1736,7 @@ def ingest_kaggle(
         dest,
         "ingest_kaggle",
         {
-            "Images_Compression": images_compression,
+            "Sample_Compression": sample_compression,
             "Exist_Ok": exist_ok,
             "Progressbar": progressbar,
             "Summary": summary,
@@ -1731,6 +1758,6 @@ def ingest_kaggle(
         ds = deeplake.ingest_classification(
             src=src,
             dest=dest,
-            image_params={"sample_compression": images_compression},
+            sample_compression=sample_compression,
             dest_creds=dest_creds,
             progressbar=progressbar,
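The "auto" value above defers to get_most_common_extension (diffed at the bottom of this patch). For reference, a self-contained approximation of its contract — the directory name is hypothetical:

    from collections import Counter
    from pathlib import Path
    from typing import Optional

    def most_common_compression(directory: str) -> Optional[str]:
        # Tally the extensions of all files under the source tree, then strip
        # the leading dot so the winner can be compared against compression
        # names like "mp3" or "mp4".
        counts = Counter(
            p.suffix.lower() for p in Path(directory).rglob("*") if p.is_file()
        )
        if not counts:
            return None
        return counts.most_common(1)[0][0].lstrip(".") or None

    # most_common_compression("./audio_data") -> "mp3" for a folder of mp3
    # files, which ingest_classification then routes to AudioClassification.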
diff --git a/deeplake/auto/tests/test_ingestion.py b/deeplake/auto/tests/test_ingestion.py
index af99840aa3..e0f2d5db5e 100644
--- a/deeplake/auto/tests/test_ingestion.py
+++ b/deeplake/auto/tests/test_ingestion.py
@@ -120,7 +120,61 @@ def test_image_classification_sets(memory_ds: Dataset):
     assert ds["train/labels"].info.class_names == ["class0", "class1", "class2"]
 
 
-def test_ingestion_exception(memory_path: str):
+def test_audio(memory_ds: Dataset):
+    path = get_dummy_data_path("tests_auto/audio_classification")
+    src = "test_auto/invalid_path"
+    ds = deeplake.ingest_classification(
+        src=path, dest=memory_ds.path, progressbar=False, summary=False, overwrite=False
+    )
+
+    with pytest.raises(InvalidPathException):
+        deeplake.ingest_classification(
+            src=src,
+            dest=memory_ds.path,
+            progressbar=False,
+            summary=False,
+            overwrite=False,
+        )
+
+    with pytest.raises(SamePathException):
+        deeplake.ingest_classification(
+            src=path, dest=path, progressbar=False, summary=False, overwrite=False
+        )
+
+    assert ds["audios"].meta.sample_compression == "mp3"
+    assert list(ds.tensors.keys()) == ["audios", "labels"]
+    assert ds["audios"].numpy().shape == (0,)
+    assert ds["labels"].info.class_names == ["class0", "class1", "class2"]
+
+
+def test_video(memory_ds: Dataset):
+    path = get_dummy_data_path("tests_auto/video_classification")
+    src = "test_auto/invalid_path"
+    ds = deeplake.ingest_classification(
+        src=path, dest=memory_ds.path, progressbar=False, summary=False, overwrite=False
+    )
+
+    with pytest.raises(InvalidPathException):
+        deeplake.ingest_classification(
+            src=src,
+            dest=memory_ds.path,
+            progressbar=False,
+            summary=False,
+            overwrite=False,
+        )
+
+    with pytest.raises(SamePathException):
+        deeplake.ingest_classification(
+            src=path, dest=path, progressbar=False, summary=False, overwrite=False
+        )
+
+    assert ds["videos"].meta.sample_compression == "mp4"
+    assert list(ds.tensors.keys()) == ["videos", "labels"]
+    assert ds["videos"].numpy().shape == (0,)
+    assert ds["labels"].info.class_names == ["class0", "class1", "class2"]
+
+
+def test_ingestion_exception(memory_ds: Dataset):
     path = get_dummy_data_path("tests_auto/image_classification_with_sets")
     with pytest.raises(InvalidPathException):
         deeplake.ingest_classification(
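For reference, the new dummy fixtures these tests consume (added as binaries further down in this patch) mirror the one-folder-per-class layout of the existing image fixtures:

    tests_auto/audio_classification/
    ├── class0/samplemp3.mp3
    ├── class1/samplemp3.mp3
    └── class2/samplemp3.mp3
    tests_auto/video_classification/
    ├── class0/samplemp4.mp4
    ├── class1/samplemp4.mp4
    └── class2/samplemp4.mp4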
diff --git a/deeplake/auto/unstructured/image_classification.py b/deeplake/auto/unstructured/image_classification.py
index b1fc0f4c24..9bf1326531 100644
--- a/deeplake/auto/unstructured/image_classification.py
+++ b/deeplake/auto/unstructured/image_classification.py
@@ -17,7 +17,6 @@
 import deeplake
 
-IMAGES_TENSOR_NAME = "images"
 LABELS_TENSOR_NAME = "labels"
 
@@ -43,8 +42,8 @@ def _set_name_from_path(path: Path) -> str:
     return path.parts[-3]
 
 
-class ImageClassification(UnstructuredDataset):
-    def __init__(self, source: str):
+class Classification(UnstructuredDataset):
+    def __init__(self, source: str, htype: str):
         """Convert an unstructured dataset to a structured dataset.
 
         Note:
@@ -70,6 +69,7 @@ def __init__(self, source: str):
                 f"No files found in {self.source}. Please ensure that the source path is correct."
             )
 
+        self.htype = htype
         self.set_names = self.get_set_names()
         self.class_names = self.get_class_names()
@@ -94,6 +94,6 @@ def structure(  # type: ignore
         ds: Dataset,
         progressbar: bool = True,
         generate_summary: bool = True,
+        tensor_args: dict = {},
         shuffle: bool = True,
-        image_tensor_args: dict = {},
         label_tensor_args: dict = {},
@@ -105,6 +105,6 @@ def structure(  # type: ignore
             ds (Dataset): A Deep Lake dataset object.
             progressbar (bool): Defines if the method uses a progress bar. Defaults to True.
             generate_summary (bool): Defines if the method generates ingestion summary. Defaults to True.
+            tensor_args (dict): Defines the parameters (e.g. sample_compression) for the primary tensor.
             shuffle (bool): Defines if the file paths should be shuffled prior to ingestion. Defaults to True.
-            image_tensor_args (dict): Defines the parameters for the images tensor.
             label_tensor_args (dict): Defines the parameters for the class_labels tensor.
@@ -115,7 +115,7 @@ def structure(  # type: ignore
         """
 
-        images_tensor_map = {}
+        tensor_map = {}
         labels_tensor_map = {}
 
         use_set_prefix = len(self.set_names) > 1
@@ -124,6 +124,7 @@ def structure(  # type: ignore
             if not use_set_prefix:
                 set_name = ""
 
-            images_tensor_name = os.path.join(
-                set_name, image_tensor_args.pop("name", IMAGES_TENSOR_NAME)
-            )
+            tensor_name = os.path.join(set_name, self.htype + "s")
+            labels_tensor_name = os.path.join(set_name, LABELS_TENSOR_NAME)
+            tensor_map[set_name] = tensor_name.replace("\\", "/")
+            labels_tensor_map[set_name] = labels_tensor_name.replace("\\", "/")
@@ -135,9 +136,8 @@ def structure(  # type: ignore
 
-            # TODO: infer sample_compression
             ds.create_tensor(
-                images_tensor_name.replace("\\", "/"),
-                htype="image",
-                **image_tensor_args,
+                tensor_name.replace("\\", "/"),
+                htype=self.htype,
+                **tensor_args,
             )
             ds.create_tensor(
                 labels_tensor_name.replace("\\", "/"),
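A small sketch of the tensor names the block above produces — inputs hypothetical, logic mirroring the os.path.join/replace calls:

    import os

    def primary_tensor_name(set_name: str, htype: str) -> str:
        # "train" + "audio" -> "train/audios"; "" + "video" -> "videos".
        # The replace() normalizes Windows path separators, as in the diff.
        return os.path.join(set_name, htype + "s").replace("\\", "/")

    assert primary_tensor_name("train", "audio") == "train/audios"
    assert primary_tensor_name("", "video") == "videos"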
@@ -146,6 +146,64 @@ def structure(  # type: ignore
                 htype="class_label",
                 **label_tensor_args,
             )
 
         paths = self._abs_file_paths
         if shuffle:
             rshuffle(paths)
+
+        skipped_files: list = []
+
+        iterator = tqdm(
+            paths,
+            desc='Ingesting "%s" (%i files skipped)'
+            % (self.source.name, len(skipped_files)),
+            total=len(paths),
+            disable=not progressbar,
+        )
+
+        with ds, iterator:
+            for file_path in iterator:
+                sample = deeplake.read(file_path)
+
+                class_name = _class_name_from_path(file_path)
+
+                label = np.uint32(self.class_names.index(class_name))
+
+                set_name = _set_name_from_path(file_path) if use_set_prefix else ""
+
+                # TODO: try to get all len(shape)s to match.
+                # if appending fails because of a shape mismatch, expand dims (might also fail)
+                try:
+                    ds[tensor_map[set_name]].append(sample)
+
+                except TensorInvalidSampleShapeError:
+                    reshaped_sample = np.expand_dims(sample.array, -1)
+                    ds[tensor_map[set_name]].append(reshaped_sample)
+
+                except Exception:
+                    skipped_files.append(file_path.name)
+                    iterator.set_description(
+                        'Ingesting "%s" (%i files skipped)'
+                        % (self.source.name, len(skipped_files))
+                    )
+                    continue
+
+                ds[labels_tensor_map[set_name]].append(label)
+
+        if generate_summary:
+            ingestion_summary(str(self.source), skipped_files)
+        return ds
+
+
+class ImageClassification(Classification):
+    def __init__(self, source: str, htype: str = "image"):
+        super().__init__(source, htype)
+
+
+class AudioClassification(Classification):
+    def __init__(self, source: str, htype: str = "audio"):
+        super().__init__(source, htype)
+
+
+class VideoClassification(Classification):
+    def __init__(self, source: str, htype: str = "video"):
+        super().__init__(source, htype)
diff --git a/deeplake/compression.py b/deeplake/compression.py
index 8b9139b2ec..23446a9c29 100644
--- a/deeplake/compression.py
+++ b/deeplake/compression.py
@@ -70,6 +70,34 @@
 )
 VIDEO_COMPRESSIONS = ["mp4", "mkv", "avi"]
 
+VIDEO_COMPRESSION_EXT_DICT = {
+    "mp4": [".mp4"],
+    "mkv": [".mkv"],
+    "avi": [".avi"],
+}
+
+VIDEO_COMPRESSION_EXTENSIONS = list(
+    set(itertools.chain(*VIDEO_COMPRESSION_EXT_DICT.values()))
+)
+
 AUDIO_COMPRESSIONS = ["mp3", "flac", "wav"]
+
+AUDIO_COMPRESSION_EXT_DICT = {
+    "mp3": [".mp3"],
+    "flac": [".flac"],
+    "wav": [".wav"],
+}
+
+AUDIO_COMPRESSION_EXTENSIONS = list(
+    set(itertools.chain(*AUDIO_COMPRESSION_EXT_DICT.values()))
+)
+
+COMPRESSION_EXTENSIONS = (
+    IMAGE_COMPRESSION_EXTENSIONS
+    + VIDEO_COMPRESSION_EXTENSIONS
+    + AUDIO_COMPRESSION_EXTENSIONS
+)
+
+READONLY_COMPRESSIONS = ["mpo", "fli", "dcm", *AUDIO_COMPRESSIONS, *VIDEO_COMPRESSIONS]
 NIFTI_COMPRESSIONS = ["nii", "nii.gz"]
 POINT_CLOUD_COMPRESSIONS = ["las"]
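A quick sanity-check sketch against the new constants (import path as introduced in this diff):

    from deeplake.compression import (
        AUDIO_COMPRESSION_EXTENSIONS,
        COMPRESSION_EXTENSIONS,
        VIDEO_COMPRESSION_EXTENSIONS,
    )

    # COMPRESSION_EXTENSIONS is the flat union that get_most_common_extension
    # now accepts by default, so image, video, and audio extensions all pass.
    assert ".mp3" in AUDIO_COMPRESSION_EXTENSIONS
    assert ".mkv" in VIDEO_COMPRESSION_EXTENSIONS
    assert {".mp4", ".flac"}.issubset(COMPRESSION_EXTENSIONS)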
diff --git a/deeplake/core/compression.py b/deeplake/core/compression.py
index 6ca6eebbc8..79299708c5 100644
--- a/deeplake/core/compression.py
+++ b/deeplake/core/compression.py
@@ -881,9 +881,9 @@ def _open_video(file: Union[str, bytes, memoryview]):
         raise ModuleNotFoundError(
             "PyAV is not installed. Run `pip install deeplake[video]`."
         )
-    if isinstance(file, str):
+    if isinstance(file, (str, Path)):
         container = av.open(
-            file, options={"protocol_whitelist": "file,http,https,tcp,tls,subfile"}
+            str(file), options={"protocol_whitelist": "file,http,https,tcp,tls,subfile"}
         )
     else:
         container = av.open(BytesIO(file))
@@ -1056,9 +1056,9 @@ def _open_audio(file: Union[str, bytes, memoryview]):
         raise ModuleNotFoundError(
             "PyAV is not installed. Please run `pip install deeplake[audio]`"
         )
-    if isinstance(file, str):
+    if isinstance(file, (str, Path)):
         container = av.open(
-            file, options={"protocol_whitelist": "file,http,https,tcp,tls,subfile"}
+            str(file), options={"protocol_whitelist": "file,http,https,tcp,tls,subfile"}
         )
     else:
         container = av.open(BytesIO(file))
diff --git a/deeplake/tests/dummy_data/tests_auto/audio_classification/class0/samplemp3.mp3 b/deeplake/tests/dummy_data/tests_auto/audio_classification/class0/samplemp3.mp3
new file mode 100644
index 0000000000..0379b4d748
Binary files /dev/null and b/deeplake/tests/dummy_data/tests_auto/audio_classification/class0/samplemp3.mp3 differ
diff --git a/deeplake/tests/dummy_data/tests_auto/audio_classification/class1/samplemp3.mp3 b/deeplake/tests/dummy_data/tests_auto/audio_classification/class1/samplemp3.mp3
new file mode 100644
index 0000000000..0379b4d748
Binary files /dev/null and b/deeplake/tests/dummy_data/tests_auto/audio_classification/class1/samplemp3.mp3 differ
diff --git a/deeplake/tests/dummy_data/tests_auto/audio_classification/class2/samplemp3.mp3 b/deeplake/tests/dummy_data/tests_auto/audio_classification/class2/samplemp3.mp3
new file mode 100644
index 0000000000..0379b4d748
Binary files /dev/null and b/deeplake/tests/dummy_data/tests_auto/audio_classification/class2/samplemp3.mp3 differ
diff --git a/deeplake/tests/dummy_data/tests_auto/video_classification/class0/samplemp4.mp4 b/deeplake/tests/dummy_data/tests_auto/video_classification/class0/samplemp4.mp4
new file mode 100644
index 0000000000..f4cb0547a6
Binary files /dev/null and b/deeplake/tests/dummy_data/tests_auto/video_classification/class0/samplemp4.mp4 differ
diff --git a/deeplake/tests/dummy_data/tests_auto/video_classification/class1/samplemp4.mp4 b/deeplake/tests/dummy_data/tests_auto/video_classification/class1/samplemp4.mp4
new file mode 100644
index 0000000000..f4cb0547a6
Binary files /dev/null and b/deeplake/tests/dummy_data/tests_auto/video_classification/class1/samplemp4.mp4 differ
diff --git a/deeplake/tests/dummy_data/tests_auto/video_classification/class2/samplemp4.mp4 b/deeplake/tests/dummy_data/tests_auto/video_classification/class2/samplemp4.mp4
new file mode 100644
index 0000000000..f4cb0547a6
Binary files /dev/null and b/deeplake/tests/dummy_data/tests_auto/video_classification/class2/samplemp4.mp4 differ
diff --git a/deeplake/util/auto.py b/deeplake/util/auto.py
index 4dfb6e28b7..76eb036389 100644
--- a/deeplake/util/auto.py
+++ b/deeplake/util/auto.py
@@ -4,11 +4,11 @@
 from typing import Tuple
 import shutil
 from deeplake.util.exceptions import AutoCompressionError
-from deeplake.compression import IMAGE_COMPRESSION_EXTENSIONS
+from deeplake.compression import COMPRESSION_EXTENSIONS
 
 
 def get_most_common_extension(
-    local_path: str, allowed_extensions: Tuple = tuple(IMAGE_COMPRESSION_EXTENSIONS)
+    local_path: str, allowed_extensions: Tuple = tuple(COMPRESSION_EXTENSIONS)
 ):
     """Determines the most frequently used extension in a directory of files.
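Finally, a hedged sketch of the widened default in action. The directory is hypothetical, and the sans-dot return value matches how ingest_classification compares it against the compression lists:

    from deeplake.util.auto import get_most_common_extension

    # e.g. a folder holding mostly .mp4 clips plus a stray .jpg: ".mp4" wins
    # the tally, so ingestion routes the source to VideoClassification.
    compression = get_most_common_extension("./mixed_clips")
    assert compression == "mp4"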