diff --git a/docs/source/datapoints.rst b/docs/source/datapoints.rst index 55d3cda4a8c..90ab3375adb 100644 --- a/docs/source/datapoints.rst +++ b/docs/source/datapoints.rst @@ -14,6 +14,6 @@ see e.g. :ref:`sphx_glr_auto_examples_plot_transforms_v2_e2e.py`. Image Video - BoundingBoxFormat - BoundingBoxes + BBoxFormat + BBoxes Mask diff --git a/docs/source/models.rst b/docs/source/models.rst index 15540778602..f9e7963e221 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -413,7 +413,7 @@ Here is an example of how to use the pre-trained object detection models: from torchvision.io.image import read_image from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2, FasterRCNN_ResNet50_FPN_V2_Weights - from torchvision.utils import draw_bounding_boxes + from torchvision.utils import draw_bboxes from torchvision.transforms.functional import to_pil_image img = read_image("test/assets/encode_jpeg/grace_hopper_517x606.jpg") @@ -432,7 +432,7 @@ Here is an example of how to use the pre-trained object detection models: # Step 4: Use the model and visualize the prediction prediction = model(batch)[0] labels = [weights.meta["categories"][i] for i in prediction["labels"]] - box = draw_bounding_boxes(img, boxes=prediction["boxes"], + box = draw_bboxes(img, boxes=prediction["boxes"], labels=labels, colors="red", width=4, font_size=30) diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index 9f3efe30341..874156a6041 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -206,8 +206,8 @@ Miscellaneous v2.RandomErasing Lambda v2.Lambda - v2.SanitizeBoundingBoxes - v2.ClampBoundingBoxes + v2.SanitizeBBoxes + v2.ClampBBoxes v2.UniformTemporalSubsample .. _conversion_transforms: @@ -236,7 +236,7 @@ Conversion ConvertImageDtype v2.ConvertImageDtype v2.ToDtype - v2.ConvertBoundingBoxFormat + v2.ConvertBBoxFormat Auto-Augmentation ----------------- diff --git a/docs/source/utils.rst b/docs/source/utils.rst index 971381a658f..ee614f6e67f 100644 --- a/docs/source/utils.rst +++ b/docs/source/utils.rst @@ -12,7 +12,7 @@ visualization `. 
:toctree: generated/ :template: function.rst - draw_bounding_boxes + draw_bboxes draw_segmentation_masks draw_keypoints flow_to_image diff --git a/references/detection/presets.py b/references/detection/presets.py index 098ec85e690..0b8d1124d19 100644 --- a/references/detection/presets.py +++ b/references/detection/presets.py @@ -77,8 +77,8 @@ def __init__( if use_v2: transforms += [ - T.ConvertBoundingBoxFormat(datapoints.BoundingBoxFormat.XYXY), - T.SanitizeBoundingBoxes(), + T.ConvertBBoxFormat(datapoints.BBoxFormat.XYXY), + T.SanitizeBBoxes(), ] self.transforms = T.Compose(transforms) diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py index ef5d5e1ec96..26f2a3076ad 100644 --- a/test/builtin_dataset_mocks.py +++ b/test/builtin_dataset_mocks.py @@ -925,7 +925,7 @@ def _make_attributes_file(cls, root, image_file_names): cls._make_ann_file(root, "list_attr_celeba.txt", data, field_names=(*field_names, "")) @classmethod - def _make_bounding_boxes_file(cls, root, image_file_names): + def _make_bboxes_file(cls, root, image_file_names): field_names = ("image_id", "x_1", "y_1", "width", "height") data = [ [f"{name} ", *[f"{coord:3d}" for coord in make_tensor((4,), low=0, dtype=torch.int).tolist()]] @@ -960,7 +960,7 @@ def generate(cls, root): for make_ann_file_fn in ( cls._make_identity_file, cls._make_attributes_file, - cls._make_bounding_boxes_file, + cls._make_bboxes_file, cls._make_landmarks_file, ): make_ann_file_fn(root, image_file_names) @@ -1342,7 +1342,7 @@ def _make_archive(cls, root): with open(archive_folder / "train_test_split.txt", "w") as file: file.write("\n".join(f"{image_id} {split_id}" for image_id, split_id in zip(image_ids, split_ids))) - with open(archive_folder / "bounding_boxes.txt", "w") as file: + with open(archive_folder / "bboxes.txt", "w") as file: file.write( "\n".join( " ".join( diff --git a/test/common_utils.py b/test/common_utils.py index c9cff035cac..555cd5612db 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -620,15 +620,15 @@ def make_image_loaders_for_interpolation( @dataclasses.dataclass -class BoundingBoxesLoader(TensorLoader): - format: datapoints.BoundingBoxFormat +class BBoxesLoader(TensorLoader): + format: datapoints.BBoxFormat spatial_size: Tuple[int, int] -def make_bounding_box( +def make_bbox( size=None, *, - format=datapoints.BoundingBoxFormat.XYXY, + format=datapoints.BBoxFormat.XYXY, spatial_size=None, batch_dims=(), dtype=None, @@ -639,7 +639,7 @@ def make_bounding_box( - (box[3] - box[1], box[2] - box[0]) for XYXY - (H, W) for XYWH and CXCYWH spatial_size: Size of the reference object, e.g. an image. Corresponds to the .spatial_size attribute on - returned datapoints.BoundingBoxes + returned datapoints.BBoxes To generate a valid joint sample, you need to set spatial_size here to the same value as size on the other maker functions, e.g. @@ -647,8 +647,8 @@ def make_bounding_box( .. code:: image = make_image=(size=size) - bounding_boxes = make_bounding_box(spatial_size=size) - assert F.get_spatial_size(bounding_boxes) == F.get_spatial_size(image) + bboxes = make_bbox(spatial_size=size) + assert F.get_spatial_size(bboxes) == F.get_spatial_size(image) For convenience, if both size and spatial_size are omitted, spatial_size defaults to the same value as size for all other maker functions, e.g. @@ -656,8 +656,8 @@ def make_bounding_box( .. 
code:: image = make_image=() - bounding_boxes = make_bounding_box() - assert F.get_spatial_size(bounding_boxes) == F.get_spatial_size(image) + bboxes = make_bbox() + assert F.get_spatial_size(bboxes) == F.get_spatial_size(image) """ def sample_position(values, max_value): @@ -666,7 +666,7 @@ def sample_position(values, max_value): return torch.stack([torch.randint(max_value - v, ()) for v in values.flatten().tolist()]).reshape(values.shape) if isinstance(format, str): - format = datapoints.BoundingBoxFormat[format] + format = datapoints.BBoxFormat[format] if spatial_size is None: if size is None: @@ -679,7 +679,7 @@ def sample_position(values, max_value): dtype = dtype or torch.float32 if any(dim == 0 for dim in batch_dims): - return datapoints.BoundingBoxes( + return datapoints.BBoxes( torch.empty(*batch_dims, 4, dtype=dtype, device=device), format=format, spatial_size=spatial_size ) @@ -691,28 +691,28 @@ def sample_position(values, max_value): y = sample_position(h, spatial_size[0]) x = sample_position(w, spatial_size[1]) - if format is datapoints.BoundingBoxFormat.XYWH: + if format is datapoints.BBoxFormat.XYWH: parts = (x, y, w, h) - elif format is datapoints.BoundingBoxFormat.XYXY: + elif format is datapoints.BBoxFormat.XYXY: x1, y1 = x, y x2 = x1 + w y2 = y1 + h parts = (x1, y1, x2, y2) - elif format is datapoints.BoundingBoxFormat.CXCYWH: + elif format is datapoints.BBoxFormat.CXCYWH: cx = x + w / 2 cy = y + h / 2 parts = (cx, cy, w, h) else: raise ValueError(f"Format {format} is not supported") - return datapoints.BoundingBoxes( + return datapoints.BBoxes( torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, spatial_size=spatial_size ) -def make_bounding_box_loader(*, extra_dims=(), format, spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtype=torch.float32): +def make_bbox_loader(*, extra_dims=(), format, spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtype=torch.float32): if isinstance(format, str): - format = datapoints.BoundingBoxFormat[format] + format = datapoints.BBoxFormat[format] spatial_size = _parse_spatial_size(spatial_size, name="spatial_size") @@ -721,25 +721,23 @@ def fn(shape, dtype, device): if num_coordinates != 4: raise pytest.UsageError() - return make_bounding_box( - format=format, spatial_size=spatial_size, batch_dims=batch_dims, dtype=dtype, device=device - ) + return make_bbox(format=format, spatial_size=spatial_size, batch_dims=batch_dims, dtype=dtype, device=device) - return BoundingBoxesLoader(fn, shape=(*extra_dims, 4), dtype=dtype, format=format, spatial_size=spatial_size) + return BBoxesLoader(fn, shape=(*extra_dims, 4), dtype=dtype, format=format, spatial_size=spatial_size) -def make_bounding_box_loaders( +def make_bbox_loaders( *, extra_dims=DEFAULT_EXTRA_DIMS, - formats=tuple(datapoints.BoundingBoxFormat), + formats=tuple(datapoints.BBoxFormat), spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtypes=(torch.float32, torch.float64, torch.int64), ): for params in combinations_grid(extra_dims=extra_dims, format=formats, dtype=dtypes): - yield make_bounding_box_loader(**params, spatial_size=spatial_size) + yield make_bbox_loader(**params, spatial_size=spatial_size) -make_bounding_boxes = from_loaders(make_bounding_box_loaders) +make_bboxes = from_loaders(make_bbox_loaders) class MaskLoader(TensorLoader): diff --git a/test/test_datapoints.py b/test/test_datapoints.py index a5f09043582..abb22a2134a 100644 --- a/test/test_datapoints.py +++ b/test/test_datapoints.py @@ -23,15 +23,13 @@ def test_mask_instance(data): @pytest.mark.parametrize("data", 
[torch.randint(0, 32, size=(5, 4)), [[0, 0, 5, 5], [2, 2, 7, 7]]]) -@pytest.mark.parametrize( - "format", ["XYXY", "CXCYWH", datapoints.BoundingBoxFormat.XYXY, datapoints.BoundingBoxFormat.XYWH] -) +@pytest.mark.parametrize("format", ["XYXY", "CXCYWH", datapoints.BBoxFormat.XYXY, datapoints.BBoxFormat.XYWH]) def test_bbox_instance(data, format): - bboxes = datapoints.BoundingBoxes(data, format=format, spatial_size=(32, 32)) + bboxes = datapoints.BBoxes(data, format=format, spatial_size=(32, 32)) assert isinstance(bboxes, torch.Tensor) assert bboxes.ndim == 2 and bboxes.shape[1] == 4 if isinstance(format, str): - format = datapoints.BoundingBoxFormat[(format.upper())] + format = datapoints.BBoxFormat[(format.upper())] assert bboxes.format == format @@ -164,7 +162,7 @@ def test_wrap_like(): [ datapoints.Image(torch.rand(3, 16, 16)), datapoints.Video(torch.rand(2, 3, 16, 16)), - datapoints.BoundingBoxes([0.0, 1.0, 2.0, 3.0], format=datapoints.BoundingBoxFormat.XYXY, spatial_size=(10, 10)), + datapoints.BBoxes([0.0, 1.0, 2.0, 3.0], format=datapoints.BBoxFormat.XYXY, spatial_size=(10, 10)), datapoints.Mask(torch.randint(0, 256, (16, 16), dtype=torch.uint8)), ], ) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index d1f24410703..c5ae7c6de93 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -9,7 +9,7 @@ from common_utils import ( assert_equal, DEFAULT_EXTRA_DIMS, - make_bounding_box, + make_bbox, make_detection_mask, make_image, make_images, @@ -20,7 +20,7 @@ from prototype_common_utils import make_label, make_one_hot_labels -from torchvision.datapoints import BoundingBoxes, BoundingBoxFormat, Image, Mask, Video +from torchvision.datapoints import BBoxes, BBoxFormat, Image, Mask, Video from torchvision.prototype import datapoints, transforms from torchvision.transforms.v2._utils import _convert_fill_arg from torchvision.transforms.v2.functional import InterpolationMode, pil_to_tensor, to_image_pil @@ -78,7 +78,7 @@ def test_mixup_cutmix(transform, input): input_copy = dict(input) for unsup_data in [ make_label(), - make_bounding_box(format="XYXY"), + make_bbox(format="XYXY"), make_detection_mask(), make_segmentation_mask(), ]: @@ -101,10 +101,10 @@ def test__extract_image_targets_assertion(self, mocker): self.create_fake_image(mocker, Image), # labels, bboxes, masks mocker.MagicMock(spec=datapoints.Label), - mocker.MagicMock(spec=BoundingBoxes), + mocker.MagicMock(spec=BBoxes), mocker.MagicMock(spec=Mask), # labels, bboxes, masks - mocker.MagicMock(spec=BoundingBoxes), + mocker.MagicMock(spec=BBoxes), mocker.MagicMock(spec=Mask), ] @@ -122,11 +122,11 @@ def test__extract_image_targets(self, image_type, label_type, mocker): self.create_fake_image(mocker, image_type), # labels, bboxes, masks mocker.MagicMock(spec=label_type), - mocker.MagicMock(spec=BoundingBoxes), + mocker.MagicMock(spec=BBoxes), mocker.MagicMock(spec=Mask), # labels, bboxes, masks mocker.MagicMock(spec=label_type), - mocker.MagicMock(spec=BoundingBoxes), + mocker.MagicMock(spec=BBoxes), mocker.MagicMock(spec=Mask), ] @@ -142,7 +142,7 @@ def test__extract_image_targets(self, image_type, label_type, mocker): for target in targets: for key, type_ in [ - ("boxes", BoundingBoxes), + ("boxes", BBoxes), ("masks", Mask), ("labels", label_type), ]: @@ -163,7 +163,7 @@ def test__copy_paste(self, label_type): if label_type == datapoints.OneHotLabel: labels = torch.nn.functional.one_hot(labels, num_classes=5) target = { - "boxes": BoundingBoxes( + "boxes": BBoxes( 
torch.tensor([[2.0, 3.0, 8.0, 9.0], [20.0, 20.0, 30.0, 30.0]]), format="XYXY", spatial_size=(32, 32) ), "masks": Mask(masks), @@ -178,7 +178,7 @@ def test__copy_paste(self, label_type): if label_type == datapoints.OneHotLabel: paste_labels = torch.nn.functional.one_hot(paste_labels, num_classes=5) paste_target = { - "boxes": BoundingBoxes( + "boxes": BBoxes( torch.tensor([[12.0, 13.0, 19.0, 18.0], [1.0, 15.0, 8.0, 19.0]]), format="XYXY", spatial_size=(32, 32) ), "masks": Mask(paste_masks), @@ -216,7 +216,7 @@ def test__get_params(self, mocker): flat_inputs = [ make_image(size=spatial_size, color_space="RGB"), - make_bounding_box(format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=batch_shape), + make_bbox(format=BBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=batch_shape), ] params = transform._get_params(flat_inputs) @@ -311,9 +311,7 @@ def test__transform_culling(self, mocker): ), ) - bounding_boxes = make_bounding_box( - format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=(batch_size,) - ) + bboxes = make_bbox(format=BBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=(batch_size,)) masks = make_detection_mask(size=spatial_size, batch_dims=(batch_size,)) labels = make_label(extra_dims=(batch_size,)) @@ -322,17 +320,17 @@ def test__transform_culling(self, mocker): output = transform( dict( - bounding_boxes=bounding_boxes, + bboxes=bboxes, masks=masks, labels=labels, ) ) - assert_equal(output["bounding_boxes"], bounding_boxes[is_valid]) + assert_equal(output["bboxes"], bboxes[is_valid]) assert_equal(output["masks"], masks[is_valid]) assert_equal(output["labels"], labels[is_valid]) - def test__transform_bounding_boxes_clamping(self, mocker): + def test__transform_bboxes_clamping(self, mocker): batch_size = 3 spatial_size = (10, 10) @@ -349,15 +347,13 @@ def test__transform_bounding_boxes_clamping(self, mocker): ), ) - bounding_boxes = make_bounding_box( - format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=(batch_size,) - ) - mock = mocker.patch("torchvision.prototype.transforms._geometry.F.clamp_bounding_boxes") + bboxes = make_bbox(format=BBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=(batch_size,)) + mock = mocker.patch("torchvision.prototype.transforms._geometry.F.clamp_bboxes") transform = transforms.FixedSizeCrop((-1, -1)) mocker.patch("torchvision.prototype.transforms._geometry.has_any", return_value=True) - transform(bounding_boxes) + transform(bboxes) mock.assert_called_once() @@ -390,7 +386,7 @@ class TestPermuteDimensions: def test_call(self, dims, inverse_dims): sample = dict( image=make_image(), - bounding_boxes=make_bounding_box(format=BoundingBoxFormat.XYXY), + bboxes=make_bbox(format=BBoxFormat.XYXY), video=make_video(), str="str", int=0, @@ -434,7 +430,7 @@ class TestTransposeDimensions: def test_call(self, dims): sample = dict( image=make_image(), - bounding_boxes=make_bounding_box(format=BoundingBoxFormat.XYXY), + bboxes=make_bbox(format=BBoxFormat.XYXY), video=make_video(), str="str", int=0, @@ -496,7 +492,7 @@ def make_datapoints(): pil_image = to_image_pil(make_image(size=size, color_space="RGB")) target = { - "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bbox(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), "masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long), } @@ -505,7 +501,7 @@ def 
make_datapoints(): tensor_image = torch.Tensor(make_image(size=size, color_space="RGB")) target = { - "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bbox(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), "masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long), } @@ -514,7 +510,7 @@ def make_datapoints(): datapoint_image = make_image(size=size, color_space="RGB") target = { - "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bbox(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), "masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long), } diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index e5624d78fed..0e1bce81560 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -16,8 +16,8 @@ assert_equal, assert_run_python_script, cpu_and_cuda, - make_bounding_box, - make_bounding_boxes, + make_bbox, + make_bboxes, make_detection_mask, make_image, make_images, @@ -45,9 +45,9 @@ def make_pil_images(*args, **kwargs): yield to_pil_image(image) -def make_vanilla_tensor_bounding_boxes(*args, **kwargs): - for bounding_boxes in make_bounding_boxes(*args, **kwargs): - yield bounding_boxes.data +def make_vanilla_tensor_bboxes(*args, **kwargs): + for bboxes in make_bboxes(*args, **kwargs): + yield bboxes.data def parametrize(transforms_with_inputs): @@ -69,7 +69,7 @@ def auto_augment_adapter(transform, input, device): adapted_input = {} image_or_video_found = False for key, value in input.items(): - if isinstance(value, (datapoints.BoundingBoxes, datapoints.Mask)): + if isinstance(value, (datapoints.BBoxes, datapoints.Mask)): # AA transforms don't support bounding boxes or masks continue elif check_type(value, (datapoints.Image, datapoints.Video, is_simple_tensor, PIL.Image.Image)): @@ -143,8 +143,8 @@ class TestSmoke: (transforms.RandomZoomOut(p=1.0), None), (transforms.Resize([16, 16], antialias=True), None), (transforms.ScaleJitter((16, 16), scale_range=(0.8, 1.2), antialias=True), None), - (transforms.ClampBoundingBoxes(), None), - (transforms.ConvertBoundingBoxFormat(datapoints.BoundingBoxFormat.CXCYWH), None), + (transforms.ClampBBoxes(), None), + (transforms.ConvertBBoxFormat(datapoints.BBoxFormat.CXCYWH), None), (transforms.ConvertImageDtype(), None), (transforms.GaussianBlur(kernel_size=3), None), ( @@ -180,16 +180,10 @@ def test_common(self, transform, adapter, container_type, image_or_video, device image_datapoint=make_image(size=spatial_size), video_datapoint=make_video(size=spatial_size), image_pil=next(make_pil_images(sizes=[spatial_size], color_spaces=["RGB"])), - bounding_boxes_xyxy=make_bounding_box( - format=datapoints.BoundingBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=(3,) - ), - bounding_boxes_xywh=make_bounding_box( - format=datapoints.BoundingBoxFormat.XYWH, spatial_size=spatial_size, batch_dims=(4,) - ), - bounding_boxes_cxcywh=make_bounding_box( - format=datapoints.BoundingBoxFormat.CXCYWH, spatial_size=spatial_size, batch_dims=(5,) - ), - bounding_boxes_degenerate_xyxy=datapoints.BoundingBoxes( + bboxes_xyxy=make_bbox(format=datapoints.BBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=(3,)), + 
bboxes_xywh=make_bbox(format=datapoints.BBoxFormat.XYWH, spatial_size=spatial_size, batch_dims=(4,)), + bboxes_cxcywh=make_bbox(format=datapoints.BBoxFormat.CXCYWH, spatial_size=spatial_size, batch_dims=(5,)), + bboxes_degenerate_xyxy=datapoints.BBoxes( [ [0, 0, 0, 0], # no height or width [0, 0, 0, 1], # no height @@ -198,10 +192,10 @@ def test_common(self, transform, adapter, container_type, image_or_video, device [0, 2, 1, 1], # x1 < x2, y1 > y2 [2, 2, 1, 1], # x1 > x2, y1 > y2 ], - format=datapoints.BoundingBoxFormat.XYXY, + format=datapoints.BBoxFormat.XYXY, spatial_size=spatial_size, ), - bounding_boxes_degenerate_xywh=datapoints.BoundingBoxes( + bboxes_degenerate_xywh=datapoints.BBoxes( [ [0, 0, 0, 0], # no height or width [0, 0, 0, 1], # no height @@ -210,10 +204,10 @@ def test_common(self, transform, adapter, container_type, image_or_video, device [0, 0, -1, 1], # negative width [0, 0, -1, -1], # negative height and width ], - format=datapoints.BoundingBoxFormat.XYWH, + format=datapoints.BBoxFormat.XYWH, spatial_size=spatial_size, ), - bounding_boxes_degenerate_cxcywh=datapoints.BoundingBoxes( + bboxes_degenerate_cxcywh=datapoints.BBoxes( [ [0, 0, 0, 0], # no height or width [0, 0, 0, 1], # no height @@ -222,7 +216,7 @@ def test_common(self, transform, adapter, container_type, image_or_video, device [0, 0, -1, 1], # negative width [0, 0, -1, -1], # negative height and width ], - format=datapoints.BoundingBoxFormat.CXCYWH, + format=datapoints.BBoxFormat.CXCYWH, spatial_size=spatial_size, ), detection_mask=make_detection_mask(size=spatial_size), @@ -261,20 +255,18 @@ def test_common(self, transform, adapter, container_type, image_or_video, device else: assert output_item is input_item - if isinstance(input_item, datapoints.BoundingBoxes) and not isinstance( - transform, transforms.ConvertBoundingBoxFormat - ): + if isinstance(input_item, datapoints.BBoxes) and not isinstance(transform, transforms.ConvertBBoxFormat): assert output_item.format == input_item.format # Enforce that the transform does not turn a degenerate box marked by RandomIoUCrop (or any other future # transform that does this), back into a valid one. 
# TODO: we should test that against all degenerate boxes above - for format in list(datapoints.BoundingBoxFormat): + for format in list(datapoints.BBoxFormat): sample = dict( - boxes=datapoints.BoundingBoxes([[0, 0, 0, 0]], format=format, spatial_size=(224, 244)), + boxes=datapoints.BBoxes([[0, 0, 0, 0]], format=format, spatial_size=(224, 244)), labels=torch.tensor([3]), ) - assert transforms.SanitizeBoundingBoxes()(sample)["boxes"].shape == (0, 4) + assert transforms.SanitizeBBoxes()(sample)["boxes"].shape == (0, 4) @parametrize( [ @@ -942,7 +934,7 @@ def test__transform(self, mocker, p): class TestTransform: @pytest.mark.parametrize( "inpt_type", - [torch.Tensor, PIL.Image.Image, datapoints.Image, np.ndarray, datapoints.BoundingBoxes, str, int], + [torch.Tensor, PIL.Image.Image, datapoints.Image, np.ndarray, datapoints.BBoxes, str, int], ) def test_check_transformed_types(self, inpt_type, mocker): # This test ensures that we correctly handle which types to transform and which to bypass @@ -960,7 +952,7 @@ def test_check_transformed_types(self, inpt_type, mocker): class TestToImageTensor: @pytest.mark.parametrize( "inpt_type", - [torch.Tensor, PIL.Image.Image, datapoints.Image, np.ndarray, datapoints.BoundingBoxes, str, int], + [torch.Tensor, PIL.Image.Image, datapoints.Image, np.ndarray, datapoints.BBoxes, str, int], ) def test__transform(self, inpt_type, mocker): fn = mocker.patch( @@ -971,7 +963,7 @@ def test__transform(self, inpt_type, mocker): inpt = mocker.MagicMock(spec=inpt_type) transform = transforms.ToImageTensor() transform(inpt) - if inpt_type in (datapoints.BoundingBoxes, datapoints.Image, str, int): + if inpt_type in (datapoints.BBoxes, datapoints.Image, str, int): assert fn.call_count == 0 else: fn.assert_called_once_with(inpt) @@ -980,7 +972,7 @@ def test__transform(self, inpt_type, mocker): class TestToImagePIL: @pytest.mark.parametrize( "inpt_type", - [torch.Tensor, PIL.Image.Image, datapoints.Image, np.ndarray, datapoints.BoundingBoxes, str, int], + [torch.Tensor, PIL.Image.Image, datapoints.Image, np.ndarray, datapoints.BBoxes, str, int], ) def test__transform(self, inpt_type, mocker): fn = mocker.patch("torchvision.transforms.v2.functional.to_image_pil") @@ -988,7 +980,7 @@ def test__transform(self, inpt_type, mocker): inpt = mocker.MagicMock(spec=inpt_type) transform = transforms.ToImagePIL() transform(inpt) - if inpt_type in (datapoints.BoundingBoxes, PIL.Image.Image, str, int): + if inpt_type in (datapoints.BBoxes, PIL.Image.Image, str, int): assert fn.call_count == 0 else: fn.assert_called_once_with(inpt, mode=transform.mode) @@ -997,7 +989,7 @@ def test__transform(self, inpt_type, mocker): class TestToPILImage: @pytest.mark.parametrize( "inpt_type", - [torch.Tensor, PIL.Image.Image, datapoints.Image, np.ndarray, datapoints.BoundingBoxes, str, int], + [torch.Tensor, PIL.Image.Image, datapoints.Image, np.ndarray, datapoints.BBoxes, str, int], ) def test__transform(self, inpt_type, mocker): fn = mocker.patch("torchvision.transforms.v2.functional.to_image_pil") @@ -1005,7 +997,7 @@ def test__transform(self, inpt_type, mocker): inpt = mocker.MagicMock(spec=inpt_type) transform = transforms.ToPILImage() transform(inpt) - if inpt_type in (PIL.Image.Image, datapoints.BoundingBoxes, str, int): + if inpt_type in (PIL.Image.Image, datapoints.BBoxes, str, int): assert fn.call_count == 0 else: fn.assert_called_once_with(inpt, mode=transform.mode) @@ -1014,7 +1006,7 @@ def test__transform(self, inpt_type, mocker): class TestToTensor: @pytest.mark.parametrize( "inpt_type", - 
[torch.Tensor, PIL.Image.Image, datapoints.Image, np.ndarray, datapoints.BoundingBoxes, str, int], + [torch.Tensor, PIL.Image.Image, datapoints.Image, np.ndarray, datapoints.BBoxes, str, int], ) def test__transform(self, inpt_type, mocker): fn = mocker.patch("torchvision.transforms.functional.to_tensor") @@ -1023,7 +1015,7 @@ def test__transform(self, inpt_type, mocker): with pytest.warns(UserWarning, match="deprecated and will be removed"): transform = transforms.ToTensor() transform(inpt) - if inpt_type in (datapoints.Image, torch.Tensor, datapoints.BoundingBoxes, str, int): + if inpt_type in (datapoints.Image, torch.Tensor, datapoints.BBoxes, str, int): assert fn.call_count == 0 else: fn.assert_called_once_with(inpt) @@ -1065,7 +1057,7 @@ def test__get_params(self, device, options, mocker): image = mocker.MagicMock(spec=datapoints.Image) image.num_channels = 3 image.spatial_size = (24, 32) - bboxes = datapoints.BoundingBoxes( + bboxes = datapoints.BBoxes( torch.tensor([[1, 1, 10, 10], [20, 20, 23, 23], [1, 20, 10, 23], [20, 1, 23, 10]]), format="XYXY", spatial_size=image.spatial_size, @@ -1103,7 +1095,7 @@ def test__get_params(self, device, options, mocker): def test__transform_empty_params(self, mocker): transform = transforms.RandomIoUCrop(sampler_options=[2.0]) image = datapoints.Image(torch.rand(1, 3, 4, 4)) - bboxes = datapoints.BoundingBoxes(torch.tensor([[1, 1, 2, 2]]), format="XYXY", spatial_size=(4, 4)) + bboxes = datapoints.BBoxes(torch.tensor([[1, 1, 2, 2]]), format="XYXY", spatial_size=(4, 4)) label = torch.tensor([1]) sample = [image, bboxes, label] # Let's mock transform._get_params to control the output: @@ -1123,7 +1115,7 @@ def test__transform(self, mocker): transform = transforms.RandomIoUCrop() image = datapoints.Image(torch.rand(3, 32, 24)) - bboxes = make_bounding_box(format="XYXY", spatial_size=(32, 24), batch_dims=(6,)) + bboxes = make_bbox(format="XYXY", spatial_size=(32, 24), batch_dims=(6,)) masks = make_detection_mask((32, 24), num_objects=6) sample = [image, bboxes, masks] @@ -1147,7 +1139,7 @@ def test__transform(self, mocker): # check number of bboxes vs number of labels: output_bboxes = output[1] - assert isinstance(output_bboxes, datapoints.BoundingBoxes) + assert isinstance(output_bboxes, datapoints.BBoxes) assert (output_bboxes[~is_within_crop_area] == 0).all() output_masks = output[2] @@ -1505,7 +1497,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): transforms.ConvertImageDtype(torch.float), ] if sanitize: - t += [transforms.SanitizeBoundingBoxes()] + t += [transforms.SanitizeBBoxes()] t = transforms.Compose(t) num_boxes = 5 @@ -1523,7 +1515,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): boxes = torch.randint(0, min(H, W) // 2, size=(num_boxes, 4)) boxes[:, 2:] += boxes[:, :2] boxes = boxes.clamp(min=0, max=min(H, W)) - boxes = datapoints.BoundingBoxes(boxes, format="XYXY", spatial_size=(H, W)) + boxes = datapoints.BBoxes(boxes, format="XYXY", spatial_size=(H, W)) masks = datapoints.Mask(torch.randint(0, 2, size=(num_boxes, H, W), dtype=torch.uint8)) @@ -1546,7 +1538,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): # ssd and ssdlite contain RandomIoUCrop which may "remove" some bbox. 
It # doesn't remove them strictly speaking, it just marks some boxes as # degenerate and those boxes will be later removed by - # SanitizeBoundingBoxes(), which we add to the pipelines if the sanitize + # SanitizeBBoxes(), which we add to the pipelines if the sanitize # param is True. # Note that the values below are probably specific to the random seed # set above (which is fine). @@ -1560,7 +1552,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): @pytest.mark.parametrize("min_size", (1, 10)) @pytest.mark.parametrize("labels_getter", ("default", lambda inputs: inputs["labels"], None, lambda inputs: None)) @pytest.mark.parametrize("sample_type", (tuple, dict)) -def test_sanitize_bounding_boxes(min_size, labels_getter, sample_type): +def test_sanitize_bboxes(min_size, labels_getter, sample_type): if sample_type is tuple and not isinstance(labels_getter, str): # The "lambda inputs: inputs["labels"]" labels_getter used in this test @@ -1594,9 +1586,9 @@ def test_sanitize_bounding_boxes(min_size, labels_getter, sample_type): boxes = torch.tensor(boxes) labels = torch.arange(boxes.shape[0]) - boxes = datapoints.BoundingBoxes( + boxes = datapoints.BBoxes( boxes, - format=datapoints.BoundingBoxFormat.XYXY, + format=datapoints.BBoxFormat.XYXY, spatial_size=(H, W), ) @@ -1616,7 +1608,7 @@ def test_sanitize_bounding_boxes(min_size, labels_getter, sample_type): img = sample.pop("image") sample = (img, sample) - out = transforms.SanitizeBoundingBoxes(min_size=min_size, labels_getter=labels_getter)(sample) + out = transforms.SanitizeBBoxes(min_size=min_size, labels_getter=labels_getter)(sample) if sample_type is tuple: out_image = out[0] @@ -1634,7 +1626,7 @@ def test_sanitize_bounding_boxes(min_size, labels_getter, sample_type): assert out_image is input_img assert out_whatever is whatever - assert isinstance(out_boxes, datapoints.BoundingBoxes) + assert isinstance(out_boxes, datapoints.BBoxes) assert isinstance(out_masks, datapoints.Mask) if labels_getter is None or (callable(labels_getter) and labels_getter({"labels": "blah"}) is None): @@ -1646,42 +1638,42 @@ def test_sanitize_bounding_boxes(min_size, labels_getter, sample_type): assert out_labels.tolist() == valid_indices -def test_sanitize_bounding_boxes_errors(): +def test_sanitize_bboxes_errors(): - good_bbox = datapoints.BoundingBoxes( + good_bbox = datapoints.BBoxes( [[0, 0, 10, 10]], - format=datapoints.BoundingBoxFormat.XYXY, + format=datapoints.BBoxFormat.XYXY, spatial_size=(20, 20), ) with pytest.raises(ValueError, match="min_size must be >= 1"): - transforms.SanitizeBoundingBoxes(min_size=0) + transforms.SanitizeBBoxes(min_size=0) with pytest.raises(ValueError, match="labels_getter should either be 'default'"): - transforms.SanitizeBoundingBoxes(labels_getter=12) + transforms.SanitizeBBoxes(labels_getter=12) with pytest.raises(ValueError, match="Could not infer where the labels are"): bad_labels_key = {"bbox": good_bbox, "BAD_KEY": torch.arange(good_bbox.shape[0])} - transforms.SanitizeBoundingBoxes()(bad_labels_key) + transforms.SanitizeBBoxes()(bad_labels_key) with pytest.raises(ValueError, match="must be a tensor"): not_a_tensor = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0]).tolist()} - transforms.SanitizeBoundingBoxes()(not_a_tensor) + transforms.SanitizeBBoxes()(not_a_tensor) with pytest.raises(ValueError, match="Number of boxes"): different_sizes = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0] + 3)} - transforms.SanitizeBoundingBoxes()(different_sizes) + 
transforms.SanitizeBBoxes()(different_sizes) with pytest.raises(ValueError, match="boxes must be of shape"): - bad_bbox = datapoints.BoundingBoxes( # batch with 2 elements + bad_bbox = datapoints.BBoxes( # batch with 2 elements [ [[0, 0, 10, 10]], [[0, 0, 10, 10]], ], - format=datapoints.BoundingBoxFormat.XYXY, + format=datapoints.BBoxFormat.XYXY, spatial_size=(20, 20), ) different_sizes = {"bbox": bad_bbox, "labels": torch.arange(bad_bbox.shape[0])} - transforms.SanitizeBoundingBoxes()(different_sizes) + transforms.SanitizeBBoxes()(different_sizes) @pytest.mark.parametrize( diff --git a/test/test_transforms_v2_consistency.py b/test/test_transforms_v2_consistency.py index 9adec66b3c4..781c3cd864d 100644 --- a/test/test_transforms_v2_consistency.py +++ b/test/test_transforms_v2_consistency.py @@ -17,7 +17,7 @@ ArgsKwargs, assert_close, assert_equal, - make_bounding_box, + make_bbox, make_detection_mask, make_image, make_images, @@ -1090,7 +1090,7 @@ def make_label(extra_dims, categories): pil_image = to_image_pil(make_image(size=size, color_space="RGB")) target = { - "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bbox(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: @@ -1100,7 +1100,7 @@ def make_label(extra_dims, categories): tensor_image = torch.Tensor(make_image(size=size, color_space="RGB", dtype=torch.float32)) target = { - "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bbox(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: @@ -1110,7 +1110,7 @@ def make_label(extra_dims, categories): datapoint_image = make_image(size=size, color_space="RGB", dtype=torch.float32) target = { - "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bbox(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: @@ -1127,7 +1127,7 @@ def make_label(extra_dims, categories): v2_transforms.Compose( [ v2_transforms.RandomIoUCrop(), - v2_transforms.SanitizeBoundingBoxes(labels_getter=lambda sample: sample[1]["labels"]), + v2_transforms.SanitizeBBoxes(labels_getter=lambda sample: sample[1]["labels"]), ] ), {"with_mask": False}, diff --git a/test/test_transforms_v2_functional.py b/test/test_transforms_v2_functional.py index 5d692b58108..c46b34f4755 100644 --- a/test/test_transforms_v2_functional.py +++ b/test/test_transforms_v2_functional.py @@ -16,7 +16,7 @@ cache, cpu_and_cuda, DEFAULT_SQUARE_SPATIAL_SIZE, - make_bounding_boxes, + make_bboxes, needs_cuda, parametrized_error_message, set_rng_seed, @@ -26,7 +26,7 @@ from torchvision.transforms.functional import _get_perspective_coeffs from torchvision.transforms.v2 import functional as F from torchvision.transforms.v2.functional._geometry import _center_crop_compute_padding -from torchvision.transforms.v2.functional._meta import clamp_bounding_boxes, convert_format_bounding_boxes +from torchvision.transforms.v2.functional._meta import clamp_bboxes, convert_format_bboxes from torchvision.transforms.v2.utils import is_simple_tensor from transforms_v2_dispatcher_infos import DISPATCHER_INFOS from transforms_v2_kernel_infos 
import KERNEL_INFOS @@ -176,7 +176,7 @@ def test_batched_vs_single(self, test_id, info, args_kwargs, device): # Everything to the left is considered a batch dimension. data_dims = { datapoints.Image: 3, - datapoints.BoundingBoxes: 1, + datapoints.BBoxes: 1, # `Mask`'s are special in the sense that the data dimensions depend on the type of mask. For detection masks # it is 3 `(*, N, H, W)`, but for segmentation masks it is 2 `(*, H, W)`. Since both a grouped under one # type all kernels should also work without differentiating between the two. Thus, we go with 2 here as @@ -515,15 +515,15 @@ def test_unkown_type(self, info): [ info for info in DISPATCHER_INFOS - if datapoints.BoundingBoxes in info.kernels and info.dispatcher is not F.convert_format_bounding_boxes + if datapoints.BBoxes in info.kernels and info.dispatcher is not F.convert_format_bboxes ], - args_kwargs_fn=lambda info: info.sample_inputs(datapoints.BoundingBoxes), + args_kwargs_fn=lambda info: info.sample_inputs(datapoints.BBoxes), ) - def test_bounding_boxes_format_consistency(self, info, args_kwargs): - (bounding_boxes, *other_args), kwargs = args_kwargs.load() - format = bounding_boxes.format + def test_bboxes_format_consistency(self, info, args_kwargs): + (bboxes, *other_args), kwargs = args_kwargs.load() + format = bboxes.format - output = info.dispatcher(bounding_boxes, *other_args, **kwargs) + output = info.dispatcher(bboxes, *other_args, **kwargs) assert output.format == format @@ -562,61 +562,59 @@ def assert_samples_from_standard_normal(t): assert_samples_from_standard_normal(F.normalize_image_tensor(image, mean, std)) -class TestClampBoundingBoxes: +class TestClampBBoxes: @pytest.mark.parametrize( "metadata", [ dict(), - dict(format=datapoints.BoundingBoxFormat.XYXY), + dict(format=datapoints.BBoxFormat.XYXY), dict(spatial_size=(1, 1)), ], ) def test_simple_tensor_insufficient_metadata(self, metadata): - simple_tensor = next(make_bounding_boxes()).as_subclass(torch.Tensor) + simple_tensor = next(make_bboxes()).as_subclass(torch.Tensor) with pytest.raises(ValueError, match=re.escape("`format` and `spatial_size` has to be passed")): - F.clamp_bounding_boxes(simple_tensor, **metadata) + F.clamp_bboxes(simple_tensor, **metadata) @pytest.mark.parametrize( "metadata", [ - dict(format=datapoints.BoundingBoxFormat.XYXY), + dict(format=datapoints.BBoxFormat.XYXY), dict(spatial_size=(1, 1)), - dict(format=datapoints.BoundingBoxFormat.XYXY, spatial_size=(1, 1)), + dict(format=datapoints.BBoxFormat.XYXY, spatial_size=(1, 1)), ], ) def test_datapoint_explicit_metadata(self, metadata): - datapoint = next(make_bounding_boxes()) + datapoint = next(make_bboxes()) with pytest.raises(ValueError, match=re.escape("`format` and `spatial_size` must not be passed")): - F.clamp_bounding_boxes(datapoint, **metadata) + F.clamp_bboxes(datapoint, **metadata) -class TestConvertFormatBoundingBoxes: +class TestConvertFormatBBoxes: @pytest.mark.parametrize( ("inpt", "old_format"), [ - (next(make_bounding_boxes()), None), - (next(make_bounding_boxes()).as_subclass(torch.Tensor), datapoints.BoundingBoxFormat.XYXY), + (next(make_bboxes()), None), + (next(make_bboxes()).as_subclass(torch.Tensor), datapoints.BBoxFormat.XYXY), ], ) def test_missing_new_format(self, inpt, old_format): with pytest.raises(TypeError, match=re.escape("missing 1 required argument: 'new_format'")): - F.convert_format_bounding_boxes(inpt, old_format) + F.convert_format_bboxes(inpt, old_format) def test_simple_tensor_insufficient_metadata(self): - simple_tensor = 
next(make_bounding_boxes()).as_subclass(torch.Tensor) + simple_tensor = next(make_bboxes()).as_subclass(torch.Tensor) with pytest.raises(ValueError, match=re.escape("`old_format` has to be passed")): - F.convert_format_bounding_boxes(simple_tensor, new_format=datapoints.BoundingBoxFormat.CXCYWH) + F.convert_format_bboxes(simple_tensor, new_format=datapoints.BBoxFormat.CXCYWH) def test_datapoint_explicit_metadata(self): - datapoint = next(make_bounding_boxes()) + datapoint = next(make_bboxes()) with pytest.raises(ValueError, match=re.escape("`old_format` must not be passed")): - F.convert_format_bounding_boxes( - datapoint, old_format=datapoint.format, new_format=datapoints.BoundingBoxFormat.CXCYWH - ) + F.convert_format_bboxes(datapoint, old_format=datapoint.format, new_format=datapoints.BBoxFormat.CXCYWH) # TODO: All correctness checks below this line should be ported to be references on a `KernelInfo` in @@ -649,7 +647,7 @@ def _compute_affine_matrix(angle_, translate_, scale_, shear_, center_): @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "format", - [datapoints.BoundingBoxFormat.XYXY, datapoints.BoundingBoxFormat.XYWH, datapoints.BoundingBoxFormat.CXCYWH], + [datapoints.BBoxFormat.XYXY, datapoints.BBoxFormat.XYWH, datapoints.BBoxFormat.CXCYWH], ) @pytest.mark.parametrize( "top, left, height, width, expected_bboxes", @@ -658,7 +656,7 @@ def _compute_affine_matrix(angle_, translate_, scale_, shear_, center_): [-8, 12, 70, 40, [(-2.0, 23.0, 13.0, 43.0), (38.0, 13.0, 58.0, 30.0), (33.0, 54.0, 44.0, 70.0)]], ], ) -def test_correctness_crop_bounding_boxes(device, format, top, left, height, width, expected_bboxes): +def test_correctness_crop_bboxes(device, format, top, left, height, width, expected_bboxes): # Expected bboxes computed using Albumentations: # import numpy as np @@ -672,7 +670,7 @@ def test_correctness_crop_bounding_boxes(device, format, top, left, height, widt # out_box = denormalize_bbox(n_out_box, height, width) # expected_bboxes.append(out_box) - format = datapoints.BoundingBoxFormat.XYXY + format = datapoints.BBoxFormat.XYXY spatial_size = (64, 76) in_boxes = [ [10.0, 15.0, 25.0, 35.0], @@ -680,14 +678,14 @@ def test_correctness_crop_bounding_boxes(device, format, top, left, height, widt [45.0, 46.0, 56.0, 62.0], ] in_boxes = torch.tensor(in_boxes, device=device) - if format != datapoints.BoundingBoxFormat.XYXY: - in_boxes = convert_format_bounding_boxes(in_boxes, datapoints.BoundingBoxFormat.XYXY, format) + if format != datapoints.BBoxFormat.XYXY: + in_boxes = convert_format_bboxes(in_boxes, datapoints.BBoxFormat.XYXY, format) - expected_bboxes = clamp_bounding_boxes( - datapoints.BoundingBoxes(expected_bboxes, format="XYXY", spatial_size=spatial_size) + expected_bboxes = clamp_bboxes( + datapoints.BBoxes(expected_bboxes, format="XYXY", spatial_size=spatial_size) ).tolist() - output_boxes, output_spatial_size = F.crop_bounding_boxes( + output_boxes, output_spatial_size = F.crop_bboxes( in_boxes, format, top, @@ -696,8 +694,8 @@ def test_correctness_crop_bounding_boxes(device, format, top, left, height, widt spatial_size[1], ) - if format != datapoints.BoundingBoxFormat.XYXY: - output_boxes = convert_format_bounding_boxes(output_boxes, format, datapoints.BoundingBoxFormat.XYXY) + if format != datapoints.BBoxFormat.XYXY: + output_boxes = convert_format_bboxes(output_boxes, format, datapoints.BBoxFormat.XYXY) torch.testing.assert_close(output_boxes.tolist(), expected_bboxes) torch.testing.assert_close(output_spatial_size, spatial_size) @@ 
-718,7 +716,7 @@ def test_correctness_vertical_flip_segmentation_mask_on_fixed_input(device): @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "format", - [datapoints.BoundingBoxFormat.XYXY, datapoints.BoundingBoxFormat.XYWH, datapoints.BoundingBoxFormat.CXCYWH], + [datapoints.BBoxFormat.XYXY, datapoints.BBoxFormat.XYWH, datapoints.BBoxFormat.CXCYWH], ) @pytest.mark.parametrize( "top, left, height, width, size", @@ -727,7 +725,7 @@ def test_correctness_vertical_flip_segmentation_mask_on_fixed_input(device): [-5, 5, 35, 45, (32, 34)], ], ) -def test_correctness_resized_crop_bounding_boxes(device, format, top, left, height, width, size): +def test_correctness_resized_crop_bboxes(device, format, top, left, height, width, size): def _compute_expected_bbox(bbox, top_, left_, height_, width_, size_): # bbox should be xyxy bbox[0] = (bbox[0] - left_) * size_[1] / width_ @@ -736,7 +734,7 @@ def _compute_expected_bbox(bbox, top_, left_, height_, width_, size_): bbox[3] = (bbox[3] - top_) * size_[0] / height_ return bbox - format = datapoints.BoundingBoxFormat.XYXY + format = datapoints.BBoxFormat.XYXY spatial_size = (100, 100) in_boxes = [ [10.0, 10.0, 20.0, 20.0], @@ -747,16 +745,14 @@ def _compute_expected_bbox(bbox, top_, left_, height_, width_, size_): expected_bboxes.append(_compute_expected_bbox(list(in_box), top, left, height, width, size)) expected_bboxes = torch.tensor(expected_bboxes, device=device) - in_boxes = datapoints.BoundingBoxes( - in_boxes, format=datapoints.BoundingBoxFormat.XYXY, spatial_size=spatial_size, device=device - ) - if format != datapoints.BoundingBoxFormat.XYXY: - in_boxes = convert_format_bounding_boxes(in_boxes, datapoints.BoundingBoxFormat.XYXY, format) + in_boxes = datapoints.BBoxes(in_boxes, format=datapoints.BBoxFormat.XYXY, spatial_size=spatial_size, device=device) + if format != datapoints.BBoxFormat.XYXY: + in_boxes = convert_format_bboxes(in_boxes, datapoints.BBoxFormat.XYXY, format) - output_boxes, output_spatial_size = F.resized_crop_bounding_boxes(in_boxes, format, top, left, height, width, size) + output_boxes, output_spatial_size = F.resized_crop_bboxes(in_boxes, format, top, left, height, width, size) - if format != datapoints.BoundingBoxFormat.XYXY: - output_boxes = convert_format_bounding_boxes(output_boxes, format, datapoints.BoundingBoxFormat.XYXY) + if format != datapoints.BBoxFormat.XYXY: + output_boxes = convert_format_bboxes(output_boxes, format, datapoints.BBoxFormat.XYXY) torch.testing.assert_close(output_boxes, expected_bboxes) torch.testing.assert_close(output_spatial_size, size) @@ -776,7 +772,7 @@ def _parse_padding(padding): @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("padding", [[1], [1, 1], [1, 1, 2, 2]]) -def test_correctness_pad_bounding_boxes(device, padding): +def test_correctness_pad_bboxes(device, padding): def _compute_expected_bbox(bbox, padding_): pad_left, pad_up, _, _ = _parse_padding(padding_) @@ -784,14 +780,14 @@ def _compute_expected_bbox(bbox, padding_): format = bbox.format bbox = ( bbox.clone() - if format == datapoints.BoundingBoxFormat.XYXY - else convert_format_bounding_boxes(bbox, new_format=datapoints.BoundingBoxFormat.XYXY) + if format == datapoints.BBoxFormat.XYXY + else convert_format_bboxes(bbox, new_format=datapoints.BBoxFormat.XYXY) ) bbox[0::2] += pad_left bbox[1::2] += pad_up - bbox = convert_format_bounding_boxes(bbox, new_format=format) + bbox = convert_format_bboxes(bbox, new_format=format) if bbox.dtype != dtype: # Temporary cast to original 
dtype # e.g. float32 -> int @@ -803,12 +799,12 @@ def _compute_expected_spatial_size(bbox, padding_): height, width = bbox.spatial_size return height + pad_up + pad_down, width + pad_left + pad_right - for bboxes in make_bounding_boxes(): + for bboxes in make_bboxes(): bboxes = bboxes.to(device) bboxes_format = bboxes.format bboxes_spatial_size = bboxes.spatial_size - output_boxes, output_spatial_size = F.pad_bounding_boxes( + output_boxes, output_spatial_size = F.pad_bboxes( bboxes, format=bboxes_format, spatial_size=bboxes_spatial_size, padding=padding ) @@ -819,7 +815,7 @@ def _compute_expected_spatial_size(bbox, padding_): expected_bboxes = [] for bbox in bboxes: - bbox = datapoints.BoundingBoxes(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size) + bbox = datapoints.BBoxes(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size) expected_bboxes.append(_compute_expected_bbox(bbox, padding)) if len(expected_bboxes) > 1: @@ -849,7 +845,7 @@ def test_correctness_pad_segmentation_mask_on_fixed_input(device): [[[3, 2], [32, 3], [30, 24], [2, 25]], [[5, 5], [30, 3], [33, 19], [4, 25]]], ], ) -def test_correctness_perspective_bounding_boxes(device, startpoints, endpoints): +def test_correctness_perspective_bboxes(device, startpoints, endpoints): def _compute_expected_bbox(bbox, pcoeffs_): m1 = np.array( [ @@ -864,7 +860,7 @@ def _compute_expected_bbox(bbox, pcoeffs_): ] ) - bbox_xyxy = convert_format_bounding_boxes(bbox, new_format=datapoints.BoundingBoxFormat.XYXY) + bbox_xyxy = convert_format_bboxes(bbox, new_format=datapoints.BBoxFormat.XYXY) points = np.array( [ [bbox_xyxy[0].item(), bbox_xyxy[1].item(), 1.0], @@ -884,24 +880,24 @@ def _compute_expected_bbox(bbox, pcoeffs_): np.max(transformed_points[:, 1]), ] ) - out_bbox = datapoints.BoundingBoxes( + out_bbox = datapoints.BBoxes( out_bbox, - format=datapoints.BoundingBoxFormat.XYXY, + format=datapoints.BBoxFormat.XYXY, spatial_size=bbox.spatial_size, dtype=bbox.dtype, device=bbox.device, ) - return clamp_bounding_boxes(convert_format_bounding_boxes(out_bbox, new_format=bbox.format)) + return clamp_bboxes(convert_format_bboxes(out_bbox, new_format=bbox.format)) spatial_size = (32, 38) pcoeffs = _get_perspective_coeffs(startpoints, endpoints) inv_pcoeffs = _get_perspective_coeffs(endpoints, startpoints) - for bboxes in make_bounding_boxes(spatial_size=spatial_size, extra_dims=((4,),)): + for bboxes in make_bboxes(spatial_size=spatial_size, extra_dims=((4,),)): bboxes = bboxes.to(device) - output_bboxes = F.perspective_bounding_boxes( + output_bboxes = F.perspective_bboxes( bboxes.as_subclass(torch.Tensor), format=bboxes.format, spatial_size=bboxes.spatial_size, @@ -915,7 +911,7 @@ def _compute_expected_bbox(bbox, pcoeffs_): expected_bboxes = [] for bbox in bboxes: - bbox = datapoints.BoundingBoxes(bbox, format=bboxes.format, spatial_size=bboxes.spatial_size) + bbox = datapoints.BBoxes(bbox, format=bboxes.format, spatial_size=bboxes.spatial_size) expected_bboxes.append(_compute_expected_bbox(bbox, inv_pcoeffs)) if len(expected_bboxes) > 1: expected_bboxes = torch.stack(expected_bboxes) @@ -929,12 +925,12 @@ def _compute_expected_bbox(bbox, pcoeffs_): "output_size", [(18, 18), [18, 15], (16, 19), [12], [46, 48]], ) -def test_correctness_center_crop_bounding_boxes(device, output_size): +def test_correctness_center_crop_bboxes(device, output_size): def _compute_expected_bbox(bbox, output_size_): format_ = bbox.format spatial_size_ = bbox.spatial_size dtype = bbox.dtype - bbox = convert_format_bounding_boxes(bbox.float(), 
format_, datapoints.BoundingBoxFormat.XYWH) + bbox = convert_format_bboxes(bbox.float(), format_, datapoints.BBoxFormat.XYWH) if len(output_size_) == 1: output_size_.append(output_size_[-1]) @@ -948,16 +944,16 @@ def _compute_expected_bbox(bbox, output_size_): bbox[3].item(), ] out_bbox = torch.tensor(out_bbox) - out_bbox = convert_format_bounding_boxes(out_bbox, datapoints.BoundingBoxFormat.XYWH, format_) - out_bbox = clamp_bounding_boxes(out_bbox, format=format_, spatial_size=output_size) + out_bbox = convert_format_bboxes(out_bbox, datapoints.BBoxFormat.XYWH, format_) + out_bbox = clamp_bboxes(out_bbox, format=format_, spatial_size=output_size) return out_bbox.to(dtype=dtype, device=bbox.device) - for bboxes in make_bounding_boxes(extra_dims=((4,),)): + for bboxes in make_bboxes(extra_dims=((4,),)): bboxes = bboxes.to(device) bboxes_format = bboxes.format bboxes_spatial_size = bboxes.spatial_size - output_boxes, output_spatial_size = F.center_crop_bounding_boxes( + output_boxes, output_spatial_size = F.center_crop_bboxes( bboxes, bboxes_format, bboxes_spatial_size, output_size ) @@ -966,7 +962,7 @@ def _compute_expected_bbox(bbox, output_size_): expected_bboxes = [] for bbox in bboxes: - bbox = datapoints.BoundingBoxes(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size) + bbox = datapoints.BBoxes(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size) expected_bboxes.append(_compute_expected_bbox(bbox, output_size)) if len(expected_bboxes) > 1: diff --git a/test/test_transforms_v2_refactored.py b/test/test_transforms_v2_refactored.py index 3b808d6b73c..96dd7980138 100644 --- a/test/test_transforms_v2_refactored.py +++ b/test/test_transforms_v2_refactored.py @@ -19,7 +19,7 @@ cpu_and_cuda, freeze_rng_state, ignore_jit_no_profile_information_warning, - make_bounding_box, + make_bbox, make_detection_mask, make_image, make_image_pil, @@ -196,7 +196,7 @@ def _check_dispatcher_dispatch(dispatcher, kernel, input, *args, **kwargs): assert isinstance(output, type(input)) - if isinstance(input, datapoints.BoundingBoxes): + if isinstance(input, datapoints.BBoxes): assert output.format == input.format @@ -306,7 +306,7 @@ def check_transform(transform_cls, input, *args, **kwargs): output = transform(input) assert isinstance(output, type(input)) - if isinstance(input, datapoints.BoundingBoxes): + if isinstance(input, datapoints.BBoxes): assert output.format == input.format _check_transform_v1_compatibility(transform, input) @@ -392,16 +392,16 @@ def assert_warns_antialias_default_value(): yield -def reference_affine_bounding_boxes_helper(bounding_boxes, *, format, spatial_size, affine_matrix): +def reference_affine_bboxes_helper(bboxes, *, format, spatial_size, affine_matrix): def transform(bbox): # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1 in_dtype = bbox.dtype if not torch.is_floating_point(bbox): bbox = bbox.float() - bbox_xyxy = F.convert_format_bounding_boxes( + bbox_xyxy = F.convert_format_bboxes( bbox.as_subclass(torch.Tensor), old_format=format, - new_format=datapoints.BoundingBoxFormat.XYXY, + new_format=datapoints.BBoxFormat.XYXY, inplace=True, ) points = np.array( @@ -422,15 +422,15 @@ def transform(bbox): ], dtype=bbox_xyxy.dtype, ) - out_bbox = F.convert_format_bounding_boxes( - out_bbox, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format, inplace=True + out_bbox = F.convert_format_bboxes( + out_bbox, old_format=datapoints.BBoxFormat.XYXY, new_format=format, inplace=True ) # It is important to clamp 
before casting, especially for CXCYWH format, dtype=int64 - out_bbox = F.clamp_bounding_boxes(out_bbox, format=format, spatial_size=spatial_size) + out_bbox = F.clamp_bboxes(out_bbox, format=format, spatial_size=spatial_size) out_bbox = out_bbox.to(dtype=in_dtype) return out_bbox - return torch.stack([transform(b) for b in bounding_boxes.reshape(-1, 4).unbind()]).reshape(bounding_boxes.shape) + return torch.stack([transform(b) for b in bboxes.reshape(-1, 4).unbind()]).reshape(bboxes.shape) class TestResize: @@ -503,25 +503,25 @@ def test_kernel_image_tensor(self, size, interpolation, use_max_size, antialias, check_scripted_vs_eager=not isinstance(size, int), ) - @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) + @pytest.mark.parametrize("format", list(datapoints.BBoxFormat)) @pytest.mark.parametrize("size", OUTPUT_SIZES) @pytest.mark.parametrize("use_max_size", [True, False]) @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_kernel_bounding_boxes(self, format, size, use_max_size, dtype, device): + def test_kernel_bboxes(self, format, size, use_max_size, dtype, device): if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): return - bounding_boxes = make_bounding_box( + bboxes = make_bbox( format=format, spatial_size=self.INPUT_SIZE, dtype=dtype, device=device, ) check_kernel( - F.resize_bounding_boxes, - bounding_boxes, - spatial_size=bounding_boxes.spatial_size, + F.resize_bboxes, + bboxes, + spatial_size=bboxes.spatial_size, size=size, **max_size_kwarg, check_scripted_vs_eager=not isinstance(size, int), @@ -541,7 +541,7 @@ def test_kernel_video(self): (F.resize_image_tensor, make_image_tensor), (F.resize_image_pil, make_image_pil), (F.resize_image_tensor, make_image), - (F.resize_bounding_boxes, make_bounding_box), + (F.resize_bboxes, make_bbox), (F.resize_mask, make_segmentation_mask), (F.resize_video, make_video), ], @@ -562,7 +562,7 @@ def test_dispatcher(self, size, kernel, make_input): (F.resize_image_tensor, torch.Tensor), (F.resize_image_pil, PIL.Image.Image), (F.resize_image_tensor, datapoints.Image), - (F.resize_bounding_boxes, datapoints.BoundingBoxes), + (F.resize_bboxes, datapoints.BBoxes), (F.resize_mask, datapoints.Mask), (F.resize_video, datapoints.Video), ], @@ -578,7 +578,7 @@ def test_dispatcher_signature(self, kernel, input_type): make_image_tensor, make_image_pil, make_image, - make_bounding_box, + make_bbox, make_segmentation_mask, make_detection_mask, make_video, @@ -612,45 +612,43 @@ def test_image_correctness(self, size, interpolation, use_max_size, fn): self._check_output_size(image, actual, size=size, **max_size_kwarg) torch.testing.assert_close(actual, expected, atol=1, rtol=0) - def _reference_resize_bounding_boxes(self, bounding_boxes, *, size, max_size=None): - old_height, old_width = bounding_boxes.spatial_size - new_height, new_width = self._compute_output_size( - input_size=bounding_boxes.spatial_size, size=size, max_size=max_size - ) + def _reference_resize_bboxes(self, bboxes, *, size, max_size=None): + old_height, old_width = bboxes.spatial_size + new_height, new_width = self._compute_output_size(input_size=bboxes.spatial_size, size=size, max_size=max_size) if (old_height, old_width) == (new_height, new_width): - return bounding_boxes + return bboxes affine_matrix = np.array( [ [new_width / old_width, 0, 0], [0, new_height / old_height, 0], ], - dtype="float64" if bounding_boxes.dtype == torch.float64 else "float32", + 
dtype="float64" if bboxes.dtype == torch.float64 else "float32", ) - expected_bboxes = reference_affine_bounding_boxes_helper( - bounding_boxes, - format=bounding_boxes.format, + expected_bboxes = reference_affine_bboxes_helper( + bboxes, + format=bboxes.format, spatial_size=(new_height, new_width), affine_matrix=affine_matrix, ) - return datapoints.BoundingBoxes.wrap_like(bounding_boxes, expected_bboxes, spatial_size=(new_height, new_width)) + return datapoints.BBoxes.wrap_like(bboxes, expected_bboxes, spatial_size=(new_height, new_width)) - @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) + @pytest.mark.parametrize("format", list(datapoints.BBoxFormat)) @pytest.mark.parametrize("size", OUTPUT_SIZES) @pytest.mark.parametrize("use_max_size", [True, False]) @pytest.mark.parametrize("fn", [F.resize, transform_cls_to_functional(transforms.Resize)]) - def test_bounding_boxes_correctness(self, format, size, use_max_size, fn): + def test_bboxes_correctness(self, format, size, use_max_size, fn): if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): return - bounding_boxes = make_bounding_box(format=format, spatial_size=self.INPUT_SIZE) + bboxes = make_bbox(format=format, spatial_size=self.INPUT_SIZE) - actual = fn(bounding_boxes, size=size, **max_size_kwarg) - expected = self._reference_resize_bounding_boxes(bounding_boxes, size=size, **max_size_kwarg) + actual = fn(bboxes, size=size, **max_size_kwarg) + expected = self._reference_resize_bboxes(bboxes, size=size, **max_size_kwarg) - self._check_output_size(bounding_boxes, actual, size=size, **max_size_kwarg) + self._check_output_size(bboxes, actual, size=size, **max_size_kwarg) torch.testing.assert_close(actual, expected) @pytest.mark.parametrize("interpolation", set(transforms.InterpolationMode) - set(INTERPOLATION_MODES)) @@ -684,7 +682,7 @@ def test_dispatcher_pil_antialias_warning(self): make_image_tensor, make_image_pil, make_image, - make_bounding_box, + make_bbox, make_segmentation_mask, make_detection_mask, make_video, @@ -753,7 +751,7 @@ def test_transform_unknown_size_error(self): make_image_tensor, make_image_pil, make_image, - make_bounding_box, + make_bbox, make_segmentation_mask, make_detection_mask, make_video, @@ -780,7 +778,7 @@ def test_noop(self, size, make_input): make_image_tensor, make_image_pil, make_image, - make_bounding_box, + make_bbox, make_segmentation_mask, make_detection_mask, make_video, @@ -805,16 +803,16 @@ class TestHorizontalFlip: def test_kernel_image_tensor(self, dtype, device): check_kernel(F.horizontal_flip_image_tensor, make_image(dtype=dtype, device=device)) - @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) + @pytest.mark.parametrize("format", list(datapoints.BBoxFormat)) @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_kernel_bounding_boxes(self, format, dtype, device): - bounding_boxes = make_bounding_box(format=format, dtype=dtype, device=device) + def test_kernel_bboxes(self, format, dtype, device): + bboxes = make_bbox(format=format, dtype=dtype, device=device) check_kernel( - F.horizontal_flip_bounding_boxes, - bounding_boxes, + F.horizontal_flip_bboxes, + bboxes, format=format, - spatial_size=bounding_boxes.spatial_size, + spatial_size=bboxes.spatial_size, ) @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) @@ -830,7 +828,7 @@ def test_kernel_video(self): (F.horizontal_flip_image_tensor, make_image_tensor), 
(F.horizontal_flip_image_pil, make_image_pil), (F.horizontal_flip_image_tensor, make_image), - (F.horizontal_flip_bounding_boxes, make_bounding_box), + (F.horizontal_flip_bboxes, make_bbox), (F.horizontal_flip_mask, make_segmentation_mask), (F.horizontal_flip_video, make_video), ], @@ -844,7 +842,7 @@ def test_dispatcher(self, kernel, make_input): (F.horizontal_flip_image_tensor, torch.Tensor), (F.horizontal_flip_image_pil, PIL.Image.Image), (F.horizontal_flip_image_tensor, datapoints.Image), - (F.horizontal_flip_bounding_boxes, datapoints.BoundingBoxes), + (F.horizontal_flip_bboxes, datapoints.BBoxes), (F.horizontal_flip_mask, datapoints.Mask), (F.horizontal_flip_video, datapoints.Video), ], @@ -854,7 +852,7 @@ def test_dispatcher_signature(self, kernel, input_type): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bbox, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_transform(self, make_input, device): @@ -871,39 +869,39 @@ def test_image_correctness(self, fn): torch.testing.assert_close(actual, expected) - def _reference_horizontal_flip_bounding_boxes(self, bounding_boxes): + def _reference_horizontal_flip_bboxes(self, bboxes): affine_matrix = np.array( [ - [-1, 0, bounding_boxes.spatial_size[1]], + [-1, 0, bboxes.spatial_size[1]], [0, 1, 0], ], - dtype="float64" if bounding_boxes.dtype == torch.float64 else "float32", + dtype="float64" if bboxes.dtype == torch.float64 else "float32", ) - expected_bboxes = reference_affine_bounding_boxes_helper( - bounding_boxes, - format=bounding_boxes.format, - spatial_size=bounding_boxes.spatial_size, + expected_bboxes = reference_affine_bboxes_helper( + bboxes, + format=bboxes.format, + spatial_size=bboxes.spatial_size, affine_matrix=affine_matrix, ) - return datapoints.BoundingBoxes.wrap_like(bounding_boxes, expected_bboxes) + return datapoints.BBoxes.wrap_like(bboxes, expected_bboxes) - @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) + @pytest.mark.parametrize("format", list(datapoints.BBoxFormat)) @pytest.mark.parametrize( "fn", [F.horizontal_flip, transform_cls_to_functional(transforms.RandomHorizontalFlip, p=1)] ) - def test_bounding_boxes_correctness(self, format, fn): - bounding_boxes = make_bounding_box(format=format) + def test_bboxes_correctness(self, format, fn): + bboxes = make_bbox(format=format) - actual = fn(bounding_boxes) - expected = self._reference_horizontal_flip_bounding_boxes(bounding_boxes) + actual = fn(bboxes) + expected = self._reference_horizontal_flip_bboxes(bboxes) torch.testing.assert_close(actual, expected) @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bbox, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_transform_noop(self, make_input, device): @@ -986,16 +984,16 @@ def test_kernel_image_tensor(self, param, value, dtype, device): shear=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["shear"], center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], ) - @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) + @pytest.mark.parametrize("format", list(datapoints.BBoxFormat)) @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) @pytest.mark.parametrize("device", cpu_and_cuda()) - def 
test_kernel_bounding_boxes(self, param, value, format, dtype, device): - bounding_boxes = make_bounding_box(format=format, dtype=dtype, device=device) + def test_kernel_bboxes(self, param, value, format, dtype, device): + bboxes = make_bbox(format=format, dtype=dtype, device=device) self._check_kernel( - F.affine_bounding_boxes, - bounding_boxes, + F.affine_bboxes, + bboxes, format=format, - spatial_size=bounding_boxes.spatial_size, + spatial_size=bboxes.spatial_size, **{param: value}, check_scripted_vs_eager=not (param == "shear" and isinstance(value, (int, float))), ) @@ -1013,7 +1011,7 @@ def test_kernel_video(self): (F.affine_image_tensor, make_image_tensor), (F.affine_image_pil, make_image_pil), (F.affine_image_tensor, make_image), - (F.affine_bounding_boxes, make_bounding_box), + (F.affine_bboxes, make_bbox), (F.affine_mask, make_segmentation_mask), (F.affine_video, make_video), ], @@ -1027,7 +1025,7 @@ def test_dispatcher(self, kernel, make_input): (F.affine_image_tensor, torch.Tensor), (F.affine_image_pil, PIL.Image.Image), (F.affine_image_tensor, datapoints.Image), - (F.affine_bounding_boxes, datapoints.BoundingBoxes), + (F.affine_bboxes, datapoints.BBoxes), (F.affine_mask, datapoints.Mask), (F.affine_video, datapoints.Video), ], @@ -1037,7 +1035,7 @@ def test_dispatcher_signature(self, kernel, input_type): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bbox, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_transform(self, make_input, device): @@ -1131,43 +1129,43 @@ def _compute_affine_matrix(self, *, angle, translate, scale, shear, center): true_matrix = np.matmul(t_matrix, np.matmul(c_matrix, np.matmul(rss_matrix, c_matrix_inv))) return true_matrix - def _reference_affine_bounding_boxes(self, bounding_boxes, *, angle, translate, scale, shear, center): + def _reference_affine_bboxes(self, bboxes, *, angle, translate, scale, shear, center): if center is None: - center = [s * 0.5 for s in bounding_boxes.spatial_size[::-1]] + center = [s * 0.5 for s in bboxes.spatial_size[::-1]] affine_matrix = self._compute_affine_matrix( angle=angle, translate=translate, scale=scale, shear=shear, center=center ) affine_matrix = affine_matrix[:2, :] - expected_bboxes = reference_affine_bounding_boxes_helper( - bounding_boxes, - format=bounding_boxes.format, - spatial_size=bounding_boxes.spatial_size, + expected_bboxes = reference_affine_bboxes_helper( + bboxes, + format=bboxes.format, + spatial_size=bboxes.spatial_size, affine_matrix=affine_matrix, ) return expected_bboxes - @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) + @pytest.mark.parametrize("format", list(datapoints.BBoxFormat)) @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) @pytest.mark.parametrize("translate", _CORRECTNESS_AFFINE_KWARGS["translate"]) @pytest.mark.parametrize("scale", _CORRECTNESS_AFFINE_KWARGS["scale"]) @pytest.mark.parametrize("shear", _CORRECTNESS_AFFINE_KWARGS["shear"]) @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) - def test_functional_bounding_boxes_correctness(self, format, angle, translate, scale, shear, center): - bounding_boxes = make_bounding_box(format=format) + def test_functional_bboxes_correctness(self, format, angle, translate, scale, shear, center): + bboxes = make_bbox(format=format) actual = F.affine( - bounding_boxes, + bboxes, 
angle=angle, translate=translate, scale=scale, shear=shear, center=center, ) - expected = self._reference_affine_bounding_boxes( - bounding_boxes, + expected = self._reference_affine_bboxes( + bboxes, angle=angle, translate=translate, scale=scale, @@ -1177,21 +1175,21 @@ def test_functional_bounding_boxes_correctness(self, format, angle, translate, s torch.testing.assert_close(actual, expected) - @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) + @pytest.mark.parametrize("format", list(datapoints.BBoxFormat)) @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) @pytest.mark.parametrize("seed", list(range(5))) - def test_transform_bounding_boxes_correctness(self, format, center, seed): - bounding_boxes = make_bounding_box(format=format) + def test_transform_bboxes_correctness(self, format, center, seed): + bboxes = make_bbox(format=format) transform = transforms.RandomAffine(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, center=center) torch.manual_seed(seed) - params = transform._get_params([bounding_boxes]) + params = transform._get_params([bboxes]) torch.manual_seed(seed) - actual = transform(bounding_boxes) + actual = transform(bboxes) - expected = self._reference_affine_bounding_boxes(bounding_boxes, **params, center=center) + expected = self._reference_affine_bboxes(bboxes, **params, center=center) torch.testing.assert_close(actual, expected) @@ -1284,16 +1282,16 @@ class TestVerticalFlip: def test_kernel_image_tensor(self, dtype, device): check_kernel(F.vertical_flip_image_tensor, make_image(dtype=dtype, device=device)) - @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) + @pytest.mark.parametrize("format", list(datapoints.BBoxFormat)) @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_kernel_bounding_boxes(self, format, dtype, device): - bounding_boxes = make_bounding_box(format=format, dtype=dtype, device=device) + def test_kernel_bboxes(self, format, dtype, device): + bboxes = make_bbox(format=format, dtype=dtype, device=device) check_kernel( - F.vertical_flip_bounding_boxes, - bounding_boxes, + F.vertical_flip_bboxes, + bboxes, format=format, - spatial_size=bounding_boxes.spatial_size, + spatial_size=bboxes.spatial_size, ) @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) @@ -1309,7 +1307,7 @@ def test_kernel_video(self): (F.vertical_flip_image_tensor, make_image_tensor), (F.vertical_flip_image_pil, make_image_pil), (F.vertical_flip_image_tensor, make_image), - (F.vertical_flip_bounding_boxes, make_bounding_box), + (F.vertical_flip_bboxes, make_bbox), (F.vertical_flip_mask, make_segmentation_mask), (F.vertical_flip_video, make_video), ], @@ -1323,7 +1321,7 @@ def test_dispatcher(self, kernel, make_input): (F.vertical_flip_image_tensor, torch.Tensor), (F.vertical_flip_image_pil, PIL.Image.Image), (F.vertical_flip_image_tensor, datapoints.Image), - (F.vertical_flip_bounding_boxes, datapoints.BoundingBoxes), + (F.vertical_flip_bboxes, datapoints.BBoxes), (F.vertical_flip_mask, datapoints.Mask), (F.vertical_flip_video, datapoints.Video), ], @@ -1333,7 +1331,7 @@ def test_dispatcher_signature(self, kernel, input_type): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bbox, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("device", cpu_and_cuda()) def 
test_transform(self, make_input, device): @@ -1348,37 +1346,37 @@ def test_image_correctness(self, fn): torch.testing.assert_close(actual, expected) - def _reference_vertical_flip_bounding_boxes(self, bounding_boxes): + def _reference_vertical_flip_bboxes(self, bboxes): affine_matrix = np.array( [ [1, 0, 0], - [0, -1, bounding_boxes.spatial_size[0]], + [0, -1, bboxes.spatial_size[0]], ], - dtype="float64" if bounding_boxes.dtype == torch.float64 else "float32", + dtype="float64" if bboxes.dtype == torch.float64 else "float32", ) - expected_bboxes = reference_affine_bounding_boxes_helper( - bounding_boxes, - format=bounding_boxes.format, - spatial_size=bounding_boxes.spatial_size, + expected_bboxes = reference_affine_bboxes_helper( + bboxes, + format=bboxes.format, + spatial_size=bboxes.spatial_size, affine_matrix=affine_matrix, ) - return datapoints.BoundingBoxes.wrap_like(bounding_boxes, expected_bboxes) + return datapoints.BBoxes.wrap_like(bboxes, expected_bboxes) - @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) + @pytest.mark.parametrize("format", list(datapoints.BBoxFormat)) @pytest.mark.parametrize("fn", [F.vertical_flip, transform_cls_to_functional(transforms.RandomVerticalFlip, p=1)]) - def test_bounding_boxes_correctness(self, format, fn): - bounding_boxes = make_bounding_box(format=format) + def test_bboxes_correctness(self, format, fn): + bboxes = make_bbox(format=format) - actual = fn(bounding_boxes) - expected = self._reference_vertical_flip_bounding_boxes(bounding_boxes) + actual = fn(bboxes) + expected = self._reference_vertical_flip_bboxes(bboxes) torch.testing.assert_close(actual, expected) @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bbox, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_transform_noop(self, make_input, device): @@ -1435,21 +1433,21 @@ def test_kernel_image_tensor(self, param, value, dtype, device): expand=[False, True], center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], ) - @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) + @pytest.mark.parametrize("format", list(datapoints.BBoxFormat)) @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_kernel_bounding_boxes(self, param, value, format, dtype, device): + def test_kernel_bboxes(self, param, value, format, dtype, device): kwargs = {param: value} if param != "angle": kwargs["angle"] = self._MINIMAL_AFFINE_KWARGS["angle"] - bounding_boxes = make_bounding_box(format=format, dtype=dtype, device=device) + bboxes = make_bbox(format=format, dtype=dtype, device=device) check_kernel( - F.rotate_bounding_boxes, - bounding_boxes, + F.rotate_bboxes, + bboxes, format=format, - spatial_size=bounding_boxes.spatial_size, + spatial_size=bboxes.spatial_size, **kwargs, ) @@ -1466,7 +1464,7 @@ def test_kernel_video(self): (F.rotate_image_tensor, make_image_tensor), (F.rotate_image_pil, make_image_pil), (F.rotate_image_tensor, make_image), - (F.rotate_bounding_boxes, make_bounding_box), + (F.rotate_bboxes, make_bbox), (F.rotate_mask, make_segmentation_mask), (F.rotate_video, make_video), ], @@ -1480,7 +1478,7 @@ def test_dispatcher(self, kernel, make_input): (F.rotate_image_tensor, torch.Tensor), (F.rotate_image_pil, PIL.Image.Image), (F.rotate_image_tensor, datapoints.Image), - (F.rotate_bounding_boxes, 
datapoints.BoundingBoxes), + (F.rotate_bboxes, datapoints.BBoxes), (F.rotate_mask, datapoints.Mask), (F.rotate_video, datapoints.Video), ], @@ -1490,7 +1488,7 @@ def test_dispatcher_signature(self, kernel, input_type): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bbox, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_transform(self, make_input, device): @@ -1549,13 +1547,13 @@ def test_transform_image_correctness(self, center, interpolation, expand, fill, mae = (actual.float() - expected.float()).abs().mean() assert mae < 1 if interpolation is transforms.InterpolationMode.NEAREST else 6 - def _reference_rotate_bounding_boxes(self, bounding_boxes, *, angle, expand, center): + def _reference_rotate_bboxes(self, bboxes, *, angle, expand, center): # FIXME if expand: raise ValueError("This reference currently does not support expand=True") if center is None: - center = [s * 0.5 for s in bounding_boxes.spatial_size[::-1]] + center = [s * 0.5 for s in bboxes.spatial_size[::-1]] a = np.cos(angle * np.pi / 180.0) b = np.sin(angle * np.pi / 180.0) @@ -1566,48 +1564,48 @@ def _reference_rotate_bounding_boxes(self, bounding_boxes, *, angle, expand, cen [a, b, cx - cx * a - b * cy], [-b, a, cy + cx * b - a * cy], ], - dtype="float64" if bounding_boxes.dtype == torch.float64 else "float32", + dtype="float64" if bboxes.dtype == torch.float64 else "float32", ) - expected_bboxes = reference_affine_bounding_boxes_helper( - bounding_boxes, - format=bounding_boxes.format, - spatial_size=bounding_boxes.spatial_size, + expected_bboxes = reference_affine_bboxes_helper( + bboxes, + format=bboxes.format, + spatial_size=bboxes.spatial_size, affine_matrix=affine_matrix, ) return expected_bboxes - @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) + @pytest.mark.parametrize("format", list(datapoints.BBoxFormat)) @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) # TODO: add support for expand=True in the reference @pytest.mark.parametrize("expand", [False]) @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) - def test_functional_bounding_boxes_correctness(self, format, angle, expand, center): - bounding_boxes = make_bounding_box(format=format) + def test_functional_bboxes_correctness(self, format, angle, expand, center): + bboxes = make_bbox(format=format) - actual = F.rotate(bounding_boxes, angle=angle, expand=expand, center=center) - expected = self._reference_rotate_bounding_boxes(bounding_boxes, angle=angle, expand=expand, center=center) + actual = F.rotate(bboxes, angle=angle, expand=expand, center=center) + expected = self._reference_rotate_bboxes(bboxes, angle=angle, expand=expand, center=center) torch.testing.assert_close(actual, expected) - @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) + @pytest.mark.parametrize("format", list(datapoints.BBoxFormat)) # TODO: add support for expand=True in the reference @pytest.mark.parametrize("expand", [False]) @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) @pytest.mark.parametrize("seed", list(range(5))) - def test_transform_bounding_boxes_correctness(self, format, expand, center, seed): - bounding_boxes = make_bounding_box(format=format) + def test_transform_bboxes_correctness(self, format, expand, center, seed): + bboxes = make_bbox(format=format) transform 
= transforms.RandomRotation(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, expand=expand, center=center) torch.manual_seed(seed) - params = transform._get_params([bounding_boxes]) + params = transform._get_params([bboxes]) torch.manual_seed(seed) - actual = transform(bounding_boxes) + actual = transform(bboxes) - expected = self._reference_rotate_bounding_boxes(bounding_boxes, **params, expand=expand, center=center) + expected = self._reference_rotate_bboxes(bboxes, **params, expand=expand, center=center) torch.testing.assert_close(actual, expected) @@ -1759,7 +1757,7 @@ def test_dispatcher(self, kernel, make_input, input_dtype, output_dtype, device, @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image, make_bbox, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) @@ -1834,7 +1832,7 @@ def make_inpt_with_bbox_and_mask(self, make_input): mask_dtype = torch.bool sample = { "inpt": make_input(size=(H, W), dtype=inpt_dtype), - "bbox": make_bounding_box(size=(H, W), dtype=bbox_dtype), + "bbox": make_bbox(size=(H, W), dtype=bbox_dtype), "mask": make_detection_mask(size=(H, W), dtype=mask_dtype), } @@ -1988,7 +1986,7 @@ def test_error(self, T): for input_with_bad_type in ( F.to_pil_image(imgs[0]), datapoints.Mask(torch.rand(12, 12)), - datapoints.BoundingBoxes(torch.rand(2, 4), format="XYXY", spatial_size=12), + datapoints.BBoxes(torch.rand(2, 4), format="XYXY", spatial_size=12), ): with pytest.raises(ValueError, match="does not support PIL images, "): cutmix_mixup(input_with_bad_type) diff --git a/test/test_transforms_v2_utils.py b/test/test_transforms_v2_utils.py index 58c8bfd5815..5bbc1172d49 100644 --- a/test/test_transforms_v2_utils.py +++ b/test/test_transforms_v2_utils.py @@ -4,7 +4,7 @@ import torch import torchvision.transforms.v2.utils -from common_utils import make_bounding_box, make_detection_mask, make_image +from common_utils import make_bbox, make_detection_mask, make_image from torchvision import datapoints from torchvision.transforms.v2.functional import to_image_pil @@ -12,7 +12,7 @@ IMAGE = make_image(color_space="RGB") -BOUNDING_BOX = make_bounding_box(format=datapoints.BoundingBoxFormat.XYXY, spatial_size=IMAGE.spatial_size) +BOUNDING_BOX = make_bbox(format=datapoints.BBoxFormat.XYXY, spatial_size=IMAGE.spatial_size) MASK = make_detection_mask(size=IMAGE.spatial_size) @@ -20,20 +20,20 @@ ("sample", "types", "expected"), [ ((IMAGE, BOUNDING_BOX, MASK), (datapoints.Image,), True), - ((IMAGE, BOUNDING_BOX, MASK), (datapoints.BoundingBoxes,), True), + ((IMAGE, BOUNDING_BOX, MASK), (datapoints.BBoxes,), True), ((IMAGE, BOUNDING_BOX, MASK), (datapoints.Mask,), True), - ((IMAGE, BOUNDING_BOX, MASK), (datapoints.Image, datapoints.BoundingBoxes), True), + ((IMAGE, BOUNDING_BOX, MASK), (datapoints.Image, datapoints.BBoxes), True), ((IMAGE, BOUNDING_BOX, MASK), (datapoints.Image, datapoints.Mask), True), - ((IMAGE, BOUNDING_BOX, MASK), (datapoints.BoundingBoxes, datapoints.Mask), True), - ((MASK,), (datapoints.Image, datapoints.BoundingBoxes), False), + ((IMAGE, BOUNDING_BOX, MASK), (datapoints.BBoxes, datapoints.Mask), True), + ((MASK,), (datapoints.Image, datapoints.BBoxes), False), ((BOUNDING_BOX,), (datapoints.Image, datapoints.Mask), False), - ((IMAGE,), (datapoints.BoundingBoxes, datapoints.Mask), False), + ((IMAGE,), 
(datapoints.BBoxes, datapoints.Mask), False), ( (IMAGE, BOUNDING_BOX, MASK), - (datapoints.Image, datapoints.BoundingBoxes, datapoints.Mask), + (datapoints.Image, datapoints.BBoxes, datapoints.Mask), True, ), - ((), (datapoints.Image, datapoints.BoundingBoxes, datapoints.Mask), False), + ((), (datapoints.Image, datapoints.BBoxes, datapoints.Mask), False), ((IMAGE, BOUNDING_BOX, MASK), (lambda obj: isinstance(obj, datapoints.Image),), True), ((IMAGE, BOUNDING_BOX, MASK), (lambda _: False,), False), ((IMAGE, BOUNDING_BOX, MASK), (lambda _: True,), True), @@ -58,30 +58,30 @@ def test_has_any(sample, types, expected): ("sample", "types", "expected"), [ ((IMAGE, BOUNDING_BOX, MASK), (datapoints.Image,), True), - ((IMAGE, BOUNDING_BOX, MASK), (datapoints.BoundingBoxes,), True), + ((IMAGE, BOUNDING_BOX, MASK), (datapoints.BBoxes,), True), ((IMAGE, BOUNDING_BOX, MASK), (datapoints.Mask,), True), - ((IMAGE, BOUNDING_BOX, MASK), (datapoints.Image, datapoints.BoundingBoxes), True), + ((IMAGE, BOUNDING_BOX, MASK), (datapoints.Image, datapoints.BBoxes), True), ((IMAGE, BOUNDING_BOX, MASK), (datapoints.Image, datapoints.Mask), True), - ((IMAGE, BOUNDING_BOX, MASK), (datapoints.BoundingBoxes, datapoints.Mask), True), + ((IMAGE, BOUNDING_BOX, MASK), (datapoints.BBoxes, datapoints.Mask), True), ( (IMAGE, BOUNDING_BOX, MASK), - (datapoints.Image, datapoints.BoundingBoxes, datapoints.Mask), + (datapoints.Image, datapoints.BBoxes, datapoints.Mask), True, ), - ((BOUNDING_BOX, MASK), (datapoints.Image, datapoints.BoundingBoxes), False), + ((BOUNDING_BOX, MASK), (datapoints.Image, datapoints.BBoxes), False), ((BOUNDING_BOX, MASK), (datapoints.Image, datapoints.Mask), False), - ((IMAGE, MASK), (datapoints.BoundingBoxes, datapoints.Mask), False), + ((IMAGE, MASK), (datapoints.BBoxes, datapoints.Mask), False), ( (IMAGE, BOUNDING_BOX, MASK), - (datapoints.Image, datapoints.BoundingBoxes, datapoints.Mask), + (datapoints.Image, datapoints.BBoxes, datapoints.Mask), True, ), - ((BOUNDING_BOX, MASK), (datapoints.Image, datapoints.BoundingBoxes, datapoints.Mask), False), - ((IMAGE, MASK), (datapoints.Image, datapoints.BoundingBoxes, datapoints.Mask), False), - ((IMAGE, BOUNDING_BOX), (datapoints.Image, datapoints.BoundingBoxes, datapoints.Mask), False), + ((BOUNDING_BOX, MASK), (datapoints.Image, datapoints.BBoxes, datapoints.Mask), False), + ((IMAGE, MASK), (datapoints.Image, datapoints.BBoxes, datapoints.Mask), False), + ((IMAGE, BOUNDING_BOX), (datapoints.Image, datapoints.BBoxes, datapoints.Mask), False), ( (IMAGE, BOUNDING_BOX, MASK), - (lambda obj: isinstance(obj, (datapoints.Image, datapoints.BoundingBoxes, datapoints.Mask)),), + (lambda obj: isinstance(obj, (datapoints.Image, datapoints.BBoxes, datapoints.Mask)),), True, ), ((IMAGE, BOUNDING_BOX, MASK), (lambda _: False,), False), diff --git a/test/test_utils.py b/test/test_utils.py index b13bd0f0f5b..f3c58180be0 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -98,7 +98,7 @@ def test_draw_boxes(): boxes_cp = boxes.clone() labels = ["a", "b", "c", "d"] colors = ["green", "#FF00FF", (0, 255, 0), "red"] - result = utils.draw_bounding_boxes(img, boxes, labels=labels, colors=colors, fill=True) + result = utils.draw_bboxes(img, boxes, labels=labels, colors=colors, fill=True) path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "fakedata", "draw_boxes_util.png") if not os.path.exists(path): @@ -118,17 +118,17 @@ def test_draw_boxes(): @pytest.mark.parametrize("colors", [None, ["red", "blue", "#FF00FF", (1, 34, 122)], "red", "#FF00FF", (1, 
34, 122)]) def test_draw_boxes_colors(colors): img = torch.full((3, 100, 100), 0, dtype=torch.uint8) - utils.draw_bounding_boxes(img, boxes, fill=False, width=7, colors=colors) + utils.draw_bboxes(img, boxes, fill=False, width=7, colors=colors) with pytest.raises(ValueError, match="Number of colors must be equal or larger than the number of objects"): - utils.draw_bounding_boxes(image=img, boxes=boxes, colors=[]) + utils.draw_bboxes(image=img, boxes=boxes, colors=[]) def test_draw_boxes_vanilla(): img = torch.full((3, 100, 100), 0, dtype=torch.uint8) img_cp = img.clone() boxes_cp = boxes.clone() - result = utils.draw_bounding_boxes(img, boxes, fill=False, width=7, colors="white") + result = utils.draw_bboxes(img, boxes, fill=False, width=7, colors="white") path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "fakedata", "draw_boxes_vanilla.png") if not os.path.exists(path): @@ -145,7 +145,7 @@ def test_draw_boxes_vanilla(): def test_draw_boxes_grayscale(): img = torch.full((1, 4, 4), fill_value=255, dtype=torch.uint8) boxes = torch.tensor([[0, 0, 3, 3]], dtype=torch.int64) - bboxed_img = utils.draw_bounding_boxes(image=img, boxes=boxes, colors=["#1BBC9B"]) + bboxed_img = utils.draw_bboxes(image=img, boxes=boxes, colors=["#1BBC9B"]) assert bboxed_img.size(0) == 3 @@ -160,33 +160,33 @@ def test_draw_invalid_boxes(): colors_wrong = ["pink", "blue"] with pytest.raises(TypeError, match="Tensor expected"): - utils.draw_bounding_boxes(img_tp, boxes) + utils.draw_bboxes(img_tp, boxes) with pytest.raises(ValueError, match="Tensor uint8 expected"): - utils.draw_bounding_boxes(img_wrong1, boxes) + utils.draw_bboxes(img_wrong1, boxes) with pytest.raises(ValueError, match="Pass individual images, not batches"): - utils.draw_bounding_boxes(img_wrong2, boxes) + utils.draw_bboxes(img_wrong2, boxes) with pytest.raises(ValueError, match="Only grayscale and RGB images are supported"): - utils.draw_bounding_boxes(img_wrong2[0][:2], boxes) + utils.draw_bboxes(img_wrong2[0][:2], boxes) with pytest.raises(ValueError, match="Number of boxes"): - utils.draw_bounding_boxes(img_correct, boxes, labels_wrong) + utils.draw_bboxes(img_correct, boxes, labels_wrong) with pytest.raises(ValueError, match="Number of colors"): - utils.draw_bounding_boxes(img_correct, boxes, colors=colors_wrong) + utils.draw_bboxes(img_correct, boxes, colors=colors_wrong) with pytest.raises(ValueError, match="Boxes need to be in"): - utils.draw_bounding_boxes(img_correct, boxes_wrong) + utils.draw_bboxes(img_correct, boxes_wrong) def test_draw_boxes_warning(): img = torch.full((3, 100, 100), 255, dtype=torch.uint8) with pytest.warns(UserWarning, match=re.escape("Argument 'font_size' will be ignored since 'font' is not set.")): - utils.draw_bounding_boxes(img, boxes, font_size=11) + utils.draw_bboxes(img, boxes, font_size=11) def test_draw_no_boxes(): img = torch.full((3, 100, 100), 0, dtype=torch.uint8) boxes = torch.full((0, 4), 0, dtype=torch.float) with pytest.warns(UserWarning, match=re.escape("boxes doesn't contain any box. 
No box was drawn")): - res = utils.draw_bounding_boxes(img, boxes) + res = utils.draw_bboxes(img, boxes) # Check that the function didn't change the image assert res.eq(img).all() diff --git a/test/transforms_v2_dispatcher_infos.py b/test/transforms_v2_dispatcher_infos.py index 239954dda68..0223cbf1fdb 100644 --- a/test/transforms_v2_dispatcher_infos.py +++ b/test/transforms_v2_dispatcher_infos.py @@ -143,7 +143,7 @@ def fill_sequence_needs_broadcast(args_kwargs): kernels={ datapoints.Image: F.crop_image_tensor, datapoints.Video: F.crop_video, - datapoints.BoundingBoxes: F.crop_bounding_boxes, + datapoints.BBoxes: F.crop_bboxes, datapoints.Mask: F.crop_mask, }, pil_kernel_info=PILKernelInfo(F.crop_image_pil, kernel_name="crop_image_pil"), @@ -153,7 +153,7 @@ def fill_sequence_needs_broadcast(args_kwargs): kernels={ datapoints.Image: F.resized_crop_image_tensor, datapoints.Video: F.resized_crop_video, - datapoints.BoundingBoxes: F.resized_crop_bounding_boxes, + datapoints.BBoxes: F.resized_crop_bboxes, datapoints.Mask: F.resized_crop_mask, }, pil_kernel_info=PILKernelInfo(F.resized_crop_image_pil), @@ -163,7 +163,7 @@ def fill_sequence_needs_broadcast(args_kwargs): kernels={ datapoints.Image: F.pad_image_tensor, datapoints.Video: F.pad_video, - datapoints.BoundingBoxes: F.pad_bounding_boxes, + datapoints.BBoxes: F.pad_bboxes, datapoints.Mask: F.pad_mask, }, pil_kernel_info=PILKernelInfo(F.pad_image_pil, kernel_name="pad_image_pil"), @@ -185,7 +185,7 @@ def fill_sequence_needs_broadcast(args_kwargs): kernels={ datapoints.Image: F.perspective_image_tensor, datapoints.Video: F.perspective_video, - datapoints.BoundingBoxes: F.perspective_bounding_boxes, + datapoints.BBoxes: F.perspective_bboxes, datapoints.Mask: F.perspective_mask, }, pil_kernel_info=PILKernelInfo(F.perspective_image_pil), @@ -199,7 +199,7 @@ def fill_sequence_needs_broadcast(args_kwargs): kernels={ datapoints.Image: F.elastic_image_tensor, datapoints.Video: F.elastic_video, - datapoints.BoundingBoxes: F.elastic_bounding_boxes, + datapoints.BBoxes: F.elastic_bboxes, datapoints.Mask: F.elastic_mask, }, pil_kernel_info=PILKernelInfo(F.elastic_image_pil), @@ -210,7 +210,7 @@ def fill_sequence_needs_broadcast(args_kwargs): kernels={ datapoints.Image: F.center_crop_image_tensor, datapoints.Video: F.center_crop_video, - datapoints.BoundingBoxes: F.center_crop_bounding_boxes, + datapoints.BBoxes: F.center_crop_bboxes, datapoints.Mask: F.center_crop_mask, }, pil_kernel_info=PILKernelInfo(F.center_crop_image_pil), @@ -374,15 +374,15 @@ def fill_sequence_needs_broadcast(args_kwargs): ], ), DispatcherInfo( - F.clamp_bounding_boxes, - kernels={datapoints.BoundingBoxes: F.clamp_bounding_boxes}, + F.clamp_bboxes, + kernels={datapoints.BBoxes: F.clamp_bboxes}, test_marks=[ skip_dispatch_datapoint, ], ), DispatcherInfo( - F.convert_format_bounding_boxes, - kernels={datapoints.BoundingBoxes: F.convert_format_bounding_boxes}, + F.convert_format_bboxes, + kernels={datapoints.BBoxes: F.convert_format_bboxes}, test_marks=[ skip_dispatch_datapoint, ], diff --git a/test/transforms_v2_kernel_infos.py b/test/transforms_v2_kernel_infos.py index 6f1c91ac62a..bf87364ec82 100644 --- a/test/transforms_v2_kernel_infos.py +++ b/test/transforms_v2_kernel_infos.py @@ -14,8 +14,8 @@ get_num_channels, ImageLoader, InfoBase, - make_bounding_box_loader, - make_bounding_box_loaders, + make_bbox_loader, + make_bbox_loaders, make_detection_mask_loader, make_image_loader, make_image_loaders, @@ -184,16 +184,16 @@ def float32_vs_uint8_fill_adapter(other_args, kwargs): 
return other_args, dict(kwargs, fill=fill) -def reference_affine_bounding_boxes_helper(bounding_boxes, *, format, spatial_size, affine_matrix): +def reference_affine_bboxes_helper(bboxes, *, format, spatial_size, affine_matrix): def transform(bbox, affine_matrix_, format_, spatial_size_): # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1 in_dtype = bbox.dtype if not torch.is_floating_point(bbox): bbox = bbox.float() - bbox_xyxy = F.convert_format_bounding_boxes( + bbox_xyxy = F.convert_format_bboxes( bbox.as_subclass(torch.Tensor), old_format=format_, - new_format=datapoints.BoundingBoxFormat.XYXY, + new_format=datapoints.BBoxFormat.XYXY, inplace=True, ) points = np.array( @@ -214,18 +214,18 @@ def transform(bbox, affine_matrix_, format_, spatial_size_): ], dtype=bbox_xyxy.dtype, ) - out_bbox = F.convert_format_bounding_boxes( - out_bbox, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format_, inplace=True + out_bbox = F.convert_format_bboxes( + out_bbox, old_format=datapoints.BBoxFormat.XYXY, new_format=format_, inplace=True ) # It is important to clamp before casting, especially for CXCYWH format, dtype=int64 - out_bbox = F.clamp_bounding_boxes(out_bbox, format=format_, spatial_size=spatial_size_) + out_bbox = F.clamp_bboxes(out_bbox, format=format_, spatial_size=spatial_size_) out_bbox = out_bbox.to(dtype=in_dtype) return out_bbox - if bounding_boxes.ndim < 2: - bounding_boxes = [bounding_boxes] + if bboxes.ndim < 2: + bboxes = [bboxes] - expected_bboxes = [transform(bbox, affine_matrix, format, spatial_size) for bbox in bounding_boxes] + expected_bboxes = [transform(bbox, affine_matrix, format, spatial_size) for bbox in bboxes] if len(expected_bboxes) > 1: expected_bboxes = torch.stack(expected_bboxes) else: @@ -234,30 +234,30 @@ def transform(bbox, affine_matrix_, format_, spatial_size_): return expected_bboxes -def sample_inputs_convert_format_bounding_boxes(): - formats = list(datapoints.BoundingBoxFormat) - for bounding_boxes_loader, new_format in itertools.product(make_bounding_box_loaders(formats=formats), formats): - yield ArgsKwargs(bounding_boxes_loader, old_format=bounding_boxes_loader.format, new_format=new_format) +def sample_inputs_convert_format_bboxes(): + formats = list(datapoints.BBoxFormat) + for bboxes_loader, new_format in itertools.product(make_bbox_loaders(formats=formats), formats): + yield ArgsKwargs(bboxes_loader, old_format=bboxes_loader.format, new_format=new_format) -def reference_convert_format_bounding_boxes(bounding_boxes, old_format, new_format): - return torchvision.ops.box_convert( - bounding_boxes, in_fmt=old_format.name.lower(), out_fmt=new_format.name.lower() - ).to(bounding_boxes.dtype) +def reference_convert_format_bboxes(bboxes, old_format, new_format): + return torchvision.ops.box_convert(bboxes, in_fmt=old_format.name.lower(), out_fmt=new_format.name.lower()).to( + bboxes.dtype + ) -def reference_inputs_convert_format_bounding_boxes(): - for args_kwargs in sample_inputs_convert_format_bounding_boxes(): +def reference_inputs_convert_format_bboxes(): + for args_kwargs in sample_inputs_convert_format_bboxes(): if len(args_kwargs.args[0].shape) == 2: yield args_kwargs KERNEL_INFOS.append( KernelInfo( - F.convert_format_bounding_boxes, - sample_inputs_fn=sample_inputs_convert_format_bounding_boxes, - reference_fn=reference_convert_format_bounding_boxes, - reference_inputs_fn=reference_inputs_convert_format_bounding_boxes, + F.convert_format_bboxes, + 
sample_inputs_fn=sample_inputs_convert_format_bboxes, + reference_fn=reference_convert_format_bboxes, + reference_inputs_fn=reference_inputs_convert_format_bboxes, logs_usage=True, closeness_kwargs={ (("TestKernels", "test_against_reference"), torch.int64, "cpu"): dict(atol=1, rtol=0), @@ -290,11 +290,9 @@ def reference_inputs_crop_image_tensor(): yield ArgsKwargs(image_loader, **params) -def sample_inputs_crop_bounding_boxes(): - for bounding_boxes_loader, params in itertools.product( - make_bounding_box_loaders(), [_CROP_PARAMS[0], _CROP_PARAMS[-1]] - ): - yield ArgsKwargs(bounding_boxes_loader, format=bounding_boxes_loader.format, **params) +def sample_inputs_crop_bboxes(): + for bboxes_loader, params in itertools.product(make_bbox_loaders(), [_CROP_PARAMS[0], _CROP_PARAMS[-1]]): + yield ArgsKwargs(bboxes_loader, format=bboxes_loader.format, **params) def sample_inputs_crop_mask(): @@ -312,27 +310,27 @@ def sample_inputs_crop_video(): yield ArgsKwargs(video_loader, top=4, left=3, height=7, width=8) -def reference_crop_bounding_boxes(bounding_boxes, *, format, top, left, height, width): +def reference_crop_bboxes(bboxes, *, format, top, left, height, width): affine_matrix = np.array( [ [1, 0, -left], [0, 1, -top], ], - dtype="float64" if bounding_boxes.dtype == torch.float64 else "float32", + dtype="float64" if bboxes.dtype == torch.float64 else "float32", ) spatial_size = (height, width) - expected_bboxes = reference_affine_bounding_boxes_helper( - bounding_boxes, format=format, spatial_size=spatial_size, affine_matrix=affine_matrix + expected_bboxes = reference_affine_bboxes_helper( + bboxes, format=format, spatial_size=spatial_size, affine_matrix=affine_matrix ) return expected_bboxes, spatial_size -def reference_inputs_crop_bounding_boxes(): - for bounding_boxes_loader, params in itertools.product( - make_bounding_box_loaders(extra_dims=((), (4,))), [_CROP_PARAMS[0], _CROP_PARAMS[-1]] +def reference_inputs_crop_bboxes(): + for bboxes_loader, params in itertools.product( + make_bbox_loaders(extra_dims=((), (4,))), [_CROP_PARAMS[0], _CROP_PARAMS[-1]] ): - yield ArgsKwargs(bounding_boxes_loader, format=bounding_boxes_loader.format, **params) + yield ArgsKwargs(bboxes_loader, format=bboxes_loader.format, **params) KERNEL_INFOS.extend( @@ -346,10 +344,10 @@ def reference_inputs_crop_bounding_boxes(): float32_vs_uint8=True, ), KernelInfo( - F.crop_bounding_boxes, - sample_inputs_fn=sample_inputs_crop_bounding_boxes, - reference_fn=reference_crop_bounding_boxes, - reference_inputs_fn=reference_inputs_crop_bounding_boxes, + F.crop_bboxes, + sample_inputs_fn=sample_inputs_crop_bboxes, + reference_fn=reference_crop_bboxes, + reference_inputs_fn=reference_inputs_crop_bboxes, ), KernelInfo( F.crop_mask, @@ -406,9 +404,9 @@ def reference_inputs_resized_crop_image_tensor(): ) -def sample_inputs_resized_crop_bounding_boxes(): - for bounding_boxes_loader in make_bounding_box_loaders(): - yield ArgsKwargs(bounding_boxes_loader, format=bounding_boxes_loader.format, **_RESIZED_CROP_PARAMS[0]) +def sample_inputs_resized_crop_bboxes(): + for bboxes_loader in make_bbox_loaders(): + yield ArgsKwargs(bboxes_loader, format=bboxes_loader.format, **_RESIZED_CROP_PARAMS[0]) def sample_inputs_resized_crop_mask(): @@ -436,8 +434,8 @@ def sample_inputs_resized_crop_video(): }, ), KernelInfo( - F.resized_crop_bounding_boxes, - sample_inputs_fn=sample_inputs_resized_crop_bounding_boxes, + F.resized_crop_bboxes, + sample_inputs_fn=sample_inputs_resized_crop_bboxes, ), KernelInfo( F.resized_crop_mask, @@ -500,14 
+498,14 @@ def reference_inputs_pad_image_tensor(): yield ArgsKwargs(image_loader, fill=fill, **params) -def sample_inputs_pad_bounding_boxes(): - for bounding_boxes_loader, padding in itertools.product( - make_bounding_box_loaders(), [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]] +def sample_inputs_pad_bboxes(): + for bboxes_loader, padding in itertools.product( + make_bbox_loaders(), [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]] ): yield ArgsKwargs( - bounding_boxes_loader, - format=bounding_boxes_loader.format, - spatial_size=bounding_boxes_loader.spatial_size, + bboxes_loader, + format=bboxes_loader.format, + spatial_size=bboxes_loader.spatial_size, padding=padding, padding_mode="constant", ) @@ -530,7 +528,7 @@ def sample_inputs_pad_video(): yield ArgsKwargs(video_loader, padding=[1]) -def reference_pad_bounding_boxes(bounding_boxes, *, format, spatial_size, padding, padding_mode): +def reference_pad_bboxes(bboxes, *, format, spatial_size, padding, padding_mode): left, right, top, bottom = _parse_pad_padding(padding) @@ -539,26 +537,26 @@ def reference_pad_bounding_boxes(bounding_boxes, *, format, spatial_size, paddin [1, 0, left], [0, 1, top], ], - dtype="float64" if bounding_boxes.dtype == torch.float64 else "float32", + dtype="float64" if bboxes.dtype == torch.float64 else "float32", ) height = spatial_size[0] + top + bottom width = spatial_size[1] + left + right - expected_bboxes = reference_affine_bounding_boxes_helper( - bounding_boxes, format=format, spatial_size=(height, width), affine_matrix=affine_matrix + expected_bboxes = reference_affine_bboxes_helper( + bboxes, format=format, spatial_size=(height, width), affine_matrix=affine_matrix ) return expected_bboxes, (height, width) -def reference_inputs_pad_bounding_boxes(): - for bounding_boxes_loader, padding in itertools.product( - make_bounding_box_loaders(extra_dims=((), (4,))), [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]] +def reference_inputs_pad_bboxes(): + for bboxes_loader, padding in itertools.product( + make_bbox_loaders(extra_dims=((), (4,))), [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]] ): yield ArgsKwargs( - bounding_boxes_loader, - format=bounding_boxes_loader.format, - spatial_size=bounding_boxes_loader.spatial_size, + bboxes_loader, + format=bboxes_loader.format, + spatial_size=bboxes_loader.spatial_size, padding=padding, padding_mode="constant", ) @@ -591,10 +589,10 @@ def pad_xfail_jit_fill_condition(args_kwargs): ], ), KernelInfo( - F.pad_bounding_boxes, - sample_inputs_fn=sample_inputs_pad_bounding_boxes, - reference_fn=reference_pad_bounding_boxes, - reference_inputs_fn=reference_inputs_pad_bounding_boxes, + F.pad_bboxes, + sample_inputs_fn=sample_inputs_pad_bboxes, + reference_fn=reference_pad_bboxes, + reference_inputs_fn=reference_inputs_pad_bboxes, test_marks=[ xfail_jit_python_scalar_arg("padding"), ], @@ -655,19 +653,19 @@ def reference_inputs_perspective_image_tensor(): ) -def sample_inputs_perspective_bounding_boxes(): - for bounding_boxes_loader in make_bounding_box_loaders(): +def sample_inputs_perspective_bboxes(): + for bboxes_loader in make_bbox_loaders(): yield ArgsKwargs( - bounding_boxes_loader, - format=bounding_boxes_loader.format, - spatial_size=bounding_boxes_loader.spatial_size, + bboxes_loader, + format=bboxes_loader.format, + spatial_size=bboxes_loader.spatial_size, startpoints=None, endpoints=None, coefficients=_PERSPECTIVE_COEFFS[0], ) - format = datapoints.BoundingBoxFormat.XYXY - loader = make_bounding_box_loader(format=format) 
+ format = datapoints.BBoxFormat.XYXY + loader = make_bbox_loader(format=format) yield ArgsKwargs( loader, format=format, spatial_size=loader.spatial_size, startpoints=_STARTPOINTS, endpoints=_ENDPOINTS ) @@ -712,8 +710,8 @@ def sample_inputs_perspective_video(): test_marks=[xfail_jit_python_scalar_arg("fill")], ), KernelInfo( - F.perspective_bounding_boxes, - sample_inputs_fn=sample_inputs_perspective_bounding_boxes, + F.perspective_bboxes, + sample_inputs_fn=sample_inputs_perspective_bboxes, closeness_kwargs={ **scripted_vs_eager_float64_tolerances("cpu", atol=1e-6, rtol=1e-6), **scripted_vs_eager_float64_tolerances("cuda", atol=1e-6, rtol=1e-6), @@ -767,13 +765,13 @@ def reference_inputs_elastic_image_tensor(): yield ArgsKwargs(image_loader, interpolation=interpolation, displacement=displacement, fill=fill) -def sample_inputs_elastic_bounding_boxes(): - for bounding_boxes_loader in make_bounding_box_loaders(): - displacement = _get_elastic_displacement(bounding_boxes_loader.spatial_size) +def sample_inputs_elastic_bboxes(): + for bboxes_loader in make_bbox_loaders(): + displacement = _get_elastic_displacement(bboxes_loader.spatial_size) yield ArgsKwargs( - bounding_boxes_loader, - format=bounding_boxes_loader.format, - spatial_size=bounding_boxes_loader.spatial_size, + bboxes_loader, + format=bboxes_loader.format, + spatial_size=bboxes_loader.spatial_size, displacement=displacement, ) @@ -804,8 +802,8 @@ def sample_inputs_elastic_video(): test_marks=[xfail_jit_python_scalar_arg("fill")], ), KernelInfo( - F.elastic_bounding_boxes, - sample_inputs_fn=sample_inputs_elastic_bounding_boxes, + F.elastic_bboxes, + sample_inputs_fn=sample_inputs_elastic_bboxes, ), KernelInfo( F.elastic_mask, @@ -845,12 +843,12 @@ def reference_inputs_center_crop_image_tensor(): yield ArgsKwargs(image_loader, output_size=output_size) -def sample_inputs_center_crop_bounding_boxes(): - for bounding_boxes_loader, output_size in itertools.product(make_bounding_box_loaders(), _CENTER_CROP_OUTPUT_SIZES): +def sample_inputs_center_crop_bboxes(): + for bboxes_loader, output_size in itertools.product(make_bbox_loaders(), _CENTER_CROP_OUTPUT_SIZES): yield ArgsKwargs( - bounding_boxes_loader, - format=bounding_boxes_loader.format, - spatial_size=bounding_boxes_loader.spatial_size, + bboxes_loader, + format=bboxes_loader.format, + spatial_size=bboxes_loader.spatial_size, output_size=output_size, ) @@ -887,8 +885,8 @@ def sample_inputs_center_crop_video(): ], ), KernelInfo( - F.center_crop_bounding_boxes, - sample_inputs_fn=sample_inputs_center_crop_bounding_boxes, + F.center_crop_bboxes, + sample_inputs_fn=sample_inputs_center_crop_bboxes, test_marks=[ xfail_jit_python_scalar_arg("output_size"), ], @@ -1482,19 +1480,19 @@ def sample_inputs_adjust_saturation_video(): ) -def sample_inputs_clamp_bounding_boxes(): - for bounding_boxes_loader in make_bounding_box_loaders(): +def sample_inputs_clamp_bboxes(): + for bboxes_loader in make_bbox_loaders(): yield ArgsKwargs( - bounding_boxes_loader, - format=bounding_boxes_loader.format, - spatial_size=bounding_boxes_loader.spatial_size, + bboxes_loader, + format=bboxes_loader.format, + spatial_size=bboxes_loader.spatial_size, ) KERNEL_INFOS.append( KernelInfo( - F.clamp_bounding_boxes, - sample_inputs_fn=sample_inputs_clamp_bounding_boxes, + F.clamp_bboxes, + sample_inputs_fn=sample_inputs_clamp_bboxes, logs_usage=True, ) ) diff --git a/torchvision/datapoints/__init__.py b/torchvision/datapoints/__init__.py index fb51f0497ea..ea7f802d3c5 100644 --- 
a/torchvision/datapoints/__init__.py +++ b/torchvision/datapoints/__init__.py @@ -1,6 +1,6 @@ from torchvision import _BETA_TRANSFORMS_WARNING, _WARN_ABOUT_BETA_TRANSFORMS -from ._bounding_box import BoundingBoxes, BoundingBoxFormat +from ._bbox import BBoxes, BBoxFormat from ._datapoint import _FillType, _FillTypeJIT, _InputType, _InputTypeJIT from ._image import _ImageType, _ImageTypeJIT, _TensorImageType, _TensorImageTypeJIT, Image from ._mask import Mask diff --git a/torchvision/datapoints/_bounding_box.py b/torchvision/datapoints/_bounding_box.py index b3dc46348bc..c7300eb18b4 100644 --- a/torchvision/datapoints/_bounding_box.py +++ b/torchvision/datapoints/_bounding_box.py @@ -9,7 +9,7 @@ from ._datapoint import _FillTypeJIT, Datapoint -class BoundingBoxFormat(Enum): +class BBoxFormat(Enum): """[BETA] Coordinate format of a bounding box. Available formats are @@ -24,12 +24,12 @@ class BoundingBoxFormat(Enum): CXCYWH = "CXCYWH" -class BoundingBoxes(Datapoint): +class BBoxes(Datapoint): """[BETA] :class:`torch.Tensor` subclass for bounding boxes. Args: data: Any data that can be turned into a tensor with :func:`torch.as_tensor`. - format (BoundingBoxFormat, str): Format of the bounding box. + format (BBoxFormat, str): Format of the bounding box. spatial_size (two-tuple of ints): Height and width of the corresponding image or video. dtype (torch.dtype, optional): Desired data type of the bounding box. If omitted, will be inferred from ``data``. @@ -39,55 +39,55 @@ class BoundingBoxes(Datapoint): ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``. """ - format: BoundingBoxFormat + format: BBoxFormat spatial_size: Tuple[int, int] @classmethod - def _wrap(cls, tensor: torch.Tensor, *, format: BoundingBoxFormat, spatial_size: Tuple[int, int]) -> BoundingBoxes: - bounding_boxes = tensor.as_subclass(cls) - bounding_boxes.format = format - bounding_boxes.spatial_size = spatial_size - return bounding_boxes + def _wrap(cls, tensor: torch.Tensor, *, format: BBoxFormat, spatial_size: Tuple[int, int]) -> BBoxes: + bboxes = tensor.as_subclass(cls) + bboxes.format = format + bboxes.spatial_size = spatial_size + return bboxes def __new__( cls, data: Any, *, - format: Union[BoundingBoxFormat, str], + format: Union[BBoxFormat, str], spatial_size: Tuple[int, int], dtype: Optional[torch.dtype] = None, device: Optional[Union[torch.device, str, int]] = None, requires_grad: Optional[bool] = None, - ) -> BoundingBoxes: + ) -> BBoxes: tensor = cls._to_tensor(data, dtype=dtype, device=device, requires_grad=requires_grad) if isinstance(format, str): - format = BoundingBoxFormat[format.upper()] + format = BBoxFormat[format.upper()] return cls._wrap(tensor, format=format, spatial_size=spatial_size) @classmethod def wrap_like( cls, - other: BoundingBoxes, + other: BBoxes, tensor: torch.Tensor, *, - format: Optional[BoundingBoxFormat] = None, + format: Optional[BBoxFormat] = None, spatial_size: Optional[Tuple[int, int]] = None, - ) -> BoundingBoxes: - """Wrap a :class:`torch.Tensor` as :class:`BoundingBoxes` from a reference. + ) -> BBoxes: + """Wrap a :class:`torch.Tensor` as :class:`BBoxes` from a reference. Args: - other (BoundingBoxes): Reference bounding box. - tensor (Tensor): Tensor to be wrapped as :class:`BoundingBoxes` - format (BoundingBoxFormat, str, optional): Format of the bounding box. If omitted, it is taken from the + other (BBoxes): Reference bounding box. 
+ tensor (Tensor): Tensor to be wrapped as :class:`BBoxes` + format (BBoxFormat, str, optional): Format of the bounding box. If omitted, it is taken from the reference. spatial_size (two-tuple of ints, optional): Height and width of the corresponding image or video. If omitted, it is taken from the reference. """ if isinstance(format, str): - format = BoundingBoxFormat[format.upper()] + format = BBoxFormat[format.upper()] return cls._wrap( tensor, @@ -98,17 +98,17 @@ def wrap_like( def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] return self._make_repr(format=self.format, spatial_size=self.spatial_size) - def horizontal_flip(self) -> BoundingBoxes: - output = self._F.horizontal_flip_bounding_boxes( + def horizontal_flip(self) -> BBoxes: + output = self._F.horizontal_flip_bboxes( self.as_subclass(torch.Tensor), format=self.format, spatial_size=self.spatial_size ) - return BoundingBoxes.wrap_like(self, output) + return BBoxes.wrap_like(self, output) - def vertical_flip(self) -> BoundingBoxes: - output = self._F.vertical_flip_bounding_boxes( + def vertical_flip(self) -> BBoxes: + output = self._F.vertical_flip_bboxes( self.as_subclass(torch.Tensor), format=self.format, spatial_size=self.spatial_size ) - return BoundingBoxes.wrap_like(self, output) + return BBoxes.wrap_like(self, output) def resize( # type: ignore[override] self, @@ -116,26 +116,26 @@ def resize( # type: ignore[override] interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, max_size: Optional[int] = None, antialias: Optional[Union[str, bool]] = "warn", - ) -> BoundingBoxes: - output, spatial_size = self._F.resize_bounding_boxes( + ) -> BBoxes: + output, spatial_size = self._F.resize_bboxes( self.as_subclass(torch.Tensor), spatial_size=self.spatial_size, size=size, max_size=max_size, ) - return BoundingBoxes.wrap_like(self, output, spatial_size=spatial_size) + return BBoxes.wrap_like(self, output, spatial_size=spatial_size) - def crop(self, top: int, left: int, height: int, width: int) -> BoundingBoxes: - output, spatial_size = self._F.crop_bounding_boxes( + def crop(self, top: int, left: int, height: int, width: int) -> BBoxes: + output, spatial_size = self._F.crop_bboxes( self.as_subclass(torch.Tensor), self.format, top=top, left=left, height=height, width=width ) - return BoundingBoxes.wrap_like(self, output, spatial_size=spatial_size) + return BBoxes.wrap_like(self, output, spatial_size=spatial_size) - def center_crop(self, output_size: List[int]) -> BoundingBoxes: - output, spatial_size = self._F.center_crop_bounding_boxes( + def center_crop(self, output_size: List[int]) -> BBoxes: + output, spatial_size = self._F.center_crop_bboxes( self.as_subclass(torch.Tensor), format=self.format, spatial_size=self.spatial_size, output_size=output_size ) - return BoundingBoxes.wrap_like(self, output, spatial_size=spatial_size) + return BBoxes.wrap_like(self, output, spatial_size=spatial_size) def resized_crop( self, @@ -146,26 +146,26 @@ def resized_crop( size: List[int], interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, antialias: Optional[Union[str, bool]] = "warn", - ) -> BoundingBoxes: - output, spatial_size = self._F.resized_crop_bounding_boxes( + ) -> BBoxes: + output, spatial_size = self._F.resized_crop_bboxes( self.as_subclass(torch.Tensor), self.format, top, left, height, width, size=size ) - return BoundingBoxes.wrap_like(self, output, spatial_size=spatial_size) + return BBoxes.wrap_like(self, output, spatial_size=spatial_size) def pad( self, 
padding: Union[int, Sequence[int]], fill: Optional[Union[int, float, List[float]]] = None, padding_mode: str = "constant", - ) -> BoundingBoxes: - output, spatial_size = self._F.pad_bounding_boxes( + ) -> BBoxes: + output, spatial_size = self._F.pad_bboxes( self.as_subclass(torch.Tensor), format=self.format, spatial_size=self.spatial_size, padding=padding, padding_mode=padding_mode, ) - return BoundingBoxes.wrap_like(self, output, spatial_size=spatial_size) + return BBoxes.wrap_like(self, output, spatial_size=spatial_size) def rotate( self, @@ -174,8 +174,8 @@ def rotate( expand: bool = False, center: Optional[List[float]] = None, fill: _FillTypeJIT = None, - ) -> BoundingBoxes: - output, spatial_size = self._F.rotate_bounding_boxes( + ) -> BBoxes: + output, spatial_size = self._F.rotate_bboxes( self.as_subclass(torch.Tensor), format=self.format, spatial_size=self.spatial_size, @@ -183,7 +183,7 @@ def rotate( expand=expand, center=center, ) - return BoundingBoxes.wrap_like(self, output, spatial_size=spatial_size) + return BBoxes.wrap_like(self, output, spatial_size=spatial_size) def affine( self, @@ -194,8 +194,8 @@ def affine( interpolation: Union[InterpolationMode, int] = InterpolationMode.NEAREST, fill: _FillTypeJIT = None, center: Optional[List[float]] = None, - ) -> BoundingBoxes: - output = self._F.affine_bounding_boxes( + ) -> BBoxes: + output = self._F.affine_bboxes( self.as_subclass(torch.Tensor), self.format, self.spatial_size, @@ -205,7 +205,7 @@ def affine( shear=shear, center=center, ) - return BoundingBoxes.wrap_like(self, output) + return BBoxes.wrap_like(self, output) def perspective( self, @@ -214,8 +214,8 @@ def perspective( interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, fill: _FillTypeJIT = None, coefficients: Optional[List[float]] = None, - ) -> BoundingBoxes: - output = self._F.perspective_bounding_boxes( + ) -> BBoxes: + output = self._F.perspective_bboxes( self.as_subclass(torch.Tensor), format=self.format, spatial_size=self.spatial_size, @@ -223,15 +223,15 @@ def perspective( endpoints=endpoints, coefficients=coefficients, ) - return BoundingBoxes.wrap_like(self, output) + return BBoxes.wrap_like(self, output) def elastic( self, displacement: torch.Tensor, interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, fill: _FillTypeJIT = None, - ) -> BoundingBoxes: - output = self._F.elastic_bounding_boxes( + ) -> BBoxes: + output = self._F.elastic_bboxes( self.as_subclass(torch.Tensor), self.format, self.spatial_size, displacement=displacement ) - return BoundingBoxes.wrap_like(self, output) + return BBoxes.wrap_like(self, output) diff --git a/torchvision/datapoints/_datapoint.py b/torchvision/datapoints/_datapoint.py index 35072159d7f..a813159064f 100644 --- a/torchvision/datapoints/_datapoint.py +++ b/torchvision/datapoints/_datapoint.py @@ -138,8 +138,8 @@ def __deepcopy__(self: D, memo: Dict[int, Any]) -> D: # *not* happen for `deepcopy(Tensor)`. A side-effect from detaching is that the `Tensor.requires_grad` # attribute is cleared, so we need to refill it before we return. # Note: We don't explicitly handle deep-copying of the metadata here. The only metadata we currently have is - # `BoundingBoxes.format` and `BoundingBoxes.spatial_size`, which are immutable and thus implicitly deep-copied by - # `BoundingBoxes.clone()`. + # `BBoxes.format` and `BBoxes.spatial_size`, which are immutable and thus implicitly deep-copied by + # `BBoxes.clone()`. 
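As a quick illustration of the metadata behaviour described in the comment above, here is a minimal sketch, assuming the renamed datapoints.BBoxes API introduced by this patch:

import copy
from torchvision import datapoints

# A single box in XYXY format on a 32x32 image, using the names introduced by this patch.
boxes = datapoints.BBoxes([[0, 0, 10, 10]], format="XYXY", spatial_size=(32, 32))

# deepcopy() goes through detach().clone(); per the comment above, the immutable
# metadata (format and spatial_size) is expected to carry over to the copy.
copied = copy.deepcopy(boxes)
print(copied.format, copied.spatial_size)  # expected: BBoxFormat.XYXY (32, 32)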
return self.detach().clone().requires_grad_(self.requires_grad) # type: ignore[return-value] def horizontal_flip(self) -> Datapoint: diff --git a/torchvision/datapoints/_dataset_wrapper.py b/torchvision/datapoints/_dataset_wrapper.py index 3b162b69cbf..95b3aefae94 100644 --- a/torchvision/datapoints/_dataset_wrapper.py +++ b/torchvision/datapoints/_dataset_wrapper.py @@ -44,7 +44,7 @@ def wrap_dataset_for_transforms_v2(dataset, target_keys=None): the target and wrap the data in the corresponding ``torchvision.datapoints``. The original keys are preserved. If ``target_keys`` is ommitted, returns only the values for the ``"boxes"`` and ``"labels"``. * :class:`~torchvision.datasets.CelebA`: The target for ``target_type="bbox"`` is converted to the ``XYXY`` - coordinate format and wrapped into a :class:`~torchvision.datapoints.BoundingBoxes` datapoint. + coordinate format and wrapped into a :class:`~torchvision.datapoints.BBoxes` datapoint. * :class:`~torchvision.datasets.Kitti`: Instead returning the target as list of dicts, the wrapper returns a dict of lists. In addition, the key-value-pairs ``"boxes"`` and ``"labels"`` are added and wrap the data in the corresponding ``torchvision.datapoints``. The original keys are preserved. If ``target_keys`` is @@ -56,7 +56,7 @@ def wrap_dataset_for_transforms_v2(dataset, target_keys=None): a dictionary with the key-value-pairs ``"masks"`` (as :class:`~torchvision.datapoints.Mask` datapoint) and ``"labels"``. * :class:`~torchvision.datasets.WIDERFace`: The value for key ``"bbox"`` in the target is converted to ``XYXY`` - coordinate format and wrapped into a :class:`~torchvision.datapoints.BoundingBoxes` datapoint. + coordinate format and wrapped into a :class:`~torchvision.datapoints.BBoxes` datapoint. Image classification datasets @@ -360,13 +360,13 @@ def wrapper(idx, sample): target["image_id"] = image_id if "boxes" in target_keys: - target["boxes"] = F.convert_format_bounding_boxes( - datapoints.BoundingBoxes( + target["boxes"] = F.convert_format_bboxes( + datapoints.BBoxes( batched_target["bbox"], - format=datapoints.BoundingBoxFormat.XYWH, + format=datapoints.BBoxFormat.XYWH, spatial_size=spatial_size, ), - new_format=datapoints.BoundingBoxFormat.XYXY, + new_format=datapoints.BBoxFormat.XYXY, ) if "masks" in target_keys: @@ -442,12 +442,12 @@ def wrapper(idx, sample): target = {} if "boxes" in target_keys: - target["boxes"] = datapoints.BoundingBoxes( + target["boxes"] = datapoints.BBoxes( [ [int(bndbox[part]) for part in ("xmin", "ymin", "xmax", "ymax")] for bndbox in batched_instances["bndbox"] ], - format=datapoints.BoundingBoxFormat.XYXY, + format=datapoints.BBoxFormat.XYXY, spatial_size=(image.height, image.width), ) @@ -481,13 +481,13 @@ def wrapper(idx, sample): target, target_types=dataset.target_type, type_wrappers={ - "bbox": lambda item: F.convert_format_bounding_boxes( - datapoints.BoundingBoxes( + "bbox": lambda item: F.convert_format_bboxes( + datapoints.BBoxes( item, - format=datapoints.BoundingBoxFormat.XYWH, + format=datapoints.BBoxFormat.XYWH, spatial_size=(image.height, image.width), ), - new_format=datapoints.BoundingBoxFormat.XYXY, + new_format=datapoints.BBoxFormat.XYXY, ), }, ) @@ -532,9 +532,9 @@ def wrapper(idx, sample): target = {} if "boxes" in target_keys: - target["boxes"] = datapoints.BoundingBoxes( + target["boxes"] = datapoints.BBoxes( batched_target["bbox"], - format=datapoints.BoundingBoxFormat.XYXY, + format=datapoints.BBoxFormat.XYXY, spatial_size=(image.height, image.width), ) @@ -628,11 +628,11 @@ def 
wrapper(idx, sample): target = {key: target[key] for key in target_keys} if "bbox" in target_keys: - target["bbox"] = F.convert_format_bounding_boxes( - datapoints.BoundingBoxes( - target["bbox"], format=datapoints.BoundingBoxFormat.XYWH, spatial_size=(image.height, image.width) + target["bbox"] = F.convert_format_bboxes( + datapoints.BBoxes( + target["bbox"], format=datapoints.BBoxFormat.XYWH, spatial_size=(image.height, image.width) ), - new_format=datapoints.BoundingBoxFormat.XYXY, + new_format=datapoints.BBoxFormat.XYXY, ) return image, target diff --git a/torchvision/ops/boxes.py b/torchvision/ops/boxes.py index a541f8d880a..63687a13457 100644 --- a/torchvision/ops/boxes.py +++ b/torchvision/ops/boxes.py @@ -404,14 +404,14 @@ def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor: n = masks.shape[0] - bounding_boxes = torch.zeros((n, 4), device=masks.device, dtype=torch.float) + bboxes = torch.zeros((n, 4), device=masks.device, dtype=torch.float) for index, mask in enumerate(masks): y, x = torch.where(mask != 0) - bounding_boxes[index, 0] = torch.min(x) - bounding_boxes[index, 1] = torch.min(y) - bounding_boxes[index, 2] = torch.max(x) - bounding_boxes[index, 3] = torch.max(y) + bboxes[index, 0] = torch.min(x) + bboxes[index, 1] = torch.min(y) + bboxes[index, 2] = torch.max(x) + bboxes[index, 3] = torch.max(y) - return bounding_boxes + return bboxes diff --git a/torchvision/prototype/datasets/_builtin/caltech.py b/torchvision/prototype/datasets/_builtin/caltech.py index 631de46b2b6..f5080d80c01 100644 --- a/torchvision/prototype/datasets/_builtin/caltech.py +++ b/torchvision/prototype/datasets/_builtin/caltech.py @@ -6,7 +6,7 @@ import torch from torchdata.datapipes.iter import Filter, IterDataPipe, IterKeyZipper, Mapper -from torchvision.datapoints import BoundingBoxes +from torchvision.datapoints import BBoxes from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, EncodedImage, GDriveResource, OnlineResource from torchvision.prototype.datasets.utils._internal import ( @@ -112,7 +112,7 @@ def _prepare_sample( image_path=image_path, image=image, ann_path=ann_path, - bounding_boxes=BoundingBoxes( + bboxes=BBoxes( ann["box_coord"].astype(np.int64).squeeze()[[2, 0, 3, 1]], format="xyxy", spatial_size=image.spatial_size, diff --git a/torchvision/prototype/datasets/_builtin/celeba.py b/torchvision/prototype/datasets/_builtin/celeba.py index 9112a80357c..c4109b8c7ff 100644 --- a/torchvision/prototype/datasets/_builtin/celeba.py +++ b/torchvision/prototype/datasets/_builtin/celeba.py @@ -4,7 +4,7 @@ import torch from torchdata.datapipes.iter import Filter, IterDataPipe, IterKeyZipper, Mapper, Zipper -from torchvision.datapoints import BoundingBoxes +from torchvision.datapoints import BBoxes from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, EncodedImage, GDriveResource, OnlineResource from torchvision.prototype.datasets.utils._internal import ( @@ -100,7 +100,7 @@ def _resources(self) -> List[OnlineResource]: sha256="f0e5da289d5ccf75ffe8811132694922b60f2af59256ed362afa03fefba324d0", file_name="list_attr_celeba.txt", ) - bounding_boxes = GDriveResource( + bboxes = GDriveResource( "0B7EVK8r0v71pbThiMVRxWXZ4dU0", sha256="7487a82e57c4bb956c5445ae2df4a91ffa717e903c5fa22874ede0820c8ec41b", file_name="list_bbox_celeba.txt", @@ -110,7 +110,7 @@ def _resources(self) -> List[OnlineResource]: sha256="6c02a87569907f6db2ba99019085697596730e8129f67a3d61659f198c48d43b", 
file_name="list_landmarks_align_celeba.txt", ) - return [splits, images, identities, attributes, bounding_boxes, landmarks] + return [splits, images, identities, attributes, bboxes, landmarks] def _filter_split(self, data: Tuple[str, Dict[str, str]]) -> bool: split_id = { @@ -137,15 +137,15 @@ def _prepare_sample( path, buffer = image_data image = EncodedImage.from_file(buffer) - (_, identity), (_, attributes), (_, bounding_boxes), (_, landmarks) = ann_data + (_, identity), (_, attributes), (_, bboxes), (_, landmarks) = ann_data return dict( path=path, image=image, identity=Label(int(identity["identity"])), attributes={attr: value == "1" for attr, value in attributes.items()}, - bounding_boxes=BoundingBoxes( - [int(bounding_boxes[key]) for key in ("x_1", "y_1", "width", "height")], + bboxes=BBoxes( + [int(bboxes[key]) for key in ("x_1", "y_1", "width", "height")], format="xywh", spatial_size=image.spatial_size, ), @@ -156,7 +156,7 @@ def _prepare_sample( ) def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: - splits_dp, images_dp, identities_dp, attributes_dp, bounding_boxes_dp, landmarks_dp = resource_dps + splits_dp, images_dp, identities_dp, attributes_dp, bboxes_dp, landmarks_dp = resource_dps splits_dp = CelebACSVParser(splits_dp, fieldnames=("image_id", "split_id")) splits_dp = Filter(splits_dp, self._filter_split) @@ -169,7 +169,7 @@ def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, for dp, fieldnames in ( (identities_dp, ("image_id", "identity")), (attributes_dp, None), - (bounding_boxes_dp, None), + (bboxes_dp, None), (landmarks_dp, None), ) ] diff --git a/torchvision/prototype/datasets/_builtin/coco.py b/torchvision/prototype/datasets/_builtin/coco.py index abf19acec0d..0c7da1a5dc0 100644 --- a/torchvision/prototype/datasets/_builtin/coco.py +++ b/torchvision/prototype/datasets/_builtin/coco.py @@ -14,7 +14,7 @@ Mapper, UnBatcher, ) -from torchvision.datapoints import BoundingBoxes, Mask +from torchvision.datapoints import BBoxes, Mask from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, EncodedImage, HttpResource, OnlineResource from torchvision.prototype.datasets.utils._internal import ( @@ -126,7 +126,7 @@ def _decode_instances_anns(self, anns: List[Dict[str, Any]], image_meta: Dict[st ), areas=torch.as_tensor([ann["area"] for ann in anns]), crowds=torch.as_tensor([ann["iscrowd"] for ann in anns], dtype=torch.bool), - bounding_boxes=BoundingBoxes( + bboxes=BBoxes( [ann["bbox"] for ann in anns], format="xywh", spatial_size=spatial_size, diff --git a/torchvision/prototype/datasets/_builtin/cub200.py b/torchvision/prototype/datasets/_builtin/cub200.py index b301c6ba030..11218859470 100644 --- a/torchvision/prototype/datasets/_builtin/cub200.py +++ b/torchvision/prototype/datasets/_builtin/cub200.py @@ -15,7 +15,7 @@ Mapper, ) from torchdata.datapipes.map import IterToMapConverter -from torchvision.datapoints import BoundingBoxes +from torchvision.datapoints import BBoxes from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, EncodedImage, GDriveResource, OnlineResource from torchvision.prototype.datasets.utils._internal import ( @@ -112,7 +112,7 @@ def _2011_classify_archive(self, data: Tuple[str, Any]) -> Optional[int]: return 1 elif path.name == "images.txt": return 2 - elif path.name == "bounding_boxes.txt": + elif path.name == "bboxes.txt": return 3 else: return None @@ -134,12 +134,10 @@ def 
_2011_segmentation_key(self, data: Tuple[str, Any]) -> str: def _2011_prepare_ann( self, data: Tuple[str, Tuple[List[str], Tuple[str, BinaryIO]]], spatial_size: Tuple[int, int] ) -> Dict[str, Any]: - _, (bounding_boxes_data, segmentation_data) = data + _, (bboxes_data, segmentation_data) = data segmentation_path, segmentation_buffer = segmentation_data return dict( - bounding_boxes=BoundingBoxes( - [float(part) for part in bounding_boxes_data[1:]], format="xywh", spatial_size=spatial_size - ), + bboxes=BBoxes([float(part) for part in bboxes_data[1:]], format="xywh", spatial_size=spatial_size), segmentation_path=segmentation_path, segmentation=EncodedImage.from_file(segmentation_buffer), ) @@ -158,7 +156,7 @@ def _2010_prepare_ann( content = read_mat(buffer) return dict( ann_path=path, - bounding_boxes=BoundingBoxes( + bboxes=BBoxes( [int(content["bbox"][coord]) for coord in ("left", "bottom", "right", "top")], format="xyxy", spatial_size=spatial_size, @@ -191,7 +189,7 @@ def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, prepare_ann_fn: Callable if self._year == "2011": archive_dp, segmentations_dp = resource_dps - images_dp, split_dp, image_files_dp, bounding_boxes_dp = Demultiplexer( + images_dp, split_dp, image_files_dp, bboxes_dp = Demultiplexer( archive_dp, 4, self._2011_classify_archive, drop_none=True, buffer_size=INFINITE_BUFFER_SIZE ) @@ -204,11 +202,11 @@ def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, split_dp = Mapper(split_dp, getitem(0)) split_dp = Mapper(split_dp, image_files_map.__getitem__) - bounding_boxes_dp = CSVParser(bounding_boxes_dp, dialect="cub200") - bounding_boxes_dp = Mapper(bounding_boxes_dp, image_files_map.__getitem__, input_col=0) + bboxes_dp = CSVParser(bboxes_dp, dialect="cub200") + bboxes_dp = Mapper(bboxes_dp, image_files_map.__getitem__, input_col=0) anns_dp = IterKeyZipper( - bounding_boxes_dp, + bboxes_dp, segmentations_dp, key_fn=getitem(0), ref_key_fn=self._2011_segmentation_key, diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py index 34651fcfce3..e0c5cc3f91f 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -2,7 +2,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union from torchdata.datapipes.iter import CSVDictParser, Demultiplexer, Filter, IterDataPipe, Mapper, Zipper -from torchvision.datapoints import BoundingBoxes +from torchvision.datapoints import BBoxes from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, EncodedImage, HttpResource, OnlineResource from torchvision.prototype.datasets.utils._internal import ( @@ -76,7 +76,7 @@ def _prepare_sample(self, data: Tuple[Tuple[str, Any], Dict[str, Any]]) -> Dict[ (path, buffer), csv_info = data label = int(csv_info["ClassId"]) - bounding_boxes = BoundingBoxes( + bboxes = BBoxes( [int(csv_info[k]) for k in ("Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2")], format="xyxy", spatial_size=(int(csv_info["Height"]), int(csv_info["Width"])), @@ -86,7 +86,7 @@ def _prepare_sample(self, data: Tuple[Tuple[str, Any], Dict[str, Any]]) -> Dict[ "path": path, "image": EncodedImage.from_file(buffer), "label": Label(label, categories=self._categories), - "bounding_boxes": bounding_boxes, + "bboxes": bboxes, } def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: diff --git a/torchvision/prototype/datasets/_builtin/stanford_cars.py 
b/torchvision/prototype/datasets/_builtin/stanford_cars.py index aefbbede2e3..11ed5b7117b 100644 --- a/torchvision/prototype/datasets/_builtin/stanford_cars.py +++ b/torchvision/prototype/datasets/_builtin/stanford_cars.py @@ -2,7 +2,7 @@ from typing import Any, BinaryIO, Dict, Iterator, List, Tuple, Union from torchdata.datapipes.iter import Filter, IterDataPipe, Mapper, Zipper -from torchvision.datapoints import BoundingBoxes +from torchvision.datapoints import BBoxes from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, EncodedImage, HttpResource, OnlineResource from torchvision.prototype.datasets.utils._internal import ( @@ -90,7 +90,7 @@ def _prepare_sample(self, data: Tuple[Tuple[str, BinaryIO], Tuple[int, int, int, path=path, image=image, label=Label(target[4] - 1, categories=self._categories), - bounding_boxes=BoundingBoxes(target[:4], format="xyxy", spatial_size=image.spatial_size), + bboxes=BBoxes(target[:4], format="xyxy", spatial_size=image.spatial_size), ) def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: diff --git a/torchvision/prototype/datasets/_builtin/voc.py b/torchvision/prototype/datasets/_builtin/voc.py index 53dfbd185bc..782d947925f 100644 --- a/torchvision/prototype/datasets/_builtin/voc.py +++ b/torchvision/prototype/datasets/_builtin/voc.py @@ -5,7 +5,7 @@ from xml.etree import ElementTree from torchdata.datapipes.iter import Demultiplexer, Filter, IterDataPipe, IterKeyZipper, LineReader, Mapper -from torchvision.datapoints import BoundingBoxes +from torchvision.datapoints import BBoxes from torchvision.datasets import VOCDetection from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, EncodedImage, HttpResource, OnlineResource @@ -103,7 +103,7 @@ def _prepare_detection_ann(self, buffer: BinaryIO) -> Dict[str, Any]: anns = self._parse_detection_ann(buffer) instances = anns["object"] return dict( - bounding_boxes=BoundingBoxes( + bboxes=BBoxes( [ [int(instance["bndbox"][part]) for part in ("xmin", "ymin", "xmax", "ymax")] for instance in instances diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 0e50fb75588..ce5a9d703cc 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -26,7 +26,7 @@ def _check_inputs(self, flat_inputs: List[Any]) -> None: and has_any(flat_inputs, proto_datapoints.OneHotLabel) ): raise TypeError(f"{type(self).__name__}() is only defined for tensor images/videos and one-hot labels.") - if has_any(flat_inputs, PIL.Image.Image, datapoints.BoundingBoxes, datapoints.Mask, proto_datapoints.Label): + if has_any(flat_inputs, PIL.Image.Image, datapoints.BBoxes, datapoints.Mask, proto_datapoints.Label): raise TypeError( f"{type(self).__name__}() does not support PIL images, bounding boxes, masks and plain labels." 
) @@ -175,8 +175,8 @@ def _copy_paste( # There is a similar +1 in other reference implementations: # https://github.com/pytorch/vision/blob/b6feccbc4387766b76a3e22b13815dbbbfa87c0f/torchvision/models/detection/roi_heads.py#L418-L422 xyxy_boxes[:, 2:] += 1 - boxes = F.convert_format_bounding_boxes( - xyxy_boxes, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=bbox_format, inplace=True + boxes = F.convert_format_bboxes( + xyxy_boxes, old_format=datapoints.BBoxFormat.XYXY, new_format=bbox_format, inplace=True ) out_target["boxes"] = torch.cat([boxes, paste_boxes]) @@ -184,8 +184,8 @@ def _copy_paste( out_target["labels"] = torch.cat([labels, paste_labels]) # Check for degenerated boxes and remove them - boxes = F.convert_format_bounding_boxes( - out_target["boxes"], old_format=bbox_format, new_format=datapoints.BoundingBoxFormat.XYXY + boxes = F.convert_format_bboxes( + out_target["boxes"], old_format=bbox_format, new_format=datapoints.BBoxFormat.XYXY ) degenerate_boxes = boxes[:, 2:] <= boxes[:, :2] if degenerate_boxes.any(): @@ -201,14 +201,14 @@ def _extract_image_targets( self, flat_sample: List[Any] ) -> Tuple[List[datapoints._TensorImageType], List[Dict[str, Any]]]: # fetch all images, bboxes, masks and labels from unstructured input - # with List[image], List[BoundingBoxes], List[Mask], List[Label] + # with List[image], List[BBoxes], List[Mask], List[Label] images, bboxes, masks, labels = [], [], [], [] for obj in flat_sample: if isinstance(obj, datapoints.Image) or is_simple_tensor(obj): images.append(obj) elif isinstance(obj, PIL.Image.Image): images.append(F.to_image_tensor(obj)) - elif isinstance(obj, datapoints.BoundingBoxes): + elif isinstance(obj, datapoints.BBoxes): bboxes.append(obj) elif isinstance(obj, datapoints.Mask): masks.append(obj) @@ -218,7 +218,7 @@ def _extract_image_targets( if not (len(images) == len(bboxes) == len(masks) == len(labels)): raise TypeError( f"{type(self).__name__}() requires input sample to contain equal sized list of Images, " - "BoundingBoxeses, Masks and Labels or OneHotLabels." + "BBoxeses, Masks and Labels or OneHotLabels." ) targets = [] @@ -244,8 +244,8 @@ def _insert_outputs( elif is_simple_tensor(obj): flat_sample[i] = output_images[c0] c0 += 1 - elif isinstance(obj, datapoints.BoundingBoxes): - flat_sample[i] = datapoints.BoundingBoxes.wrap_like(obj, output_targets[c1]["boxes"]) + elif isinstance(obj, datapoints.BBoxes): + flat_sample[i] = datapoints.BBoxes.wrap_like(obj, output_targets[c1]["boxes"]) c1 += 1 elif isinstance(obj, datapoints.Mask): flat_sample[i] = datapoints.Mask.wrap_like(obj, output_targets[c2]["masks"]) diff --git a/torchvision/prototype/transforms/_geometry.py b/torchvision/prototype/transforms/_geometry.py index b328c132070..98b531a35c9 100644 --- a/torchvision/prototype/transforms/_geometry.py +++ b/torchvision/prototype/transforms/_geometry.py @@ -7,7 +7,7 @@ from torchvision.prototype.datapoints import Label, OneHotLabel from torchvision.transforms.v2 import functional as F, Transform from torchvision.transforms.v2._utils import _setup_fill_arg, _setup_size -from torchvision.transforms.v2.utils import has_any, is_simple_tensor, query_bounding_boxes, query_spatial_size +from torchvision.transforms.v2.utils import has_any, is_simple_tensor, query_bboxes, query_spatial_size class FixedSizeCrop(Transform): @@ -39,9 +39,9 @@ def _check_inputs(self, flat_inputs: List[Any]) -> None: f"{type(self).__name__}() requires input sample to contain an tensor or PIL image or a Video." 
) - if has_any(flat_inputs, datapoints.BoundingBoxes) and not has_any(flat_inputs, Label, OneHotLabel): + if has_any(flat_inputs, datapoints.BBoxes) and not has_any(flat_inputs, Label, OneHotLabel): raise TypeError( - f"If a BoundingBoxes is contained in the input sample, " + f"If a BBoxes is contained in the input sample, " f"{type(self).__name__}() also requires it to contain a Label or OneHotLabel." ) @@ -59,25 +59,25 @@ def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: top = int(offset_height * r) left = int(offset_width * r) - bounding_boxes: Optional[torch.Tensor] + bboxes: Optional[torch.Tensor] try: - bounding_boxes = query_bounding_boxes(flat_inputs) + bboxes = query_bboxes(flat_inputs) except ValueError: - bounding_boxes = None + bboxes = None - if needs_crop and bounding_boxes is not None: - format = bounding_boxes.format - bounding_boxes, spatial_size = F.crop_bounding_boxes( - bounding_boxes.as_subclass(torch.Tensor), + if needs_crop and bboxes is not None: + format = bboxes.format + bboxes, spatial_size = F.crop_bboxes( + bboxes.as_subclass(torch.Tensor), format=format, top=top, left=left, height=new_height, width=new_width, ) - bounding_boxes = F.clamp_bounding_boxes(bounding_boxes, format=format, spatial_size=spatial_size) - height_and_width = F.convert_format_bounding_boxes( - bounding_boxes, old_format=format, new_format=datapoints.BoundingBoxFormat.XYWH + bboxes = F.clamp_bboxes(bboxes, format=format, spatial_size=spatial_size) + height_and_width = F.convert_format_bboxes( + bboxes, old_format=format, new_format=datapoints.BBoxFormat.XYWH )[..., 2:] is_valid = torch.all(height_and_width > 0, dim=-1) else: @@ -112,12 +112,10 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: if params["is_valid"] is not None: if isinstance(inpt, (Label, OneHotLabel, datapoints.Mask)): inpt = inpt.wrap_like(inpt, inpt[params["is_valid"]]) # type: ignore[arg-type] - elif isinstance(inpt, datapoints.BoundingBoxes): - inpt = datapoints.BoundingBoxes.wrap_like( + elif isinstance(inpt, datapoints.BBoxes): + inpt = datapoints.BBoxes.wrap_like( inpt, - F.clamp_bounding_boxes( - inpt[params["is_valid"]], format=inpt.format, spatial_size=inpt.spatial_size - ), + F.clamp_bboxes(inpt[params["is_valid"]], format=inpt.format, spatial_size=inpt.spatial_size), ) if params["needs_pad"]: diff --git a/torchvision/transforms/v2/__init__.py b/torchvision/transforms/v2/__init__.py index b44f479c4b4..5840fcc4a2b 100644 --- a/torchvision/transforms/v2/__init__.py +++ b/torchvision/transforms/v2/__init__.py @@ -39,7 +39,7 @@ ScaleJitter, TenCrop, ) -from ._meta import ClampBoundingBoxes, ConvertBoundingBoxFormat +from ._meta import ClampBBoxes, ConvertBBoxFormat from ._misc import ( ConvertImageDtype, GaussianBlur, @@ -47,7 +47,7 @@ Lambda, LinearTransformation, Normalize, - SanitizeBoundingBoxes, + SanitizeBBoxes, ToDtype, ) from ._temporal import UniformTemporalSubsample diff --git a/torchvision/transforms/v2/_augment.py b/torchvision/transforms/v2/_augment.py index f9038c6af32..353214766e9 100644 --- a/torchvision/transforms/v2/_augment.py +++ b/torchvision/transforms/v2/_augment.py @@ -155,7 +155,7 @@ def forward(self, *inputs): flat_inputs, spec = tree_flatten(inputs) needs_transform_list = self._needs_transform_list(flat_inputs) - if has_any(flat_inputs, PIL.Image.Image, datapoints.BoundingBoxes, datapoints.Mask): + if has_any(flat_inputs, PIL.Image.Image, datapoints.BBoxes, datapoints.Mask): raise ValueError(f"{type(self).__name__}() does not support PIL images, bounding 
boxes and masks.") labels = self._labels_getter(inputs) diff --git a/torchvision/transforms/v2/_auto_augment.py b/torchvision/transforms/v2/_auto_augment.py index 785e1f6970b..48465b4b7c8 100644 --- a/torchvision/transforms/v2/_auto_augment.py +++ b/torchvision/transforms/v2/_auto_augment.py @@ -34,7 +34,7 @@ def _get_random_item(self, dct: Dict[str, Tuple[Callable, bool]]) -> Tuple[str, def _flatten_and_extract_image_or_video( self, inputs: Any, - unsupported_types: Tuple[Type, ...] = (datapoints.BoundingBoxes, datapoints.Mask), + unsupported_types: Tuple[Type, ...] = (datapoints.BBoxes, datapoints.Mask), ) -> Tuple[Tuple[List[Any], TreeSpec, int], Union[datapoints._ImageType, datapoints._VideoType]]: flat_inputs, spec = tree_flatten(inputs if len(inputs) > 1 else inputs[0]) needs_transform_list = self._needs_transform_list(flat_inputs) diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py index a64f7a40e4b..b9dc68b625e 100644 --- a/torchvision/transforms/v2/_geometry.py +++ b/torchvision/transforms/v2/_geometry.py @@ -22,7 +22,7 @@ _setup_float_or_seq, _setup_size, ) -from .utils import has_all, has_any, is_simple_tensor, query_bounding_boxes, query_spatial_size +from .utils import has_all, has_any, is_simple_tensor, query_bboxes, query_spatial_size class RandomHorizontalFlip(_RandomApplyTransform): @@ -31,7 +31,7 @@ class RandomHorizontalFlip(_RandomApplyTransform): .. v2betastatus:: RandomHorizontalFlip transform If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -51,7 +51,7 @@ class RandomVerticalFlip(_RandomApplyTransform): .. v2betastatus:: RandomVerticalFlip transform If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -71,7 +71,7 @@ class Resize(Transform): .. v2betastatus:: Resize transform If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -165,7 +165,7 @@ class CenterCrop(Transform): .. v2betastatus:: CenterCrop transform If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. 
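The docstrings touched above all advertise the same contract: the v2 transforms dispatch on the datapoint type, so an image and its boxes can be transformed jointly in one call. A small sketch of that usage, assuming the renamed BBoxes/BBoxFormat names from this patch; shapes and values are illustrative.

.. code:: python

    import torch
    from torchvision import datapoints
    from torchvision.transforms import v2

    img = datapoints.Image(torch.randint(0, 256, (3, 32, 32), dtype=torch.uint8))
    boxes = datapoints.BBoxes(
        [[2, 2, 10, 12], [5, 5, 20, 20]],
        format=datapoints.BBoxFormat.XYXY,
        spatial_size=(32, 32),
    )

    transform = v2.Compose([
        v2.RandomHorizontalFlip(p=1.0),
        v2.Resize(size=(64, 64), antialias=True),
    ])

    # Both inputs are transformed consistently: the image is flipped and resized,
    # and the box coordinates plus their spatial_size metadata follow along.
    out_img, out_boxes = transform(img, boxes)
    print(out_img.shape, out_boxes.spatial_size)  # torch.Size([3, 64, 64]) (64, 64)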
@@ -193,7 +193,7 @@ class RandomResizedCrop(Transform): .. v2betastatus:: RandomResizedCrop transform If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -371,8 +371,8 @@ def _transform( return F.five_crop(inpt, self.size) def _check_inputs(self, flat_inputs: List[Any]) -> None: - if has_any(flat_inputs, datapoints.BoundingBoxes, datapoints.Mask): - raise TypeError(f"BoundingBoxes'es and Mask's are not supported by {type(self).__name__}()") + if has_any(flat_inputs, datapoints.BBoxes, datapoints.Mask): + raise TypeError(f"BBoxes'es and Mask's are not supported by {type(self).__name__}()") class TenCrop(Transform): @@ -414,8 +414,8 @@ def __init__(self, size: Union[int, Sequence[int]], vertical_flip: bool = False) self.vertical_flip = vertical_flip def _check_inputs(self, flat_inputs: List[Any]) -> None: - if has_any(flat_inputs, datapoints.BoundingBoxes, datapoints.Mask): - raise TypeError(f"BoundingBoxes'es and Mask's are not supported by {type(self).__name__}()") + if has_any(flat_inputs, datapoints.BBoxes, datapoints.Mask): + raise TypeError(f"BBoxes'es and Mask's are not supported by {type(self).__name__}()") def _transform( self, inpt: Union[datapoints._ImageType, datapoints._VideoType], params: Dict[str, Any] @@ -440,7 +440,7 @@ class Pad(Transform): .. v2betastatus:: Pad transform If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -525,7 +525,7 @@ class RandomZoomOut(_RandomApplyTransform): output_height = input_height * r If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -584,7 +584,7 @@ class RandomRotation(Transform): .. v2betastatus:: RandomRotation transform If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -657,7 +657,7 @@ class RandomAffine(Transform): .. v2betastatus:: RandomAffine transform If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) 
+ :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -778,7 +778,7 @@ class RandomCrop(Transform): .. v2betastatus:: RandomCrop transform If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -933,7 +933,7 @@ class RandomPerspective(_RandomApplyTransform): .. v2betastatus:: RandomPerspective transform If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -1019,7 +1019,7 @@ class ElasticTransform(Transform): .. v2betastatus:: RandomPerspective transform If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -1110,15 +1110,15 @@ class RandomIoUCrop(Transform): .. v2betastatus:: RandomIoUCrop transform - This transformation requires an image or video data and ``datapoints.BoundingBoxes`` in the input. + This transformation requires an image or video data and ``datapoints.BBoxes`` in the input. .. warning:: In order to properly remove the bounding boxes below the IoU threshold, `RandomIoUCrop` - must be followed by :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes`, either immediately + must be followed by :class:`~torchvision.transforms.v2.SanitizeBBoxes`, either immediately after or later in the transforms pipeline. If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. 
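As the warning in the RandomIoUCrop docstring above states, the crop only marks out-of-crop boxes as degenerate; SanitizeBBoxes (the renamed SanitizeBoundingBoxes) has to run afterwards to actually drop them together with their labels. A hedged sketch of that pairing, with made-up sample data and the default "labels" key that SanitizeBBoxes looks up:

.. code:: python

    import torch
    from torchvision import datapoints
    from torchvision.transforms import v2

    sample = {
        "image": datapoints.Image(torch.randint(0, 256, (3, 100, 100), dtype=torch.uint8)),
        "boxes": datapoints.BBoxes(
            [[10, 10, 40, 40], [60, 60, 95, 95]],
            format=datapoints.BBoxFormat.XYXY,
            spatial_size=(100, 100),
        ),
        "labels": torch.tensor([1, 2]),
    }

    transform = v2.Compose([
        v2.RandomIoUCrop(),
        # Boxes whose centers fell outside the sampled crop were zeroed out by
        # RandomIoUCrop; SanitizeBBoxes removes them and their labels here.
        v2.SanitizeBBoxes(),
    ])

    out = transform(sample)
    print(out["boxes"].shape, out["labels"].shape)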
@@ -1155,7 +1155,7 @@ def __init__( def _check_inputs(self, flat_inputs: List[Any]) -> None: if not ( - has_all(flat_inputs, datapoints.BoundingBoxes) + has_all(flat_inputs, datapoints.BBoxes) and has_any(flat_inputs, PIL.Image.Image, datapoints.Image, is_simple_tensor) ): raise TypeError( @@ -1165,7 +1165,7 @@ def _check_inputs(self, flat_inputs: List[Any]) -> None: def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: orig_h, orig_w = query_spatial_size(flat_inputs) - bboxes = query_bounding_boxes(flat_inputs) + bboxes = query_bboxes(flat_inputs) while True: # sample an option @@ -1193,8 +1193,8 @@ def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: continue # check for any valid boxes with centers within the crop area - xyxy_bboxes = F.convert_format_bounding_boxes( - bboxes.as_subclass(torch.Tensor), bboxes.format, datapoints.BoundingBoxFormat.XYXY + xyxy_bboxes = F.convert_format_bboxes( + bboxes.as_subclass(torch.Tensor), bboxes.format, datapoints.BBoxFormat.XYXY ) cx = 0.5 * (xyxy_bboxes[..., 0] + xyxy_bboxes[..., 2]) cy = 0.5 * (xyxy_bboxes[..., 1] + xyxy_bboxes[..., 3]) @@ -1220,9 +1220,9 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: output = F.crop(inpt, top=params["top"], left=params["left"], height=params["height"], width=params["width"]) - if isinstance(output, datapoints.BoundingBoxes): + if isinstance(output, datapoints.BBoxes): # We "mark" the invalid boxes as degenreate, and they can be - # removed by a later call to SanitizeBoundingBoxes() + # removed by a later call to SanitizeBBoxes() output[~params["is_within_crop_area"]] = 0 return output @@ -1235,7 +1235,7 @@ class ScaleJitter(Transform): .. v2betastatus:: ScaleJitter transform If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -1301,7 +1301,7 @@ class RandomShortestSize(Transform): .. v2betastatus:: RandomShortestSize transform If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -1380,7 +1380,7 @@ class RandomResize(Transform): output_height = size If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. 
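The _get_params logic above relies on the renamed functional helpers: boxes are converted to XYXY once, after which the centers are simple midpoints of the corner coordinates. The same call works standalone on the plain tensor view; a small numeric sketch (format and function names as renamed in this patch):

.. code:: python

    import torch
    from torchvision import datapoints
    from torchvision.transforms.v2 import functional as F

    # A single CXCYWH box on a 100x100 image.
    boxes = datapoints.BBoxes(
        [[50.0, 50.0, 20.0, 10.0]],
        format=datapoints.BBoxFormat.CXCYWH,
        spatial_size=(100, 100),
    )

    # Convert the plain tensor view to XYXY, as RandomIoUCrop._get_params does.
    xyxy = F.convert_format_bboxes(
        boxes.as_subclass(torch.Tensor),
        old_format=boxes.format,
        new_format=datapoints.BBoxFormat.XYXY,
    )
    cx = 0.5 * (xyxy[..., 0] + xyxy[..., 2])
    cy = 0.5 * (xyxy[..., 1] + xyxy[..., 3])
    print(xyxy.tolist(), cx.item(), cy.item())  # [[40.0, 45.0, 60.0, 55.0]] 50.0 50.0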
diff --git a/torchvision/transforms/v2/_meta.py b/torchvision/transforms/v2/_meta.py index 71cc159c907..472ec185f6a 100644 --- a/torchvision/transforms/v2/_meta.py +++ b/torchvision/transforms/v2/_meta.py @@ -4,39 +4,39 @@ from torchvision.transforms.v2 import functional as F, Transform -class ConvertBoundingBoxFormat(Transform): +class ConvertBBoxFormat(Transform): """[BETA] Convert bounding box coordinates to the given ``format``, eg from "CXCYWH" to "XYXY". - .. v2betastatus:: ConvertBoundingBoxFormat transform + .. v2betastatus:: ConvertBBoxFormat transform Args: - format (str or datapoints.BoundingBoxFormat): output bounding box format. - Possible values are defined by :class:`~torchvision.datapoints.BoundingBoxFormat` and + format (str or datapoints.BBoxFormat): output bounding box format. + Possible values are defined by :class:`~torchvision.datapoints.BBoxFormat` and string values match the enums, e.g. "XYXY" or "XYWH" etc. """ - _transformed_types = (datapoints.BoundingBoxes,) + _transformed_types = (datapoints.BBoxes,) - def __init__(self, format: Union[str, datapoints.BoundingBoxFormat]) -> None: + def __init__(self, format: Union[str, datapoints.BBoxFormat]) -> None: super().__init__() if isinstance(format, str): - format = datapoints.BoundingBoxFormat[format] + format = datapoints.BBoxFormat[format] self.format = format - def _transform(self, inpt: datapoints.BoundingBoxes, params: Dict[str, Any]) -> datapoints.BoundingBoxes: - return F.convert_format_bounding_boxes(inpt, new_format=self.format) # type: ignore[return-value] + def _transform(self, inpt: datapoints.BBoxes, params: Dict[str, Any]) -> datapoints.BBoxes: + return F.convert_format_bboxes(inpt, new_format=self.format) # type: ignore[return-value] -class ClampBoundingBoxes(Transform): +class ClampBBoxes(Transform): """[BETA] Clamp bounding boxes to their corresponding image dimensions. The clamping is done according to the bounding boxes' ``spatial_size`` meta-data. - .. v2betastatus:: ClampBoundingBoxes transform + .. v2betastatus:: ClampBBoxes transform """ - _transformed_types = (datapoints.BoundingBoxes,) + _transformed_types = (datapoints.BBoxes,) - def _transform(self, inpt: datapoints.BoundingBoxes, params: Dict[str, Any]) -> datapoints.BoundingBoxes: - return F.clamp_bounding_boxes(inpt) # type: ignore[return-value] + def _transform(self, inpt: datapoints.BBoxes, params: Dict[str, Any]) -> datapoints.BBoxes: + return F.clamp_bboxes(inpt) # type: ignore[return-value] diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py index a4cb594b2b3..5cc8993f117 100644 --- a/torchvision/transforms/v2/_misc.py +++ b/torchvision/transforms/v2/_misc.py @@ -10,7 +10,7 @@ from torchvision.transforms.v2 import functional as F, Transform from ._utils import _parse_labels_getter, _setup_float_or_seq, _setup_size -from .utils import has_any, is_simple_tensor, query_bounding_boxes +from .utils import has_any, is_simple_tensor, query_bboxes # TODO: do we want/need to expose this? @@ -332,16 +332,16 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: return F.to_dtype(inpt, dtype=self.dtype, scale=True) -class SanitizeBoundingBoxes(Transform): +class SanitizeBBoxes(Transform): """[BETA] Remove degenerate/invalid bounding boxes and their corresponding labels and masks. - .. v2betastatus:: SanitizeBoundingBoxes transform + .. 
v2betastatus:: SanitizeBBoxes transform This transform removes bounding boxes and their associated labels/masks that: - are below a given ``min_size``: by default this also removes degenerate boxes that have e.g. X2 <= X1. - have any coordinate outside of their corresponding image. You may want to - call :class:`~torchvision.transforms.v2.ClampBoundingBoxes` first to avoid undesired removals. + call :class:`~torchvision.transforms.v2.ClampBBoxes` first to avoid undesired removals. It is recommended to call it at the end of a pipeline, before passing the input to the models. It is critical to call this transform if @@ -384,10 +384,10 @@ def forward(self, *inputs: Any) -> Any: ) flat_inputs, spec = tree_flatten(inputs) - # TODO: this enforces one single BoundingBoxes entry. + # TODO: this enforces one single BBoxes entry. # Assuming this transform needs to be called at the end of *any* pipeline that has bboxes... # should we just enforce it for all transforms?? What are the benefits of *not* enforcing this? - boxes = query_bounding_boxes(flat_inputs) + boxes = query_bboxes(flat_inputs) if boxes.ndim != 2: raise ValueError(f"boxes must be of shape (num_boxes, 4), got {boxes.shape}") @@ -398,10 +398,10 @@ def forward(self, *inputs: Any) -> Any: ) boxes = cast( - datapoints.BoundingBoxes, - F.convert_format_bounding_boxes( + datapoints.BBoxes, + F.convert_format_bboxes( boxes, - new_format=datapoints.BoundingBoxFormat.XYXY, + new_format=datapoints.BBoxFormat.XYXY, ), ) ws, hs = boxes[:, 2] - boxes[:, 0], boxes[:, 3] - boxes[:, 1] @@ -415,7 +415,7 @@ def forward(self, *inputs: Any) -> Any: params = dict(valid=valid, labels=labels) flat_outputs = [ # Even-though it may look like we're transforming all inputs, we don't: - # _transform() will only care about BoundingBoxeses and the labels + # _transform() will only care about BBoxeses and the labels self._transform(inpt, params) for inpt in flat_inputs ] @@ -424,9 +424,9 @@ def forward(self, *inputs: Any) -> Any: def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: is_label = inpt is not None and inpt is params["labels"] - is_bounding_boxes_or_mask = isinstance(inpt, (datapoints.BoundingBoxes, datapoints.Mask)) + is_bboxes_or_mask = isinstance(inpt, (datapoints.BBoxes, datapoints.Mask)) - if not (is_label or is_bounding_boxes_or_mask): + if not (is_label or is_bboxes_or_mask): return inpt output = inpt[params["valid"]] diff --git a/torchvision/transforms/v2/functional/__init__.py b/torchvision/transforms/v2/functional/__init__.py index 16f5ff50071..c841eae38a2 100644 --- a/torchvision/transforms/v2/functional/__init__.py +++ b/torchvision/transforms/v2/functional/__init__.py @@ -3,8 +3,8 @@ from ._utils import is_simple_tensor # usort: skip from ._meta import ( - clamp_bounding_boxes, - convert_format_bounding_boxes, + clamp_bboxes, + convert_format_bboxes, get_dimensions_image_tensor, get_dimensions_image_pil, get_dimensions, @@ -15,7 +15,7 @@ get_num_channels_image_pil, get_num_channels_video, get_num_channels, - get_spatial_size_bounding_boxes, + get_spatial_size_bboxes, get_spatial_size_image_tensor, get_spatial_size_image_pil, get_spatial_size_mask, @@ -76,25 +76,25 @@ ) from ._geometry import ( affine, - affine_bounding_boxes, + affine_bboxes, affine_image_pil, affine_image_tensor, affine_mask, affine_video, center_crop, - center_crop_bounding_boxes, + center_crop_bboxes, center_crop_image_pil, center_crop_image_tensor, center_crop_mask, center_crop_video, crop, - crop_bounding_boxes, + crop_bboxes, crop_image_pil, 
crop_image_tensor, crop_mask, crop_video, elastic, - elastic_bounding_boxes, + elastic_bboxes, elastic_image_pil, elastic_image_tensor, elastic_mask, @@ -106,37 +106,37 @@ five_crop_video, hflip, # TODO: Consider moving all pure alias definitions at the bottom of the file horizontal_flip, - horizontal_flip_bounding_boxes, + horizontal_flip_bboxes, horizontal_flip_image_pil, horizontal_flip_image_tensor, horizontal_flip_mask, horizontal_flip_video, pad, - pad_bounding_boxes, + pad_bboxes, pad_image_pil, pad_image_tensor, pad_mask, pad_video, perspective, - perspective_bounding_boxes, + perspective_bboxes, perspective_image_pil, perspective_image_tensor, perspective_mask, perspective_video, resize, - resize_bounding_boxes, + resize_bboxes, resize_image_pil, resize_image_tensor, resize_mask, resize_video, resized_crop, - resized_crop_bounding_boxes, + resized_crop_bboxes, resized_crop_image_pil, resized_crop_image_tensor, resized_crop_mask, resized_crop_video, rotate, - rotate_bounding_boxes, + rotate_bboxes, rotate_image_pil, rotate_image_tensor, rotate_mask, @@ -146,7 +146,7 @@ ten_crop_image_tensor, ten_crop_video, vertical_flip, - vertical_flip_bounding_boxes, + vertical_flip_bboxes, vertical_flip_image_pil, vertical_flip_image_tensor, vertical_flip_mask, diff --git a/torchvision/transforms/v2/functional/_geometry.py b/torchvision/transforms/v2/functional/_geometry.py index 469e58ff9c4..77d19e2e976 100644 --- a/torchvision/transforms/v2/functional/_geometry.py +++ b/torchvision/transforms/v2/functional/_geometry.py @@ -23,7 +23,7 @@ from torchvision.utils import _log_api_usage_once -from ._meta import clamp_bounding_boxes, convert_format_bounding_boxes, get_spatial_size_image_pil +from ._meta import clamp_bboxes, convert_format_bboxes, get_spatial_size_image_pil from ._utils import is_simple_tensor @@ -51,21 +51,21 @@ def horizontal_flip_mask(mask: torch.Tensor) -> torch.Tensor: return horizontal_flip_image_tensor(mask) -def horizontal_flip_bounding_boxes( - bounding_boxes: torch.Tensor, format: datapoints.BoundingBoxFormat, spatial_size: Tuple[int, int] +def horizontal_flip_bboxes( + bboxes: torch.Tensor, format: datapoints.BBoxFormat, spatial_size: Tuple[int, int] ) -> torch.Tensor: - shape = bounding_boxes.shape + shape = bboxes.shape - bounding_boxes = bounding_boxes.clone().reshape(-1, 4) + bboxes = bboxes.clone().reshape(-1, 4) - if format == datapoints.BoundingBoxFormat.XYXY: - bounding_boxes[:, [2, 0]] = bounding_boxes[:, [0, 2]].sub_(spatial_size[1]).neg_() - elif format == datapoints.BoundingBoxFormat.XYWH: - bounding_boxes[:, 0].add_(bounding_boxes[:, 2]).sub_(spatial_size[1]).neg_() - else: # format == datapoints.BoundingBoxFormat.CXCYWH: - bounding_boxes[:, 0].sub_(spatial_size[1]).neg_() + if format == datapoints.BBoxFormat.XYXY: + bboxes[:, [2, 0]] = bboxes[:, [0, 2]].sub_(spatial_size[1]).neg_() + elif format == datapoints.BBoxFormat.XYWH: + bboxes[:, 0].add_(bboxes[:, 2]).sub_(spatial_size[1]).neg_() + else: # format == datapoints.BBoxFormat.CXCYWH: + bboxes[:, 0].sub_(spatial_size[1]).neg_() - return bounding_boxes.reshape(shape) + return bboxes.reshape(shape) def horizontal_flip_video(video: torch.Tensor) -> torch.Tensor: @@ -101,21 +101,21 @@ def vertical_flip_mask(mask: torch.Tensor) -> torch.Tensor: return vertical_flip_image_tensor(mask) -def vertical_flip_bounding_boxes( - bounding_boxes: torch.Tensor, format: datapoints.BoundingBoxFormat, spatial_size: Tuple[int, int] +def vertical_flip_bboxes( + bboxes: torch.Tensor, format: datapoints.BBoxFormat, spatial_size: 
Tuple[int, int] ) -> torch.Tensor: - shape = bounding_boxes.shape + shape = bboxes.shape - bounding_boxes = bounding_boxes.clone().reshape(-1, 4) + bboxes = bboxes.clone().reshape(-1, 4) - if format == datapoints.BoundingBoxFormat.XYXY: - bounding_boxes[:, [1, 3]] = bounding_boxes[:, [3, 1]].sub_(spatial_size[0]).neg_() - elif format == datapoints.BoundingBoxFormat.XYWH: - bounding_boxes[:, 1].add_(bounding_boxes[:, 3]).sub_(spatial_size[0]).neg_() - else: # format == datapoints.BoundingBoxFormat.CXCYWH: - bounding_boxes[:, 1].sub_(spatial_size[0]).neg_() + if format == datapoints.BBoxFormat.XYXY: + bboxes[:, [1, 3]] = bboxes[:, [3, 1]].sub_(spatial_size[0]).neg_() + elif format == datapoints.BBoxFormat.XYWH: + bboxes[:, 1].add_(bboxes[:, 3]).sub_(spatial_size[0]).neg_() + else: # format == datapoints.BBoxFormat.CXCYWH: + bboxes[:, 1].sub_(spatial_size[0]).neg_() - return bounding_boxes.reshape(shape) + return bboxes.reshape(shape) def vertical_flip_video(video: torch.Tensor) -> torch.Tensor: @@ -274,20 +274,20 @@ def resize_mask(mask: torch.Tensor, size: List[int], max_size: Optional[int] = N return output -def resize_bounding_boxes( - bounding_boxes: torch.Tensor, spatial_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None +def resize_bboxes( + bboxes: torch.Tensor, spatial_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None ) -> Tuple[torch.Tensor, Tuple[int, int]]: old_height, old_width = spatial_size new_height, new_width = _compute_resized_output_size(spatial_size, size=size, max_size=max_size) if (new_height, new_width) == (old_height, old_width): - return bounding_boxes, spatial_size + return bboxes, spatial_size w_ratio = new_width / old_width h_ratio = new_height / old_height - ratios = torch.tensor([w_ratio, h_ratio, w_ratio, h_ratio], device=bounding_boxes.device) + ratios = torch.tensor([w_ratio, h_ratio, w_ratio, h_ratio], device=bboxes.device) return ( - bounding_boxes.mul(ratios).to(bounding_boxes.dtype), + bboxes.mul(ratios).to(bboxes.dtype), (new_height, new_width), ) @@ -650,9 +650,9 @@ def affine_image_pil( return _FP.affine(image, matrix, interpolation=pil_modes_mapping[interpolation], fill=fill) -def _affine_bounding_boxes_with_expand( - bounding_boxes: torch.Tensor, - format: datapoints.BoundingBoxFormat, +def _affine_bboxes_with_expand( + bboxes: torch.Tensor, + format: datapoints.BBoxFormat, spatial_size: Tuple[int, int], angle: Union[int, float], translate: List[float], @@ -661,18 +661,16 @@ def _affine_bounding_boxes_with_expand( center: Optional[List[float]] = None, expand: bool = False, ) -> Tuple[torch.Tensor, Tuple[int, int]]: - if bounding_boxes.numel() == 0: - return bounding_boxes, spatial_size - - original_shape = bounding_boxes.shape - original_dtype = bounding_boxes.dtype - bounding_boxes = bounding_boxes.clone() if bounding_boxes.is_floating_point() else bounding_boxes.float() - dtype = bounding_boxes.dtype - device = bounding_boxes.device - bounding_boxes = ( - convert_format_bounding_boxes( - bounding_boxes, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY, inplace=True - ) + if bboxes.numel() == 0: + return bboxes, spatial_size + + original_shape = bboxes.shape + original_dtype = bboxes.dtype + bboxes = bboxes.clone() if bboxes.is_floating_point() else bboxes.float() + dtype = bboxes.dtype + device = bboxes.device + bboxes = ( + convert_format_bboxes(bboxes, old_format=format, new_format=datapoints.BBoxFormat.XYXY, inplace=True) ).reshape(-1, 4) angle, translate, shear, center = 
_affine_parse_args( @@ -697,7 +695,7 @@ def _affine_bounding_boxes_with_expand( # Tensor of points has shape (N * 4, 3), where N is the number of bboxes # Single point structure is similar to # [(xmin, ymin, 1), (xmax, ymin, 1), (xmax, ymax, 1), (xmin, ymax, 1)] - points = bounding_boxes[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2) + points = bboxes[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2) points = torch.cat([points, torch.ones(points.shape[0], 1, device=device, dtype=dtype)], dim=-1) # 2) Now let's transform the points using affine matrix transformed_points = torch.matmul(points, transposed_affine_matrix) @@ -730,18 +728,18 @@ def _affine_bounding_boxes_with_expand( new_width, new_height = _compute_affine_output_size(affine_vector, width, height) spatial_size = (new_height, new_width) - out_bboxes = clamp_bounding_boxes(out_bboxes, format=datapoints.BoundingBoxFormat.XYXY, spatial_size=spatial_size) - out_bboxes = convert_format_bounding_boxes( - out_bboxes, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format, inplace=True + out_bboxes = clamp_bboxes(out_bboxes, format=datapoints.BBoxFormat.XYXY, spatial_size=spatial_size) + out_bboxes = convert_format_bboxes( + out_bboxes, old_format=datapoints.BBoxFormat.XYXY, new_format=format, inplace=True ).reshape(original_shape) out_bboxes = out_bboxes.to(original_dtype) return out_bboxes, spatial_size -def affine_bounding_boxes( - bounding_boxes: torch.Tensor, - format: datapoints.BoundingBoxFormat, +def affine_bboxes( + bboxes: torch.Tensor, + format: datapoints.BBoxFormat, spatial_size: Tuple[int, int], angle: Union[int, float], translate: List[float], @@ -749,8 +747,8 @@ def affine_bounding_boxes( shear: List[float], center: Optional[List[float]] = None, ) -> torch.Tensor: - out_box, _ = _affine_bounding_boxes_with_expand( - bounding_boxes, + out_box, _ = _affine_bboxes_with_expand( + bboxes, format=format, spatial_size=spatial_size, angle=angle, @@ -927,9 +925,9 @@ def rotate_image_pil( ) -def rotate_bounding_boxes( - bounding_boxes: torch.Tensor, - format: datapoints.BoundingBoxFormat, +def rotate_bboxes( + bboxes: torch.Tensor, + format: datapoints.BBoxFormat, spatial_size: Tuple[int, int], angle: float, expand: bool = False, @@ -938,8 +936,8 @@ def rotate_bounding_boxes( if center is not None and expand: warnings.warn("The provided center argument has no effect on the result if expand is True") - return _affine_bounding_boxes_with_expand( - bounding_boxes, + return _affine_bboxes_with_expand( + bboxes, format=format, spatial_size=spatial_size, angle=-angle, @@ -1165,9 +1163,9 @@ def pad_mask( return output -def pad_bounding_boxes( - bounding_boxes: torch.Tensor, - format: datapoints.BoundingBoxFormat, +def pad_bboxes( + bboxes: torch.Tensor, + format: datapoints.BBoxFormat, spatial_size: Tuple[int, int], padding: List[int], padding_mode: str = "constant", @@ -1178,18 +1176,18 @@ def pad_bounding_boxes( left, right, top, bottom = _parse_pad_padding(padding) - if format == datapoints.BoundingBoxFormat.XYXY: + if format == datapoints.BBoxFormat.XYXY: pad = [left, top, left, top] else: pad = [left, top, 0, 0] - bounding_boxes = bounding_boxes + torch.tensor(pad, dtype=bounding_boxes.dtype, device=bounding_boxes.device) + bboxes = bboxes + torch.tensor(pad, dtype=bboxes.dtype, device=bboxes.device) height, width = spatial_size height += top + bottom width += left + right spatial_size = (height, width) - return clamp_bounding_boxes(bounding_boxes, format=format, spatial_size=spatial_size), spatial_size + return 
clamp_bboxes(bboxes, format=format, spatial_size=spatial_size), spatial_size def pad_video( @@ -1245,9 +1243,9 @@ def crop_image_tensor(image: torch.Tensor, top: int, left: int, height: int, wid crop_image_pil = _FP.crop -def crop_bounding_boxes( - bounding_boxes: torch.Tensor, - format: datapoints.BoundingBoxFormat, +def crop_bboxes( + bboxes: torch.Tensor, + format: datapoints.BBoxFormat, top: int, left: int, height: int, @@ -1255,15 +1253,15 @@ def crop_bounding_boxes( ) -> Tuple[torch.Tensor, Tuple[int, int]]: # Crop or implicit pad if left and/or top have negative values: - if format == datapoints.BoundingBoxFormat.XYXY: + if format == datapoints.BBoxFormat.XYXY: sub = [left, top, left, top] else: sub = [left, top, 0, 0] - bounding_boxes = bounding_boxes - torch.tensor(sub, dtype=bounding_boxes.dtype, device=bounding_boxes.device) + bboxes = bboxes - torch.tensor(sub, dtype=bboxes.dtype, device=bboxes.device) spatial_size = (height, width) - return clamp_bounding_boxes(bounding_boxes, format=format, spatial_size=spatial_size), spatial_size + return clamp_bboxes(bboxes, format=format, spatial_size=spatial_size), spatial_size def crop_mask(mask: torch.Tensor, top: int, left: int, height: int, width: int) -> torch.Tensor: @@ -1409,27 +1407,25 @@ def perspective_image_pil( return _FP.perspective(image, perspective_coeffs, interpolation=pil_modes_mapping[interpolation], fill=fill) -def perspective_bounding_boxes( - bounding_boxes: torch.Tensor, - format: datapoints.BoundingBoxFormat, +def perspective_bboxes( + bboxes: torch.Tensor, + format: datapoints.BBoxFormat, spatial_size: Tuple[int, int], startpoints: Optional[List[List[int]]], endpoints: Optional[List[List[int]]], coefficients: Optional[List[float]] = None, ) -> torch.Tensor: - if bounding_boxes.numel() == 0: - return bounding_boxes + if bboxes.numel() == 0: + return bboxes perspective_coeffs = _perspective_coefficients(startpoints, endpoints, coefficients) - original_shape = bounding_boxes.shape - # TODO: first cast to float if bbox is int64 before convert_format_bounding_boxes - bounding_boxes = ( - convert_format_bounding_boxes(bounding_boxes, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY) - ).reshape(-1, 4) + original_shape = bboxes.shape + # TODO: first cast to float if bbox is int64 before convert_format_bboxes + bboxes = (convert_format_bboxes(bboxes, old_format=format, new_format=datapoints.BBoxFormat.XYXY)).reshape(-1, 4) - dtype = bounding_boxes.dtype if torch.is_floating_point(bounding_boxes) else torch.float32 - device = bounding_boxes.device + dtype = bboxes.dtype if torch.is_floating_point(bboxes) else torch.float32 + device = bboxes.device # perspective_coeffs are computed as endpoint -> start point # We have to invert perspective_coeffs for bboxes: @@ -1475,7 +1471,7 @@ def perspective_bounding_boxes( # Tensor of points has shape (N * 4, 3), where N is the number of bboxes # Single point structure is similar to # [(xmin, ymin, 1), (xmax, ymin, 1), (xmax, ymax, 1), (xmin, ymax, 1)] - points = bounding_boxes[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2) + points = bboxes[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2) points = torch.cat([points, torch.ones(points.shape[0], 1, device=points.device)], dim=-1) # 2) Now let's transform the points using perspective matrices # x_out = (coeffs[0] * x + coeffs[1] * y + coeffs[2]) / (coeffs[6] * x + coeffs[7] * y + 1) @@ -1490,16 +1486,16 @@ def perspective_bounding_boxes( transformed_points = transformed_points.reshape(-1, 4, 2) out_bbox_mins, 
-    out_bboxes = clamp_bounding_boxes(
-        torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bounding_boxes.dtype),
-        format=datapoints.BoundingBoxFormat.XYXY,
+    out_bboxes = clamp_bboxes(
+        torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bboxes.dtype),
+        format=datapoints.BBoxFormat.XYXY,
         spatial_size=spatial_size,
     )

     # out_bboxes should be of shape [N boxes, 4]
-    return convert_format_bounding_boxes(
-        out_bboxes, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format, inplace=True
+    return convert_format_bboxes(
+        out_bboxes, old_format=datapoints.BBoxFormat.XYXY, new_format=format, inplace=True
     ).reshape(original_shape)
@@ -1648,27 +1644,25 @@ def _create_identity_grid(size: Tuple[int, int], device: torch.device, dtype: to
     return base_grid


-def elastic_bounding_boxes(
-    bounding_boxes: torch.Tensor,
-    format: datapoints.BoundingBoxFormat,
+def elastic_bboxes(
+    bboxes: torch.Tensor,
+    format: datapoints.BBoxFormat,
     spatial_size: Tuple[int, int],
     displacement: torch.Tensor,
 ) -> torch.Tensor:
-    if bounding_boxes.numel() == 0:
-        return bounding_boxes
+    if bboxes.numel() == 0:
+        return bboxes

     # TODO: add in docstring about approximation we are doing for grid inversion
-    device = bounding_boxes.device
-    dtype = bounding_boxes.dtype if torch.is_floating_point(bounding_boxes) else torch.float32
+    device = bboxes.device
+    dtype = bboxes.dtype if torch.is_floating_point(bboxes) else torch.float32

     if displacement.dtype != dtype or displacement.device != device:
         displacement = displacement.to(dtype=dtype, device=device)

-    original_shape = bounding_boxes.shape
-    # TODO: first cast to float if bbox is int64 before convert_format_bounding_boxes
-    bounding_boxes = (
-        convert_format_bounding_boxes(bounding_boxes, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY)
-    ).reshape(-1, 4)
+    original_shape = bboxes.shape
+    # TODO: first cast to float if bbox is int64 before convert_format_bboxes
+    bboxes = (convert_format_bboxes(bboxes, old_format=format, new_format=datapoints.BBoxFormat.XYXY)).reshape(-1, 4)

     id_grid = _create_identity_grid(spatial_size, device=device, dtype=dtype)
     # We construct an approximation of inverse grid as inv_grid = id_grid - displacement
@@ -1676,7 +1670,7 @@ def elastic_bounding_boxes(
     inv_grid = id_grid.sub_(displacement)

     # Get points from bboxes
-    points = bounding_boxes[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2)
+    points = bboxes[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2)
     if points.is_floating_point():
         points = points.ceil_()
     index_xy = points.to(dtype=torch.long)
@@ -1688,14 +1682,14 @@ def elastic_bounding_boxes(

     transformed_points = transformed_points.reshape(-1, 4, 2)
     out_bbox_mins, out_bbox_maxs = torch.aminmax(transformed_points, dim=1)
-    out_bboxes = clamp_bounding_boxes(
-        torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bounding_boxes.dtype),
-        format=datapoints.BoundingBoxFormat.XYXY,
+    out_bboxes = clamp_bboxes(
+        torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bboxes.dtype),
+        format=datapoints.BBoxFormat.XYXY,
         spatial_size=spatial_size,
     )

-    return convert_format_bounding_boxes(
-        out_bboxes, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format, inplace=True
+    return convert_format_bboxes(
+        out_bboxes, old_format=datapoints.BBoxFormat.XYXY, new_format=format, inplace=True
     ).reshape(original_shape)
@@ -1818,17 +1812,15 @@ def center_crop_image_pil(image: PIL.Image.Image, output_size: List[int]) -> PIL
     return crop_image_pil(image, crop_top, crop_left, crop_height, crop_width)


-def center_crop_bounding_boxes(
-    bounding_boxes: torch.Tensor,
-    format: datapoints.BoundingBoxFormat,
+def center_crop_bboxes(
+    bboxes: torch.Tensor,
+    format: datapoints.BBoxFormat,
     spatial_size: Tuple[int, int],
     output_size: List[int],
 ) -> Tuple[torch.Tensor, Tuple[int, int]]:
     crop_height, crop_width = _center_crop_parse_output_size(output_size)
     crop_top, crop_left = _center_crop_compute_crop_anchor(crop_height, crop_width, *spatial_size)
-    return crop_bounding_boxes(
-        bounding_boxes, format, top=crop_top, left=crop_left, height=crop_height, width=crop_width
-    )
+    return crop_bboxes(bboxes, format, top=crop_top, left=crop_left, height=crop_height, width=crop_width)


 def center_crop_mask(mask: torch.Tensor, output_size: List[int]) -> torch.Tensor:
@@ -1895,17 +1887,17 @@ def resized_crop_image_pil(
     return resize_image_pil(image, size, interpolation=interpolation)


-def resized_crop_bounding_boxes(
-    bounding_boxes: torch.Tensor,
-    format: datapoints.BoundingBoxFormat,
+def resized_crop_bboxes(
+    bboxes: torch.Tensor,
+    format: datapoints.BBoxFormat,
     top: int,
     left: int,
     height: int,
     width: int,
     size: List[int],
 ) -> Tuple[torch.Tensor, Tuple[int, int]]:
-    bounding_boxes, _ = crop_bounding_boxes(bounding_boxes, format, top, left, height, width)
-    return resize_bounding_boxes(bounding_boxes, spatial_size=(height, width), size=size)
+    bboxes, _ = crop_bboxes(bboxes, format, top, left, height, width)
+    return resize_bboxes(bboxes, spatial_size=(height, width), size=size)


 def resized_crop_mask(
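The box kernels renamed above (affine, rotate, perspective, elastic, crop, pad) all rely on the same corner trick spelled out in their comments: expand each XYXY box into its four corners, transform the corners, then take the per-box min/max to get an axis-aligned box back before clamping and converting to the caller's format. The snippet below is not part of the patch; it is a standalone sketch using only plain torch, and the helper name transform_xyxy_boxes is made up for the example.

import torch

def transform_xyxy_boxes(boxes_xyxy: torch.Tensor, affine_matrix: torch.Tensor) -> torch.Tensor:
    # boxes_xyxy: (N, 4) in XYXY, affine_matrix: (2, 3) mapping input -> output coordinates
    points = boxes_xyxy[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2)  # (N * 4, 2) corners
    ones = torch.ones(points.shape[0], 1, dtype=points.dtype, device=points.device)
    points = torch.cat([points, ones], dim=-1)   # homogeneous coordinates (N * 4, 3)
    transformed = points @ affine_matrix.T       # (N * 4, 2) transformed corners
    transformed = transformed.reshape(-1, 4, 2)
    mins, maxs = torch.aminmax(transformed, dim=1)  # per-box extremes
    return torch.cat([mins, maxs], dim=1)           # back to XYXY

boxes = torch.tensor([[10.0, 10.0, 20.0, 30.0]])
shift = torch.tensor([[1.0, 0.0, 5.0], [0.0, 1.0, 5.0]])  # translate by (+5, +5)
print(transform_xyxy_boxes(boxes, shift))  # tensor([[15., 15., 25., 35.]])

Taking the min/max is what keeps the result axis-aligned; under rotation or shear the box generally grows, which is why the real kernels clamp to spatial_size afterwards.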
diff --git a/torchvision/transforms/v2/functional/_meta.py b/torchvision/transforms/v2/functional/_meta.py
index f564b180389..fa4c638b093 100644
--- a/torchvision/transforms/v2/functional/_meta.py
+++ b/torchvision/transforms/v2/functional/_meta.py
@@ -3,7 +3,7 @@
 import PIL.Image
 import torch
 from torchvision import datapoints
-from torchvision.datapoints import BoundingBoxFormat
+from torchvision.datapoints import BBoxFormat
 from torchvision.transforms import _functional_pil as _FP
 from torchvision.utils import _log_api_usage_once

@@ -109,8 +109,8 @@ def get_spatial_size_mask(mask: torch.Tensor) -> List[int]:


 @torch.jit.unused
-def get_spatial_size_bounding_boxes(bounding_boxes: datapoints.BoundingBoxes) -> List[int]:
-    return list(bounding_boxes.spatial_size)
+def get_spatial_size_bboxes(bboxes: datapoints.BBoxes) -> List[int]:
+    return list(bboxes.spatial_size)


 def get_spatial_size(inpt: datapoints._InputTypeJIT) -> List[int]:
@@ -119,7 +119,7 @@ def get_spatial_size(inpt: datapoints._InputTypeJIT) -> List[int]:

     if torch.jit.is_scripting() or is_simple_tensor(inpt):
         return get_spatial_size_image_tensor(inpt)
-    elif isinstance(inpt, (datapoints.Image, datapoints.Video, datapoints.BoundingBoxes, datapoints.Mask)):
+    elif isinstance(inpt, (datapoints.Image, datapoints.Video, datapoints.BBoxes, datapoints.Mask)):
         return list(inpt.spatial_size)
     elif isinstance(inpt, PIL.Image.Image):
         return get_spatial_size_image_pil(inpt)
@@ -185,97 +185,89 @@ def _xyxy_to_cxcywh(xyxy: torch.Tensor, inplace: bool) -> torch.Tensor:
     return xyxy


-def _convert_format_bounding_boxes(
-    bounding_boxes: torch.Tensor, old_format: BoundingBoxFormat, new_format: BoundingBoxFormat, inplace: bool = False
+def _convert_format_bboxes(
+    bboxes: torch.Tensor, old_format: BBoxFormat, new_format: BBoxFormat, inplace: bool = False
 ) -> torch.Tensor:
     if new_format == old_format:
-        return bounding_boxes
+        return bboxes

     # TODO: Add _xywh_to_cxcywh and _cxcywh_to_xywh to improve performance
-    if old_format == BoundingBoxFormat.XYWH:
-        bounding_boxes = _xywh_to_xyxy(bounding_boxes, inplace)
-    elif old_format == BoundingBoxFormat.CXCYWH:
-        bounding_boxes = _cxcywh_to_xyxy(bounding_boxes, inplace)
+    if old_format == BBoxFormat.XYWH:
+        bboxes = _xywh_to_xyxy(bboxes, inplace)
+    elif old_format == BBoxFormat.CXCYWH:
+        bboxes = _cxcywh_to_xyxy(bboxes, inplace)

-    if new_format == BoundingBoxFormat.XYWH:
-        bounding_boxes = _xyxy_to_xywh(bounding_boxes, inplace)
-    elif new_format == BoundingBoxFormat.CXCYWH:
-        bounding_boxes = _xyxy_to_cxcywh(bounding_boxes, inplace)
+    if new_format == BBoxFormat.XYWH:
+        bboxes = _xyxy_to_xywh(bboxes, inplace)
+    elif new_format == BBoxFormat.CXCYWH:
+        bboxes = _xyxy_to_cxcywh(bboxes, inplace)

-    return bounding_boxes
+    return bboxes


-def convert_format_bounding_boxes(
+def convert_format_bboxes(
     inpt: datapoints._InputTypeJIT,
-    old_format: Optional[BoundingBoxFormat] = None,
-    new_format: Optional[BoundingBoxFormat] = None,
+    old_format: Optional[BBoxFormat] = None,
+    new_format: Optional[BBoxFormat] = None,
     inplace: bool = False,
 ) -> datapoints._InputTypeJIT:
     # This being a kernel / dispatcher hybrid, we need an option to pass `old_format` explicitly for simple tensor
-    # inputs as well as extract it from `datapoints.BoundingBoxes` inputs. However, putting a default value on
+    # inputs as well as extract it from `datapoints.BBoxes` inputs. However, putting a default value on
     # `old_format` means we also need to put one on `new_format` to have syntactically correct Python. Here we mimic the
     # default error that would be thrown if `new_format` had no default value.
     if new_format is None:
-        raise TypeError("convert_format_bounding_boxes() missing 1 required argument: 'new_format'")
+        raise TypeError("convert_format_bboxes() missing 1 required argument: 'new_format'")

     if not torch.jit.is_scripting():
-        _log_api_usage_once(convert_format_bounding_boxes)
+        _log_api_usage_once(convert_format_bboxes)

     if torch.jit.is_scripting() or is_simple_tensor(inpt):
         if old_format is None:
             raise ValueError("For simple tensor inputs, `old_format` has to be passed.")
-        return _convert_format_bounding_boxes(inpt, old_format=old_format, new_format=new_format, inplace=inplace)
-    elif isinstance(inpt, datapoints.BoundingBoxes):
+        return _convert_format_bboxes(inpt, old_format=old_format, new_format=new_format, inplace=inplace)
+    elif isinstance(inpt, datapoints.BBoxes):
         if old_format is not None:
             raise ValueError("For bounding box datapoint inputs, `old_format` must not be passed.")
-        output = _convert_format_bounding_boxes(
+        output = _convert_format_bboxes(
             inpt.as_subclass(torch.Tensor), old_format=inpt.format, new_format=new_format, inplace=inplace
         )
-        return datapoints.BoundingBoxes.wrap_like(inpt, output, format=new_format)
+        return datapoints.BBoxes.wrap_like(inpt, output, format=new_format)
     else:
         raise TypeError(
             f"Input can either be a plain tensor or a bounding box datapoint, but got {type(inpt)} instead."
         )


-def _clamp_bounding_boxes(
-    bounding_boxes: torch.Tensor, format: BoundingBoxFormat, spatial_size: Tuple[int, int]
-) -> torch.Tensor:
+def _clamp_bboxes(bboxes: torch.Tensor, format: BBoxFormat, spatial_size: Tuple[int, int]) -> torch.Tensor:
     # TODO: Investigate if it makes sense from a performance perspective to have an implementation for every
-    # BoundingBoxFormat instead of converting back and forth
-    in_dtype = bounding_boxes.dtype
-    bounding_boxes = bounding_boxes.clone() if bounding_boxes.is_floating_point() else bounding_boxes.float()
-    xyxy_boxes = convert_format_bounding_boxes(
-        bounding_boxes, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY, inplace=True
-    )
+    # BBoxFormat instead of converting back and forth
+    in_dtype = bboxes.dtype
+    bboxes = bboxes.clone() if bboxes.is_floating_point() else bboxes.float()
+    xyxy_boxes = convert_format_bboxes(bboxes, old_format=format, new_format=datapoints.BBoxFormat.XYXY, inplace=True)
     xyxy_boxes[..., 0::2].clamp_(min=0, max=spatial_size[1])
     xyxy_boxes[..., 1::2].clamp_(min=0, max=spatial_size[0])
-    out_boxes = convert_format_bounding_boxes(
-        xyxy_boxes, old_format=BoundingBoxFormat.XYXY, new_format=format, inplace=True
-    )
+    out_boxes = convert_format_bboxes(xyxy_boxes, old_format=BBoxFormat.XYXY, new_format=format, inplace=True)
     return out_boxes.to(in_dtype)


-def clamp_bounding_boxes(
+def clamp_bboxes(
     inpt: datapoints._InputTypeJIT,
-    format: Optional[BoundingBoxFormat] = None,
+    format: Optional[BBoxFormat] = None,
     spatial_size: Optional[Tuple[int, int]] = None,
 ) -> datapoints._InputTypeJIT:
     if not torch.jit.is_scripting():
-        _log_api_usage_once(clamp_bounding_boxes)
+        _log_api_usage_once(clamp_bboxes)

     if torch.jit.is_scripting() or is_simple_tensor(inpt):
         if format is None or spatial_size is None:
             raise ValueError("For simple tensor inputs, `format` and `spatial_size` has to be passed.")
-        return _clamp_bounding_boxes(inpt, format=format, spatial_size=spatial_size)
-    elif isinstance(inpt, datapoints.BoundingBoxes):
+        return _clamp_bboxes(inpt, format=format, spatial_size=spatial_size)
+    elif isinstance(inpt, datapoints.BBoxes):
         if format is not None or spatial_size is not None:
             raise ValueError("For bounding box datapoint inputs, `format` and `spatial_size` must not be passed.")
-        output = _clamp_bounding_boxes(
-            inpt.as_subclass(torch.Tensor), format=inpt.format, spatial_size=inpt.spatial_size
-        )
-        return datapoints.BoundingBoxes.wrap_like(inpt, output)
+        output = _clamp_bboxes(inpt.as_subclass(torch.Tensor), format=inpt.format, spatial_size=inpt.spatial_size)
+        return datapoints.BBoxes.wrap_like(inpt, output)
     else:
         raise TypeError(
             f"Input can either be a plain tensor or a bounding box datapoint, but got {type(inpt)} instead."
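As the comment in the hunk above notes, convert_format_bboxes and clamp_bboxes are kernel/dispatcher hybrids: plain tensors must carry their metadata as explicit arguments, while box datapoints bring format and spatial_size with them. The usage sketch below is not part of the patch; it assumes the renamed functions are re-exported from torchvision.transforms.v2.functional and that datapoints.BBoxes keeps the BoundingBoxes(data, format=..., spatial_size=...) constructor.

import torch
from torchvision import datapoints
from torchvision.transforms.v2.functional import clamp_bboxes, convert_format_bboxes

xywh = torch.tensor([[5, 5, 20, 10]])  # one box as (x, y, width, height)

# Plain tensor branch: the metadata has to be passed explicitly.
xyxy = convert_format_bboxes(xywh, old_format=datapoints.BBoxFormat.XYWH, new_format=datapoints.BBoxFormat.XYXY)
print(xyxy)  # tensor([[ 5,  5, 25, 15]])
print(clamp_bboxes(xyxy, format=datapoints.BBoxFormat.XYXY, spatial_size=(10, 8)))
# tensor([[ 5,  5,  8, 10]])  -- x clipped to width 8, y clipped to height 10

# Datapoint branch: format and spatial_size come from the BBoxes object itself, so passing
# them again raises a ValueError, per the checks in the dispatchers above.
boxes = datapoints.BBoxes(xywh, format=datapoints.BBoxFormat.XYWH, spatial_size=(10, 8))
print(convert_format_bboxes(boxes, new_format=datapoints.BBoxFormat.XYXY))
print(clamp_bboxes(boxes))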
diff --git a/torchvision/transforms/v2/utils.py b/torchvision/transforms/v2/utils.py
index 978333296d0..37eca94a24e 100644
--- a/torchvision/transforms/v2/utils.py
+++ b/torchvision/transforms/v2/utils.py
@@ -9,13 +9,13 @@
 from torchvision.transforms.v2.functional import get_dimensions, get_spatial_size, is_simple_tensor


-def query_bounding_boxes(flat_inputs: List[Any]) -> datapoints.BoundingBoxes:
-    bounding_boxes = [inpt for inpt in flat_inputs if isinstance(inpt, datapoints.BoundingBoxes)]
-    if not bounding_boxes:
+def query_bboxes(flat_inputs: List[Any]) -> datapoints.BBoxes:
+    bboxes = [inpt for inpt in flat_inputs if isinstance(inpt, datapoints.BBoxes)]
+    if not bboxes:
         raise TypeError("No bounding box was found in the sample")
-    elif len(bounding_boxes) > 1:
+    elif len(bboxes) > 1:
         raise ValueError("Found multiple bounding boxes in the sample")
-    return bounding_boxes.pop()
+    return bboxes.pop()


 def query_chw(flat_inputs: List[Any]) -> Tuple[int, int, int]:
@@ -36,9 +36,7 @@ def query_spatial_size(flat_inputs: List[Any]) -> Tuple[int, int]:
     sizes = {
         tuple(get_spatial_size(inpt))
         for inpt in flat_inputs
-        if isinstance(
-            inpt, (datapoints.Image, PIL.Image.Image, datapoints.Video, datapoints.Mask, datapoints.BoundingBoxes)
-        )
+        if isinstance(inpt, (datapoints.Image, PIL.Image.Image, datapoints.Video, datapoints.Mask, datapoints.BBoxes))
         or is_simple_tensor(inpt)
     }
     if not sizes:
diff --git a/torchvision/utils.py b/torchvision/utils.py
index 6ec19a0e0a1..b78238a621c 100644
--- a/torchvision/utils.py
+++ b/torchvision/utils.py
@@ -13,7 +13,7 @@
 __all__ = [
     "make_grid",
     "save_image",
-    "draw_bounding_boxes",
+    "draw_bboxes",
     "draw_segmentation_masks",
     "draw_keypoints",
     "flow_to_image",
@@ -151,7 +151,7 @@ def save_image(


 @torch.no_grad()
-def draw_bounding_boxes(
+def draw_bboxes(
     image: torch.Tensor,
     boxes: torch.Tensor,
     labels: Optional[List[str]] = None,
@@ -189,7 +189,7 @@ def draw_bounding_boxes(
     """

     if not torch.jit.is_scripting() and not torch.jit.is_tracing():
-        _log_api_usage_once(draw_bounding_boxes)
+        _log_api_usage_once(draw_bboxes)
     if not isinstance(image, torch.Tensor):
         raise TypeError(f"Tensor expected, got {type(image)}")
     elif image.dtype != torch.uint8: