1 change: 1 addition & 0 deletions test/common_utils.py
@@ -638,6 +638,7 @@ def make_bounding_box(
dtype=None,
device="cpu",
):
+    batch_dims = ()  # This is nasty but this whole thing will be removed soon.
Member Author:

@pmeier I had to add this to fix the tests. I'm sure there's a better way to fix it, but I don't want to spend more time on this (I've already spent 10+ minutes), as these tests and helpers will be removed.

Feel free to push a better fix if you prefer.

Contributor:

"as these tests and helpers will be removed."

You put the hack into make_bounding_box, which will not be removed. I've moved it into the helpers that will actually be removed. Nothing new is failing after that, meaning we are not misusing make_bounding_box anywhere on the new side of the tests.

    def sample_position(values, max_value):
        # We cannot use torch.randint directly here, because it only allows integer scalars as values for low and high.
        # However, if we have batch_dims, we need tensors as limits.
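The collapsed body of sample_position is not shown here. As a rough sketch of the workaround the comment describes (assuming integer values and a scalar max_value; an illustration, not necessarily the actual helper), one can scale uniform floats by the tensor-valued limit instead of calling torch.randint:

import torch

def sample_position(values, max_value):
    # torch.randint only accepts Python integer scalars for low/high, so we
    # draw uniform floats in [0, 1) and scale them per element instead.
    return (torch.rand(values.shape) * (max_value - values + 1)).to(torch.int64)

For example, sample_position(torch.tensor([10, 20]), 32) returns one coordinate per box extent such that each box still fits inside the canvas.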
8 changes: 7 additions & 1 deletion test/test_datapoints.py
@@ -22,7 +22,7 @@ def test_mask_instance(data):
assert mask.ndim == 3 and mask.shape[0] == 1


-@pytest.mark.parametrize("data", [torch.randint(0, 32, size=(5, 4)), [[0, 0, 5, 5], [2, 2, 7, 7]]])
+@pytest.mark.parametrize("data", [torch.randint(0, 32, size=(5, 4)), [[0, 0, 5, 5], [2, 2, 7, 7]], [1, 2, 3, 4]])
@pytest.mark.parametrize(
"format", ["XYXY", "CXCYWH", datapoints.BoundingBoxFormat.XYXY, datapoints.BoundingBoxFormat.XYWH]
)
@@ -35,6 +35,12 @@ def test_bbox_instance(data, format):
assert bboxes.format == format


+def test_bbox_dim_error():
+    data_3d = [[[1, 2, 3, 4]]]
+    with pytest.raises(ValueError, match="Expected a 1D or 2D tensor, got 3D"):
+        datapoints.BoundingBoxes(data_3d, format="XYXY", canvas_size=(32, 32))


@pytest.mark.parametrize(
("data", "input_requires_grad", "expected_requires_grad"),
[
10 changes: 10 additions & 0 deletions torchvision/datapoints/_bounding_box.py
@@ -27,6 +27,12 @@ class BoundingBoxFormat(Enum):
class BoundingBoxes(Datapoint):
"""[BETA] :class:`torch.Tensor` subclass for bounding boxes.

+    .. note::
+        There should be only one :class:`~torchvision.datapoints.BoundingBoxes`
+        instance per sample, e.g. ``{"img": img, "bbox": BoundingBoxes(...)}``,
+        although one :class:`~torchvision.datapoints.BoundingBoxes` object can
+        contain multiple bounding boxes.

Args:
data: Any data that can be turned into a tensor with :func:`torch.as_tensor`.
format (BoundingBoxFormat, str): Format of the bounding box.
@@ -44,6 +50,10 @@ class BoundingBoxes(Datapoint):

@classmethod
def _wrap(cls, tensor: torch.Tensor, *, format: BoundingBoxFormat, canvas_size: Tuple[int, int]) -> BoundingBoxes:
+        if tensor.ndim == 1:
+            tensor = tensor.unsqueeze(0)
+        elif tensor.ndim != 2:
+            raise ValueError(f"Expected a 1D or 2D tensor, got {tensor.ndim}D")
Comment on lines +52 to +55
Member Author:

I don't have a super strong opinion on this and I don't mind removing it, but IMO passing any other number of dimensions indicates a user mistake upstream.

We don't support batches of bounding boxes (i.e. we can't batch BoundingBoxes objects where num_bboxes differs per object), so I'm not sure we should pretend that we do.

Contributor:

Due to this, the following check is obsolete:

if boxes.ndim != 2:
    raise ValueError(f"boxes must be of shape (num_boxes, 4), got {boxes.shape}")

I've removed it and the (now failing) test for it in b962a4b. If we decide to not go with this, make sure to revert the commit as well.
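
To make the new behavior concrete, a small sketch based on the diff above (illustrative, not part of the PR):

from torchvision import datapoints

# A single box given as a 1D sequence is promoted to shape (1, 4).
box = datapoints.BoundingBoxes([1, 2, 3, 4], format="XYXY", canvas_size=(32, 32))
assert box.shape == (1, 4)

# Anything that is neither 1D nor 2D is rejected at construction time, which
# is what makes the shape check quoted above obsolete.
try:
    datapoints.BoundingBoxes([[[1, 2, 3, 4]]], format="XYXY", canvas_size=(32, 32))
except ValueError as err:
    print(err)  # Expected a 1D or 2D tensor, got 3D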

bounding_boxes = tensor.as_subclass(cls)
bounding_boxes.format = format
bounding_boxes.canvas_size = canvas_size
4 changes: 2 additions & 2 deletions torchvision/prototype/transforms/_geometry.py
@@ -7,7 +7,7 @@
from torchvision.prototype.datapoints import Label, OneHotLabel
from torchvision.transforms.v2 import functional as F, Transform
from torchvision.transforms.v2._utils import _setup_fill_arg, _setup_size
-from torchvision.transforms.v2.utils import has_any, is_simple_tensor, query_bounding_boxes, query_size
+from torchvision.transforms.v2.utils import get_bounding_boxes, has_any, is_simple_tensor, query_size


class FixedSizeCrop(Transform):
@@ -61,7 +61,7 @@ def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:

bounding_boxes: Optional[torch.Tensor]
try:
-            bounding_boxes = query_bounding_boxes(flat_inputs)
+            bounding_boxes = get_bounding_boxes(flat_inputs)
except ValueError:
bounding_boxes = None

4 changes: 2 additions & 2 deletions torchvision/transforms/v2/_geometry.py
@@ -22,7 +22,7 @@
_setup_float_or_seq,
_setup_size,
)
-from .utils import has_all, has_any, is_simple_tensor, query_bounding_boxes, query_size
+from .utils import get_bounding_boxes, has_all, has_any, is_simple_tensor, query_size


class RandomHorizontalFlip(_RandomApplyTransform):
@@ -1165,7 +1165,7 @@ def _check_inputs(self, flat_inputs: List[Any]) -> None:

def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
orig_h, orig_w = query_size(flat_inputs)
-        bboxes = query_bounding_boxes(flat_inputs)
+        bboxes = get_bounding_boxes(flat_inputs)

while True:
# sample an option
7 changes: 2 additions & 5 deletions torchvision/transforms/v2/_misc.py
@@ -10,7 +10,7 @@
from torchvision.transforms.v2 import functional as F, Transform

from ._utils import _parse_labels_getter, _setup_float_or_seq, _setup_size
-from .utils import has_any, is_simple_tensor, query_bounding_boxes
+from .utils import get_bounding_boxes, has_any, is_simple_tensor


# TODO: do we want/need to expose this?
@@ -384,10 +384,7 @@ def forward(self, *inputs: Any) -> Any:
)

flat_inputs, spec = tree_flatten(inputs)
-        # TODO: this enforces one single BoundingBoxes entry.
-        # Assuming this transform needs to be called at the end of *any* pipeline that has bboxes...
-        # should we just enforce it for all transforms?? What are the benefits of *not* enforcing this?
-        boxes = query_bounding_boxes(flat_inputs)
+        boxes = get_bounding_boxes(flat_inputs)

if boxes.ndim != 2:
raise ValueError(f"boxes must be of shape (num_boxes, 4), got {boxes.shape}")
13 changes: 6 additions & 7 deletions torchvision/transforms/v2/utils.py
@@ -9,13 +9,12 @@
from torchvision.transforms.v2.functional import get_dimensions, get_size, is_simple_tensor


-def query_bounding_boxes(flat_inputs: List[Any]) -> datapoints.BoundingBoxes:
-    bounding_boxes = [inpt for inpt in flat_inputs if isinstance(inpt, datapoints.BoundingBoxes)]
-    if not bounding_boxes:
-        raise TypeError("No bounding boxes were found in the sample")
-    elif len(bounding_boxes) > 1:
-        raise ValueError("Found multiple bounding boxes instances in the sample")
-    return bounding_boxes.pop()
+def get_bounding_boxes(flat_inputs: List[Any]) -> datapoints.BoundingBoxes:
+    # This assumes there is only one bbox per sample as per the general convention
+    try:
+        return next(inpt for inpt in flat_inputs if isinstance(inpt, datapoints.BoundingBoxes))
+    except StopIteration:
+        raise ValueError("No bounding boxes were found in the sample")
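
A usage sketch for the renamed helper (the sample layout is illustrative; the import paths follow what the transforms above use):

import torch
from torch.utils._pytree import tree_flatten
from torchvision import datapoints
from torchvision.transforms.v2.utils import get_bounding_boxes

sample = {
    "img": torch.rand(3, 32, 32),
    "bbox": datapoints.BoundingBoxes([[0, 0, 5, 5]], format="XYXY", canvas_size=(32, 32)),
}
flat_inputs, _ = tree_flatten(sample)

# Returns the first (by convention, the only) BoundingBoxes entry in the
# flattened sample; raises ValueError if there is none.
boxes = get_bounding_boxes(flat_inputs)
print(boxes.format, boxes.canvas_size)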


def query_chw(flat_inputs: List[Any]) -> Tuple[int, int, int]: