diff --git a/torchvision/models/detection/faster_rcnn.py b/torchvision/models/detection/faster_rcnn.py index 057fb2bf221..e5922de3fc8 100644 --- a/torchvision/models/detection/faster_rcnn.py +++ b/torchvision/models/detection/faster_rcnn.py @@ -30,11 +30,11 @@ class FasterRCNN(GeneralizedRCNN): The behavior of the model changes depending if it is in training or evaluation mode. - During training, the model expects both the input tensors, as well as a targets dictionary, + During training, the model expects both the input tensors, as well as targets (list of dictionaries), containing: - - boxes (Tensor[N, 4]): the ground-truth boxes in [x0, y0, x1, y1] format, with values + - boxes (FloatTensor[N, 4]): the ground-truth boxes in [x0, y0, x1, y1] format, with values between 0 and H and 0 and W - - labels (Tensor[N]): the class label for each ground-truth box + - labels (Int64Tensor[N]): the class label for each ground-truth box The model returns a Dict[Tensor] during training, containing the classification and regression losses for both the RPN and the R-CNN. @@ -42,9 +42,9 @@ class FasterRCNN(GeneralizedRCNN): During inference, the model requires only the input tensors, and returns the post-processed predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as follows: - - boxes (Tensor[N, 4]): the predicted boxes in [x0, y0, x1, y1] format, with values between + - boxes (FloatTensor[N, 4]): the predicted boxes in [x0, y0, x1, y1] format, with values between 0 and H and 0 and W - - labels (Tensor[N]): the predicted labels for each image + - labels (Int64Tensor[N]): the predicted labels for each image - scores (Tensor[N]): the scores or each prediction Arguments: @@ -298,11 +298,11 @@ def fasterrcnn_resnet50_fpn(pretrained=False, progress=True, The behavior of the model changes depending if it is in training or evaluation mode. 
- During training, the model expects both the input tensors, as well as a targets dictionary, + During training, the model expects both the input tensors, as well as targets (list of dictionaries), containing: - - boxes (``Tensor[N, 4]``): the ground-truth boxes in ``[x0, y0, x1, y1]`` format, with values + - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x0, y0, x1, y1]`` format, with values between ``0`` and ``H`` and ``0`` and ``W`` - - labels (``Tensor[N]``): the class label for each ground-truth box + - labels (``Int64Tensor[N]``): the class label for each ground-truth box The model returns a ``Dict[Tensor]`` during training, containing the classification and regression losses for both the RPN and the R-CNN. @@ -310,9 +310,9 @@ def fasterrcnn_resnet50_fpn(pretrained=False, progress=True, During inference, the model requires only the input tensors, and returns the post-processed predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as follows: - - boxes (``Tensor[N, 4]``): the predicted boxes in ``[x0, y0, x1, y1]`` format, with values between + - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x0, y0, x1, y1]`` format, with values between ``0`` and ``H`` and ``0`` and ``W`` - - labels (``Tensor[N]``): the predicted labels for each image + - labels (``Int64Tensor[N]``): the predicted labels for each image - scores (``Tensor[N]``): the scores or each prediction Example:: diff --git a/torchvision/models/detection/keypoint_rcnn.py b/torchvision/models/detection/keypoint_rcnn.py index 6cac815f918..a944c023326 100644 --- a/torchvision/models/detection/keypoint_rcnn.py +++ b/torchvision/models/detection/keypoint_rcnn.py @@ -24,12 +24,12 @@ class KeypointRCNN(FasterRCNN): The behavior of the model changes depending if it is in training or evaluation mode. 
- During training, the model expects both the input tensors, as well as a targets dictionary, + During training, the model expects both the input tensors, as well as targets (list of dictionaries), containing: - - boxes (Tensor[N, 4]): the ground-truth boxes in [x0, y0, x1, y1] format, with values + - boxes (FloatTensor[N, 4]): the ground-truth boxes in [x0, y0, x1, y1] format, with values between 0 and H and 0 and W - - labels (Tensor[N]): the class label for each ground-truth box - - keypoints (Tensor[N, K, 3]): the K keypoints location for each of the N instances, in the + - labels (Int64Tensor[N]): the class label for each ground-truth box + - keypoints (FloatTensor[N, K, 3]): the K keypoints location for each of the N instances, in the format [x, y, visibility], where visibility=0 means that the keypoint is not visible. The model returns a Dict[Tensor] during training, containing the classification and regression @@ -38,11 +38,11 @@ class KeypointRCNN(FasterRCNN): During inference, the model requires only the input tensors, and returns the post-processed predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as follows: - - boxes (Tensor[N, 4]): the predicted boxes in [x0, y0, x1, y1] format, with values between + - boxes (FloatTensor[N, 4]): the predicted boxes in [x0, y0, x1, y1] format, with values between 0 and H and 0 and W - - labels (Tensor[N]): the predicted labels for each image + - labels (Int64Tensor[N]): the predicted labels for each image - scores (Tensor[N]): the scores or each prediction - - keypoints (Tensor[N, K, 3]): the locations of the predicted keypoints, in [x, y, v] format. + - keypoints (FloatTensor[N, K, 3]): the locations of the predicted keypoints, in [x, y, v] format. Arguments: backbone (nn.Module): the network used to compute the features for the model. 
@@ -274,12 +274,12 @@ def keypointrcnn_resnet50_fpn(pretrained=False, progress=True, The behavior of the model changes depending if it is in training or evaluation mode. - During training, the model expects both the input tensors, as well as a targets dictionary, + During training, the model expects both the input tensors, as well as targets (list of dictionaries), containing: - - boxes (``Tensor[N, 4]``): the ground-truth boxes in ``[x0, y0, x1, y1]`` format, with values + - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x0, y0, x1, y1]`` format, with values between ``0`` and ``H`` and ``0`` and ``W`` - - labels (``Tensor[N]``): the class label for each ground-truth box - - keypoints (``Tensor[N, K, 3]``): the ``K`` keypoints location for each of the ``N`` instances, in the + - labels (``Int64Tensor[N]``): the class label for each ground-truth box + - keypoints (``FloatTensor[N, K, 3]``): the ``K`` keypoints location for each of the ``N`` instances, in the format ``[x, y, visibility]``, where ``visibility=0`` means that the keypoint is not visible. The model returns a ``Dict[Tensor]`` during training, containing the classification and regression @@ -288,11 +288,11 @@ def keypointrcnn_resnet50_fpn(pretrained=False, progress=True, During inference, the model requires only the input tensors, and returns the post-processed predictions as a ``List[Dict[Tensor]]``, one for each input image. 
The fields of the ``Dict`` are as follows: - - boxes (``Tensor[N, 4]``): the predicted boxes in ``[x0, y0, x1, y1]`` format, with values between + - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x0, y0, x1, y1]`` format, with values between ``0`` and ``H`` and ``0`` and ``W`` - - labels (``Tensor[N]``): the predicted labels for each image + - labels (``Int64Tensor[N]``): the predicted labels for each image - scores (``Tensor[N]``): the scores or each prediction - - keypoints (``Tensor[N, K, 3]``): the locations of the predicted keypoints, in ``[x, y, v]`` format. + - keypoints (``FloatTensor[N, K, 3]``): the locations of the predicted keypoints, in ``[x, y, v]`` format. Example:: diff --git a/torchvision/models/detection/mask_rcnn.py b/torchvision/models/detection/mask_rcnn.py index f992b49fdc4..c311526fe13 100644 --- a/torchvision/models/detection/mask_rcnn.py +++ b/torchvision/models/detection/mask_rcnn.py @@ -26,12 +26,12 @@ class MaskRCNN(FasterRCNN): The behavior of the model changes depending if it is in training or evaluation mode. - During training, the model expects both the input tensors, as well as a targets dictionary, + During training, the model expects both the input tensors, as well as targets (list of dictionaries), containing: - - boxes (Tensor[N, 4]): the ground-truth boxes in [x0, y0, x1, y1] format, with values + - boxes (FloatTensor[N, 4]): the ground-truth boxes in [x0, y0, x1, y1] format, with values between 0 and H and 0 and W - - labels (Tensor[N]): the class label for each ground-truth box - - masks (Tensor[N, 1, H, W]): the segmentation binary masks for each instance + - labels (Int64Tensor[N]): the class label for each ground-truth box + - masks (UInt8Tensor[N, 1, H, W]): the segmentation binary masks for each instance The model returns a Dict[Tensor] during training, containing the classification and regression losses for both the RPN and the R-CNN, and the mask loss. 
@@ -39,11 +39,11 @@ class MaskRCNN(FasterRCNN): During inference, the model requires only the input tensors, and returns the post-processed predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as follows: - - boxes (Tensor[N, 4]): the predicted boxes in [x0, y0, x1, y1] format, with values between + - boxes (FloatTensor[N, 4]): the predicted boxes in [x0, y0, x1, y1] format, with values between 0 and H and 0 and W - - labels (Tensor[N]): the predicted labels for each image + - labels (Int64Tensor[N]): the predicted labels for each image - scores (Tensor[N]): the scores or each prediction - - masks (Tensor[N, 1, H, W]): the predicted masks for each instance, in 0-1 range. In order to + - masks (FloatTensor[N, 1, H, W]): the predicted masks for each instance, in 0-1 range. In order to obtain the final segmentation masks, the soft masks can be thresholded, generally with a value of 0.5 (mask >= 0.5) @@ -273,12 +273,12 @@ def maskrcnn_resnet50_fpn(pretrained=False, progress=True, The behavior of the model changes depending if it is in training or evaluation mode. 
- During training, the model expects both the input tensors, as well as a targets dictionary, + During training, the model expects both the input tensors, as well as targets (list of dictionaries), containing: - - boxes (``Tensor[N, 4]``): the ground-truth boxes in ``[x0, y0, x1, y1]`` format, with values + - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x0, y0, x1, y1]`` format, with values between ``0`` and ``H`` and ``0`` and ``W`` - - labels (``Tensor[N]``): the class label for each ground-truth box - - masks (``Tensor[N, H, W]``): the segmentation binary masks for each instance + - labels (``Int64Tensor[N]``): the class label for each ground-truth box + - masks (``UInt8Tensor[N, 1, H, W]``): the segmentation binary masks for each instance The model returns a ``Dict[Tensor]`` during training, containing the classification and regression losses for both the RPN and the R-CNN, and the mask loss. @@ -286,11 +286,11 @@ def maskrcnn_resnet50_fpn(pretrained=False, progress=True, During inference, the model requires only the input tensors, and returns the post-processed predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as follows: - - boxes (``Tensor[N, 4]``): the predicted boxes in ``[x0, y0, x1, y1]`` format, with values between + - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x0, y0, x1, y1]`` format, with values between ``0`` and ``H`` and ``0`` and ``W`` - - labels (``Tensor[N]``): the predicted labels for each image + - labels (``Int64Tensor[N]``): the predicted labels for each image - scores (``Tensor[N]``): the scores or each prediction - - masks (``Tensor[N, H, W]``): the predicted masks for each instance, in ``0-1`` range. In order to + - masks (``FloatTensor[N, 1, H, W]``): the predicted masks for each instance, in ``0-1`` range. 
In order to obtain the final segmentation masks, the soft masks can be thresholded, generally with a value of 0.5 (``mask >= 0.5``) diff --git a/torchvision/models/detection/roi_heads.py b/torchvision/models/detection/roi_heads.py index 771d3d591ab..beea0062f3a 100644 --- a/torchvision/models/detection/roi_heads.py +++ b/torchvision/models/detection/roi_heads.py @@ -17,6 +17,8 @@ def fastrcnn_loss(class_logits, box_regression, labels, regression_targets): Arguments: class_logits (Tensor) box_regression (Tensor) + labels (list[BoxList]) + regression_targets (Tensor) Returns: classification_loss (Tensor) @@ -55,7 +57,7 @@ def maskrcnn_inference(x, labels): Arguments: x (Tensor): the mask logits - boxes (list[BoxList]): bounding boxes that are used as + labels (list[BoxList]): bounding boxes that are used as reference, one for ech image Returns: @@ -250,7 +252,7 @@ def keypointrcnn_inference(x, boxes): # the next two functions should be merged inside Masker # but are kept here for the moment while we need them -# temporarily gor paste_mask_in_image +# temporarily for paste_mask_in_image def expand_boxes(boxes, scale): w_half = (boxes[:, 2] - boxes[:, 0]) * .5 h_half = (boxes[:, 3] - boxes[:, 1]) * .5 @@ -525,6 +527,13 @@ def forward(self, features, proposals, image_shapes, targets=None): image_shapes (List[Tuple[H, W]]) targets (List[Dict]) """ + if targets is not None: + for t in targets: + assert t["boxes"].dtype.is_floating_point, 'target boxes must of float type' + assert t["labels"].dtype == torch.int64, 'target labels must of int64 type' + if self.has_keypoint: + assert t["keypoints"].dtype == torch.float32, 'target keypoints must of float type' + if self.training: proposals, matched_idxs, labels, regression_targets = self.select_training_samples(proposals, targets)