diff --git a/torchvision/models/detection/faster_rcnn.py b/torchvision/models/detection/faster_rcnn.py
index 057fb2bf221..e5922de3fc8 100644
--- a/torchvision/models/detection/faster_rcnn.py
+++ b/torchvision/models/detection/faster_rcnn.py
@@ -30,11 +30,11 @@ class FasterRCNN(GeneralizedRCNN):
 
     The behavior of the model changes depending if it is in training or evaluation mode.
 
-    During training, the model expects both the input tensors, as well as a targets dictionary,
+    During training, the model expects both the input tensors, as well as a targets (list of dictionary),
     containing:
-        - boxes (Tensor[N, 4]): the ground-truth boxes in [x0, y0, x1, y1] format, with values
+        - boxes (FloatTensor[N, 4]): the ground-truth boxes in [x0, y0, x1, y1] format, with values
           between 0 and H and 0 and W
-        - labels (Tensor[N]): the class label for each ground-truth box
+        - labels (Int64Tensor[N]): the class label for each ground-truth box
 
     The model returns a Dict[Tensor] during training, containing the classification and regression
     losses for both the RPN and the R-CNN.
@@ -42,9 +42,9 @@ class FasterRCNN(GeneralizedRCNN):
     During inference, the model requires only the input tensors, and returns the post-processed
     predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
     follows:
-        - boxes (Tensor[N, 4]): the predicted boxes in [x0, y0, x1, y1] format, with values between
+        - boxes (FloatTensor[N, 4]): the predicted boxes in [x0, y0, x1, y1] format, with values between
           0 and H and 0 and W
-        - labels (Tensor[N]): the predicted labels for each image
+        - labels (Int64Tensor[N]): the predicted labels for each image
         - scores (Tensor[N]): the scores or each prediction
 
     Arguments:
@@ -298,11 +298,11 @@ def fasterrcnn_resnet50_fpn(pretrained=False, progress=True,
 
     The behavior of the model changes depending if it is in training or evaluation mode.
 
-    During training, the model expects both the input tensors, as well as a targets dictionary,
+    During training, the model expects both the input tensors, as well as a targets (list of dictionary),
     containing:
-        - boxes (``Tensor[N, 4]``): the ground-truth boxes in ``[x0, y0, x1, y1]`` format, with values
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x0, y0, x1, y1]`` format, with values
           between ``0`` and ``H`` and ``0`` and ``W``
-        - labels (``Tensor[N]``): the class label for each ground-truth box
+        - labels (``Int64Tensor[N]``): the class label for each ground-truth box
 
     The model returns a ``Dict[Tensor]`` during training, containing the classification and regression
     losses for both the RPN and the R-CNN.
@@ -310,9 +310,9 @@ def fasterrcnn_resnet50_fpn(pretrained=False, progress=True,
     During inference, the model requires only the input tensors, and returns the post-processed
     predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
     follows:
-        - boxes (``Tensor[N, 4]``): the predicted boxes in ``[x0, y0, x1, y1]`` format, with values between
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x0, y0, x1, y1]`` format, with values between
           ``0`` and ``H`` and ``0`` and ``W``
-        - labels (``Tensor[N]``): the predicted labels for each image
+        - labels (``Int64Tensor[N]``): the predicted labels for each image
         - scores (``Tensor[N]``): the scores or each prediction
 
     Example::
diff --git a/torchvision/models/detection/keypoint_rcnn.py b/torchvision/models/detection/keypoint_rcnn.py
index 6cac815f918..a944c023326 100644
--- a/torchvision/models/detection/keypoint_rcnn.py
+++ b/torchvision/models/detection/keypoint_rcnn.py
@@ -24,12 +24,12 @@ class KeypointRCNN(FasterRCNN):
 
     The behavior of the model changes depending if it is in training or evaluation mode.
 
-    During training, the model expects both the input tensors, as well as a targets dictionary,
+    During training, the model expects both the input tensors, as well as a targets (list of dictionary),
     containing:
-        - boxes (Tensor[N, 4]): the ground-truth boxes in [x0, y0, x1, y1] format, with values
+        - boxes (FloatTensor[N, 4]): the ground-truth boxes in [x0, y0, x1, y1] format, with values
           between 0 and H and 0 and W
-        - labels (Tensor[N]): the class label for each ground-truth box
-        - keypoints (Tensor[N, K, 3]): the K keypoints location for each of the N instances, in the
+        - labels (Int64Tensor[N]): the class label for each ground-truth box
+        - keypoints (FloatTensor[N, K, 3]): the K keypoints location for each of the N instances, in the
           format [x, y, visibility], where visibility=0 means that the keypoint is not visible.
 
     The model returns a Dict[Tensor] during training, containing the classification and regression
@@ -38,11 +38,11 @@ class KeypointRCNN(FasterRCNN):
     During inference, the model requires only the input tensors, and returns the post-processed
     predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
     follows:
-        - boxes (Tensor[N, 4]): the predicted boxes in [x0, y0, x1, y1] format, with values between
+        - boxes (FloatTensor[N, 4]): the predicted boxes in [x0, y0, x1, y1] format, with values between
           0 and H and 0 and W
-        - labels (Tensor[N]): the predicted labels for each image
+        - labels (Int64Tensor[N]): the predicted labels for each image
         - scores (Tensor[N]): the scores or each prediction
-        - keypoints (Tensor[N, K, 3]): the locations of the predicted keypoints, in [x, y, v] format.
+        - keypoints (FloatTensor[N, K, 3]): the locations of the predicted keypoints, in [x, y, v] format.
 
     Arguments:
         backbone (nn.Module): the network used to compute the features for the model.
@@ -274,12 +274,12 @@ def keypointrcnn_resnet50_fpn(pretrained=False, progress=True,
 
     The behavior of the model changes depending if it is in training or evaluation mode.
 
-    During training, the model expects both the input tensors, as well as a targets dictionary,
+    During training, the model expects both the input tensors, as well as a targets (list of dictionary),
     containing:
-        - boxes (``Tensor[N, 4]``): the ground-truth boxes in ``[x0, y0, x1, y1]`` format, with values
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x0, y0, x1, y1]`` format, with values
           between ``0`` and ``H`` and ``0`` and ``W``
-        - labels (``Tensor[N]``): the class label for each ground-truth box
-        - keypoints (``Tensor[N, K, 3]``): the ``K`` keypoints location for each of the ``N`` instances, in the
+        - labels (``Int64Tensor[N]``): the class label for each ground-truth box
+        - keypoints (``FloatTensor[N, K, 3]``): the ``K`` keypoints location for each of the ``N`` instances, in the
           format ``[x, y, visibility]``, where ``visibility=0`` means that the keypoint is not visible.
 
     The model returns a ``Dict[Tensor]`` during training, containing the classification and regression
@@ -288,11 +288,11 @@ def keypointrcnn_resnet50_fpn(pretrained=False, progress=True,
     During inference, the model requires only the input tensors, and returns the post-processed
     predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
     follows:
-        - boxes (``Tensor[N, 4]``): the predicted boxes in ``[x0, y0, x1, y1]`` format, with values between
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x0, y0, x1, y1]`` format, with values between
           ``0`` and ``H`` and ``0`` and ``W``
-        - labels (``Tensor[N]``): the predicted labels for each image
+        - labels (``Int64Tensor[N]``): the predicted labels for each image
         - scores (``Tensor[N]``): the scores or each prediction
-        - keypoints (``Tensor[N, K, 3]``): the locations of the predicted keypoints, in ``[x, y, v]`` format.
+        - keypoints (``FloatTensor[N, K, 3]``): the locations of the predicted keypoints, in ``[x, y, v]`` format.
 
     Example::
 
diff --git a/torchvision/models/detection/mask_rcnn.py b/torchvision/models/detection/mask_rcnn.py
index f992b49fdc4..c311526fe13 100644
--- a/torchvision/models/detection/mask_rcnn.py
+++ b/torchvision/models/detection/mask_rcnn.py
@@ -26,12 +26,12 @@ class MaskRCNN(FasterRCNN):
 
     The behavior of the model changes depending if it is in training or evaluation mode.
 
-    During training, the model expects both the input tensors, as well as a targets dictionary,
+    During training, the model expects both the input tensors, as well as a targets (list of dictionary),
     containing:
-        - boxes (Tensor[N, 4]): the ground-truth boxes in [x0, y0, x1, y1] format, with values
+        - boxes (FloatTensor[N, 4]): the ground-truth boxes in [x0, y0, x1, y1] format, with values
           between 0 and H and 0 and W
-        - labels (Tensor[N]): the class label for each ground-truth box
-        - masks (Tensor[N, 1, H, W]): the segmentation binary masks for each instance
+        - labels (Int64Tensor[N]): the class label for each ground-truth box
+        - masks (UInt8Tensor[N, 1, H, W]): the segmentation binary masks for each instance
 
     The model returns a Dict[Tensor] during training, containing the classification and regression
     losses for both the RPN and the R-CNN, and the mask loss.
@@ -39,11 +39,11 @@ class MaskRCNN(FasterRCNN):
     During inference, the model requires only the input tensors, and returns the post-processed
     predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
     follows:
-        - boxes (Tensor[N, 4]): the predicted boxes in [x0, y0, x1, y1] format, with values between
+        - boxes (FloatTensor[N, 4]): the predicted boxes in [x0, y0, x1, y1] format, with values between
           0 and H and 0 and W
-        - labels (Tensor[N]): the predicted labels for each image
+        - labels (Int64Tensor[N]): the predicted labels for each image
         - scores (Tensor[N]): the scores or each prediction
-        - masks (Tensor[N, 1, H, W]): the predicted masks for each instance, in 0-1 range. In order to
+        - masks (UInt8Tensor[N, 1, H, W]): the predicted masks for each instance, in 0-1 range. In order to
           obtain the final segmentation masks, the soft masks can be thresholded, generally
           with a value of 0.5 (mask >= 0.5)
 
@@ -273,12 +273,12 @@ def maskrcnn_resnet50_fpn(pretrained=False, progress=True,
 
     The behavior of the model changes depending if it is in training or evaluation mode.
 
-    During training, the model expects both the input tensors, as well as a targets dictionary,
+    During training, the model expects both the input tensors, as well as a targets (list of dictionary),
     containing:
-        - boxes (``Tensor[N, 4]``): the ground-truth boxes in ``[x0, y0, x1, y1]`` format, with values
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x0, y0, x1, y1]`` format, with values
           between ``0`` and ``H`` and ``0`` and ``W``
-        - labels (``Tensor[N]``): the class label for each ground-truth box
-        - masks (``Tensor[N, H, W]``): the segmentation binary masks for each instance
+        - labels (``Int64Tensor[N]``): the class label for each ground-truth box
+        - masks (``UInt8Tensor[N, 1, H, W]``): the segmentation binary masks for each instance
 
     The model returns a ``Dict[Tensor]`` during training, containing the classification and regression
     losses for both the RPN and the R-CNN, and the mask loss.
@@ -286,11 +286,11 @@ def maskrcnn_resnet50_fpn(pretrained=False, progress=True,
     During inference, the model requires only the input tensors, and returns the post-processed
     predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
     follows:
-        - boxes (``Tensor[N, 4]``): the predicted boxes in ``[x0, y0, x1, y1]`` format, with values between
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x0, y0, x1, y1]`` format, with values between
           ``0`` and ``H`` and ``0`` and ``W``
-        - labels (``Tensor[N]``): the predicted labels for each image
+        - labels (``Int64Tensor[N]``): the predicted labels for each image
         - scores (``Tensor[N]``): the scores or each prediction
-        - masks (``Tensor[N, H, W]``): the predicted masks for each instance, in ``0-1`` range. In order to
+        - masks (``UInt8Tensor[N, 1, H, W]``): the predicted masks for each instance, in ``0-1`` range. In order to
           obtain the final segmentation masks, the soft masks can be thresholded, generally
           with a value of 0.5 (``mask >= 0.5``)
 
diff --git a/torchvision/models/detection/roi_heads.py b/torchvision/models/detection/roi_heads.py
index 771d3d591ab..beea0062f3a 100644
--- a/torchvision/models/detection/roi_heads.py
+++ b/torchvision/models/detection/roi_heads.py
@@ -17,6 +17,8 @@ def fastrcnn_loss(class_logits, box_regression, labels, regression_targets):
     Arguments:
         class_logits (Tensor)
         box_regression (Tensor)
+        labels (list[BoxList])
+        regression_targets (Tensor)
 
     Returns:
         classification_loss (Tensor)
@@ -55,7 +57,7 @@ def maskrcnn_inference(x, labels):
 
     Arguments:
         x (Tensor): the mask logits
-        boxes (list[BoxList]): bounding boxes that are used as
+        labels (list[BoxList]): bounding boxes that are used as
             reference, one for ech image
 
     Returns:
@@ -250,7 +252,7 @@ def keypointrcnn_inference(x, boxes):
 
 # the next two functions should be merged inside Masker
 # but are kept here for the moment while we need them
-# temporarily gor paste_mask_in_image
+# temporarily for paste_mask_in_image
 def expand_boxes(boxes, scale):
     w_half = (boxes[:, 2] - boxes[:, 0]) * .5
     h_half = (boxes[:, 3] - boxes[:, 1]) * .5
@@ -525,6 +527,13 @@ def forward(self, features, proposals, image_shapes, targets=None):
             image_shapes (List[Tuple[H, W]])
             targets (List[Dict])
         """
+        if targets is not None:
+            for t in targets:
+                assert t["boxes"].dtype.is_floating_point, 'target boxes must of float type'
+                assert t["labels"].dtype == torch.int64, 'target labels must of int64 type'
+                if self.has_keypoint:
+                    assert t["keypoints"].dtype == torch.float32, 'target keypoints must of float type'
+
         if self.training:
             proposals, matched_idxs, labels, regression_targets = self.select_training_samples(proposals, targets)