|
34 | 34 | 'RoIPool', |
35 | 35 | 'psroi_pool', |
36 | 36 | 'PSRoIPool', |
| 37 | + 'roi_align', |
| 38 | + 'RoIAlign', |
37 | 39 | ] |
38 | 40 |
|
39 | 41 |
|
@@ -1138,3 +1140,160 @@ def forward(self, x, boxes, boxes_num): |
def extra_repr(self):
    """Return the layer's configuration as a short string.

    The string reports the ``_output_size`` and ``_spatial_scale``
    attributes stored on the instance.
    """
    return 'output_size={_output_size}, spatial_scale={_spatial_scale}'.format(
        **vars(self))
| 1143 | + |
| 1144 | + |
def roi_align(x,
              boxes,
              boxes_num,
              output_size,
              spatial_scale=1.0,
              sampling_ratio=-1,
              aligned=True,
              name=None):
    """
    This operator implements the roi_align layer.
    Region of Interest (RoI) Align operator (also known as RoI Align) is to
    perform bilinear interpolation on inputs of nonuniform sizes to obtain
    fixed-size feature maps (e.g. 7*7), as described in Mask R-CNN.

    Dividing each region proposal into equal-sized sections with the pooled_width
    and pooled_height. Location remains the origin result.

    In each ROI bin, the value of the four regularly sampled locations are
    computed directly through bilinear interpolation. The output is the mean of
    four locations. Thus avoid the misaligned problem.

    Args:
        x (Tensor): Input feature, 4D-Tensor with the shape of [N,C,H,W],
            where N is the batch size, C is the input channel, H is height,
            W is width. The data type is float32 or float64.
        boxes (Tensor): Boxes (RoIs, Regions of Interest) to pool over. It
            should be a 2-D Tensor of shape (num_boxes, 4). The data type is
            float32 or float64. Given as [[x1, y1, x2, y2], ...], (x1, y1) is
            the top left coordinates, and (x2, y2) is the bottom right coordinates.
        boxes_num (Tensor): The number of boxes contained in each picture in
            the batch, the data type is int32. Must not be None in dygraph
            mode; in static graph mode it is only passed to the operator
            when it is not None.
        output_size (int or Tuple[int, int]): The pooled output size(h, w), data
            type is int32. If int, h and w are both equal to output_size.
        spatial_scale (float32): Multiplicative spatial scale factor to translate
            ROI coords from their input scale to the scale used when pooling.
            Default: 1.0
        sampling_ratio (int32): number of sampling points in the interpolation
            grid used to compute the output value of each pooled output bin.
            If > 0, then exactly ``sampling_ratio x sampling_ratio`` sampling
            points per bin are used.
            If <= 0, then an adaptive number of grid points are used (computed
            as ``ceil(roi_width / output_width)``, and likewise for height).
            Default: -1
        aligned (bool): If False, use the legacy implementation. If True, shift
            the box coordinates by -0.5 pixel for a better alignment with the
            two neighboring pixel indices. This version is used in Detectron2.
            Default: True
        name(str, optional): For detailed information, please refer to
            :ref:`api_guide_Name`. Usually name is no need to set and None by
            default.

    Returns:
        Tensor: The output of ROIAlignOp is a 4-D tensor with shape (num_boxes,
            channels, pooled_h, pooled_w). The data type is float32 or float64.

    Examples:
        .. code-block:: python

            import paddle
            from paddle.vision.ops import roi_align

            data = paddle.rand([1, 256, 32, 32])
            boxes = paddle.rand([3, 4])
            boxes[:, 2] += boxes[:, 0] + 3
            boxes[:, 3] += boxes[:, 1] + 4
            boxes_num = paddle.to_tensor([3]).astype('int32')
            align_out = roi_align(data, boxes, boxes_num, output_size=3)
            assert align_out.shape == [3, 256, 3, 3]
    """

    check_type(output_size, 'output_size', (int, tuple), 'roi_align')
    if isinstance(output_size, int):
        # A single int means a square pooled output.
        output_size = (output_size, output_size)

    pooled_height, pooled_width = output_size
    if in_dygraph_mode():
        # Kept as an assert so callers keep seeing AssertionError here.
        assert boxes_num is not None, "boxes_num should not be None in dygraph mode."
        align_out = core.ops.roi_align(
            x, boxes, boxes_num, "pooled_height", pooled_height, "pooled_width",
            pooled_width, "spatial_scale", spatial_scale, "sampling_ratio",
            sampling_ratio, "aligned", aligned)
        return align_out

    else:
        check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'roi_align')
        check_variable_and_dtype(boxes, 'boxes', ['float32', 'float64'],
                                 'roi_align')
        # NOTE: locals() is forwarded as the helper's kwargs, so no extra
        # local variables should be introduced above this line.
        helper = LayerHelper('roi_align', **locals())
        # The feature tensor is passed as `x`, not the helper's default
        # `input` name, so the lookup key has to be given explicitly for the
        # output dtype to follow the input dtype.
        dtype = helper.input_dtype(input_param_name='x')
        align_out = helper.create_variable_for_type_inference(dtype)
        inputs = {
            "X": x,
            "ROIs": boxes,
        }
        if boxes_num is not None:
            inputs['RoisNum'] = boxes_num
        helper.append_op(
            type="roi_align",
            inputs=inputs,
            outputs={"Out": align_out},
            attrs={
                "pooled_height": pooled_height,
                "pooled_width": pooled_width,
                "spatial_scale": spatial_scale,
                "sampling_ratio": sampling_ratio,
                "aligned": aligned,
            })
        return align_out
| 1253 | + |
| 1254 | + |
class RoIAlign(Layer):
    """
    This interface is used to construct a callable object of the `RoIAlign` class.
    Please refer to :ref:`api_paddle_vision_ops_roi_align`.

    Args:
        output_size (int or tuple[int, int]): The pooled output size(h, w),
            data type is int32. If int, h and w are both equal to output_size.
        spatial_scale (float32, optional): Multiplicative spatial scale factor
            to translate ROI coords from their input scale to the scale used
            when pooling. Default: 1.0

    Returns:
        align_out (Tensor): The output of ROIAlign operator is a 4-D tensor with
            shape (num_boxes, channels, pooled_h, pooled_w).

    Examples:
        .. code-block:: python

            import paddle
            from paddle.vision.ops import RoIAlign

            data = paddle.rand([1, 256, 32, 32])
            boxes = paddle.rand([3, 4])
            boxes[:, 2] += boxes[:, 0] + 3
            boxes[:, 3] += boxes[:, 1] + 4
            boxes_num = paddle.to_tensor([3]).astype('int32')
            roi_align = RoIAlign(output_size=(4, 3))
            align_out = roi_align(data, boxes, boxes_num)
            assert align_out.shape == [3, 256, 4, 3]
    """

    def __init__(self, output_size, spatial_scale=1.0):
        super(RoIAlign, self).__init__()
        self._output_size = output_size
        self._spatial_scale = spatial_scale

    def forward(self, x, boxes, boxes_num, aligned=True):
        """
        Apply ``roi_align`` with the configuration stored on this layer.

        Args:
            x (Tensor): Input feature map, forwarded unchanged to ``roi_align``.
            boxes (Tensor): Boxes to pool over, forwarded unchanged.
            boxes_num (Tensor): Number of boxes per image, forwarded unchanged.
            aligned (bool, optional): Forwarded to ``roi_align``. Default: True

        Returns:
            Tensor: The value returned by ``roi_align``.
        """
        # sampling_ratio is not exposed here, so roi_align's own default
        # is always used.
        return roi_align(
            x=x,
            boxes=boxes,
            boxes_num=boxes_num,
            output_size=self._output_size,
            spatial_scale=self._spatial_scale,
            aligned=aligned)

    def extra_repr(self):
        # Same format as the sibling pooling layer's extra_repr in this module.
        main_str = 'output_size={_output_size}, spatial_scale={_spatial_scale}'
        return main_str.format(**self.__dict__)
0 commit comments