PaddlePaddle · heavengate · Apr 26, 2019 · Apr 22, 2019 · Apr 22, 2019 · Apr 22, 2019
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
@@ -358,7 +358,7 @@ paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes',
 paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '587845f60c5d97ffdf2dfd21da52eca1'))
 paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e'))
 paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b'))
-paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gt_box', 'gt_label', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gt_score', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', '4d170807a13d33925d1049d2892832bf'))
+paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gt_box', 'gt_label', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gt_score', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', 'eb62b1ff7cc981f3483a62321a491f2e'))
 paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5566169a5ab993d177792c023c7fb340'))
 paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e'))
 paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0'))

diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc
@@ -202,13 +202,13 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
          the max IoU should be 1, and if the anchor box has IoU bigger than ignore 
          thresh, the confidence score loss of this anchor box will be ignored.
 
-         Therefore, the yolov3 loss consist of three major parts, box location loss,
-         confidence score loss, and classification loss. The L1 loss is used for 
-         box coordinates (w, h), and sigmoid cross entropy loss is used for box 
-         coordinates (x, y), confidence score loss and classification loss.
+         Therefore, the yolov3 loss consists of three major parts: box location loss,
+         objectness loss and classification loss. The L1 loss is used for 
+         box coordinates (w, h), and sigmoid cross entropy loss is used for box 
+         coordinates (x, y), objectness loss and classification loss.
 
-         Each groud truth box find a best matching anchor box in all anchors, 
-         prediction of this anchor box will incur all three parts of losses, and
+         Each ground truth box finds a best matching anchor box in all anchors.
+         Prediction of this anchor box will incur all three parts of losses, and
          prediction of anchor boxes with no GT box matched will only incur objectness
          loss.
 

diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
@@ -525,8 +525,10 @@ def yolov3_loss(x,
     Args:
         x (Variable): ${x_comment}
         gt_box (Variable): groud truth boxes, should be in shape of [N, B, 4],
-                          in the third dimenstion, x, y, w, h should be stored 
-                          and x, y, w, h should be relative value of input image.
+                          in the third dimension, x, y, w, h should be stored.
+                          x, y are the center coordinates of boxes, w, h are the
+                          width and height; x, y, w, h should be divided by
+                          input image height to scale to [0, 1].
                           N is the batch number and B is the max box number in 
                           an image.
         gt_label (Variable): class id of ground truth boxes, shoud be in shape