commit #893 (Closed) · wants to merge 1 commit
9 changes: 7 additions & 2 deletions lib/model/rpn/anchor_target_layer.py
@@ -34,6 +34,11 @@ class _AnchorTargetLayer(nn.Module):
labels and bounding-box regression targets.
"""
def __init__(self, feat_stride, scales, ratios):
'''
:param feat_stride: feature stride of the backbone, e.g. 16
:param scales: anchor scales, e.g. [8, 16, 32]
:param ratios: anchor aspect ratios, e.g. [0.5, 1, 2]
'''
super(_AnchorTargetLayer, self).__init__()

self._feat_stride = feat_stride
@@ -53,7 +58,7 @@ def forward(self, input):
# apply predicted bbox deltas at cell i to each of the 9 anchors
# filter out-of-image anchors

rpn_cls_score = input[0] # (B, 18, h, w)
gt_boxes = input[1]
im_info = input[2]
num_boxes = input[3]
@@ -74,7 +79,7 @@ def forward(self, input):
A = self._num_anchors
K = shifts.size(0)

self._anchors = self._anchors.type_as(gt_boxes) # move to specific gpu.
all_anchors = self._anchors.view(1, A, 4) + shifts.view(K, 1, 4)
all_anchors = all_anchors.view(K * A, 4)

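The two all_anchors lines above are the heart of anchor placement: the A base anchors (generated once, centred near the origin) are replicated over the K feature-map cells by adding each cell's (x, y, x, y) pixel shift. A minimal standalone sketch of that broadcast, with toy shapes chosen purely for illustration (they are not taken from the diff):

import torch

A, K = 9, 6                               # 9 base anchors, 6 grid cells (toy value; in the layer K = h*w)
base_anchors = torch.randn(A, 4)          # stand-in for self._anchors, shape (A, 4)
shifts = torch.randn(K, 4)                # stand-in for the per-cell (x, y, x, y) offsets, shape (K, 4)

# (1, A, 4) + (K, 1, 4) broadcasts to (K, A, 4): every cell gets its own copy of all A anchors
all_anchors = base_anchors.view(1, A, 4) + shifts.view(K, 1, 4)
all_anchors = all_anchors.view(K * A, 4)  # flatten to (K*A, 4), matching the layer
print(all_anchors.shape)                  # torch.Size([54, 4])
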
8 changes: 7 additions & 1 deletion lib/model/rpn/bbox_transform.py
@@ -74,7 +74,13 @@ def bbox_transform_batch(ex_rois, gt_rois):

return targets

def bbox_transform_inv(boxes, deltas):
'''
:param boxes: (B, 9*50*38, 4) anchor boxes [x1, y1, x2, y2]
:param deltas: (B, 9*50*38, 4) predicted regression offsets [dx, dy, dw, dh]
:return: pred_boxes (B, 9*50*38, 4) [x1, y1, x2, y2]
'''
widths = boxes[:, :, 2] - boxes[:, :, 0] + 1.0
heights = boxes[:, :, 3] - boxes[:, :, 1] + 1.0
ctr_x = boxes[:, :, 0] + 0.5 * widths
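The hunk above is cut off, but what it sets up is the standard Faster R-CNN box decoding: each (dx, dy, dw, dh) delta shifts the anchor's centre and rescales its width and height, and the result is converted back to corner form. A hedged, self-contained sketch of that decoding as a hypothetical decode_boxes helper (written from the standard formulation, not copied from the rest of the file):

import torch

def decode_boxes(boxes, deltas):
    # boxes, deltas: (B, N, 4); boxes are [x1, y1, x2, y2] anchors, deltas are (dx, dy, dw, dh)
    widths  = boxes[:, :, 2] - boxes[:, :, 0] + 1.0
    heights = boxes[:, :, 3] - boxes[:, :, 1] + 1.0
    ctr_x   = boxes[:, :, 0] + 0.5 * widths
    ctr_y   = boxes[:, :, 1] + 0.5 * heights

    dx, dy, dw, dh = deltas[:, :, 0], deltas[:, :, 1], deltas[:, :, 2], deltas[:, :, 3]
    pred_ctr_x = dx * widths + ctr_x              # shift the centre by a fraction of the anchor size
    pred_ctr_y = dy * heights + ctr_y
    pred_w     = torch.exp(dw) * widths           # rescale width/height in log space
    pred_h     = torch.exp(dh) * heights

    pred = torch.zeros_like(deltas)
    pred[:, :, 0] = pred_ctr_x - 0.5 * pred_w     # x1
    pred[:, :, 1] = pred_ctr_y - 0.5 * pred_h     # y1
    pred[:, :, 2] = pred_ctr_x + 0.5 * pred_w     # x2
    pred[:, :, 3] = pred_ctr_y + 0.5 * pred_h     # y2
    return pred
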
54 changes: 28 additions & 26 deletions lib/model/rpn/generate_anchors.py
@@ -26,40 +26,53 @@
# -79 -167 96 184
# -167 -343 184 360

# array([[ -83., -39., 100., 56.],
# [-175., -87., 192., 104.],
# [-359., -183., 376., 200.],
# [ -55., -55., 72., 72.],
# [-119., -119., 136., 136.],
# [-247., -247., 264., 264.],
# [ -35., -79., 52., 96.],
# [ -79., -167., 96., 184.],
# [-167., -343., 184., 360.]])

try:
xrange # Python 2
except NameError:
xrange = range # Python 3


# scales = np.array([8,16,32]), ratios = np.array([0.5,1,2])
def generate_anchors(base_size=16, ratios=[0.5, 1, 2],
scales=2**np.arange(3, 6)):
"""
Generate anchor (reference) windows by enumerating aspect ratios X
scales wrt a reference (0, 0, 15, 15) window.
"""

base_anchor = np.array([1, 1, base_size, base_size]) - 1 # base_anchor = np.array([0, 0, 15, 15])
ratio_anchors = _ratio_enum(base_anchor, ratios) # ratios = np.array([0.5,1,2])
anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales)
for i in xrange(ratio_anchors.shape[0])])
return anchors

def _ratio_enum(anchor, ratios):
"""
Enumerate a set of anchors for each aspect ratio wrt an anchor.
"""
# anchor = np.array([0, 0, 15, 15]), ratios = np.array([0.5,1,2])
w, h, x_ctr, y_ctr = _whctrs(anchor) # 16, 16, 7.5, 7.5
size = w * h
size_ratios = size / ratios # np.array([512, 256, 128])
ws = np.round(np.sqrt(size_ratios)) # np.array([23, 16, 11])
hs = np.round(ws * ratios) # np.array([12, 16, 22])
anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
return anchors

def _whctrs(anchor):
"""
Return width, height, x center, and y center for an anchor (window).
"""

# anchor = np.array([0, 0, 15, 15])
w = anchor[2] - anchor[0] + 1
h = anchor[3] - anchor[1] + 1
x_ctr = anchor[0] + 0.5 * (w - 1)
@@ -72,26 +85,15 @@ def _mkanchors(ws, hs, x_ctr, y_ctr):
(x_ctr, y_ctr), output a set of anchors (windows).
"""

ws = ws[:, np.newaxis] # np.array([[23], [16], [11]])
hs = hs[:, np.newaxis] # np.array([[12], [16], [22]])
anchors = np.hstack((x_ctr - 0.5 * (ws - 1),
y_ctr - 0.5 * (hs - 1),
x_ctr + 0.5 * (ws - 1),
y_ctr + 0.5 * (hs - 1)))
return anchors


def _scale_enum(anchor, scales):
"""
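Putting the helpers together: with the defaults base_size=16, ratios=[0.5, 1, 2] and scales=[8, 16, 32], generate_anchors enumerates 3 ratios x 3 scales = 9 reference windows, the 9x4 array reproduced in the comment block at the top of the file (the printed values may come out one pixel lower than that comment, which appears to date back to the 1-indexed MATLAB implementation). A quick check script, assuming the module is importable as lib.model.rpn.generate_anchors:

import numpy as np
from lib.model.rpn.generate_anchors import generate_anchors  # import path assumed

anchors = generate_anchors(base_size=16,
                           ratios=np.array([0.5, 1, 2]),
                           scales=np.array([8, 16, 32]))
print(anchors.shape)  # (9, 4): one [x1, y1, x2, y2] row per ratio/scale combination
print(anchors)        # compare with the 9x4 comment block at the top of the file
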
45 changes: 26 additions & 19 deletions lib/model/rpn/proposal_layer.py
@@ -28,7 +28,7 @@ class _ProposalLayer(nn.Module):
Outputs object detection proposals by applying estimated bounding-box
transformations to a set of regular boxes (called "anchors").
"""

# feat_stride = 16, scales = [8,16,32], ratios = [0.5,1,2]
def __init__(self, feat_stride, scales, ratios):
super(_ProposalLayer, self).__init__()

@@ -47,7 +47,14 @@ def __init__(self, feat_stride, scales, ratios):
# top[1].reshape(1, 1, 1, 1)

def forward(self, input):

'''
:param input: (rpn_cls_prob.data, rpn_bbox_pred.data, im_info, cfg_key)
:rpn_cls_prob.data: (B, 18, h, w) anchor bg/fg probabilities
:rpn_bbox_pred.data: (B, 36, h, w) anchor box regression deltas
:im_info: (B, 3) [image height, image width, scale]
:cfg_key: 'TRAIN' or 'TEST'
:return: output rois (B, post_nms_topN, 5) [batch_index, x1, y1, x2, y2]
'''
# Algorithm:
#
# for each (H, W) location i
@@ -64,49 +71,49 @@

# the first set of _num_anchors channels are bg probs
# the second set are the fg probs
scores = input[0][:, self._num_anchors:, :, :] # (B, 9, h, w)
bbox_deltas = input[1]
im_info = input[2]
cfg_key = input[3]

pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N # 6000
post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N # 300
nms_thresh = cfg[cfg_key].RPN_NMS_THRESH # 0.7
min_size = cfg[cfg_key].RPN_MIN_SIZE # 8

batch_size = bbox_deltas.size(0)

feat_height, feat_width = scores.size(2), scores.size(3) # h, w
shift_x = np.arange(0, feat_width) * self._feat_stride
shift_y = np.arange(0, feat_height) * self._feat_stride
shift_x, shift_y = np.meshgrid(shift_x, shift_y)
shifts = torch.from_numpy(np.vstack((shift_x.ravel(), shift_y.ravel(),
shift_x.ravel(), shift_y.ravel())).transpose())
shifts = shifts.contiguous().type_as(scores).float()

A = self._num_anchors # 9
K = shifts.size(0) # number of feature-map cells, e.g. ceil(600/16)*ceil(800/16) = 38*50 = 1900 for a 600x800 input

self._anchors = self._anchors.type_as(scores)
# anchors = self._anchors.view(1, A, 4) + shifts.view(1, K, 4).permute(1, 0, 2).contiguous()
anchors = self._anchors.view(1, A, 4) + shifts.view(K, 1, 4) # broadcast to (K, A, 4)
anchors = anchors.view(1, K * A, 4).expand(batch_size, K * A, 4) # (B, 9*50*38, 4)

# Transpose and reshape predicted bbox transformations to get them
# into the same order as the anchors:

bbox_deltas = bbox_deltas.permute(0, 2, 3, 1).contiguous() # (B, h, w, 36)
bbox_deltas = bbox_deltas.view(batch_size, -1, 4) # (B, h*w*9, 4)

# Same story for the scores:
scores = scores.permute(0, 2, 3, 1).contiguous() # (B, h, w, 9)
scores = scores.view(batch_size, -1) # (B, h*w*9)

# Convert anchors into proposals via bbox transformations
proposals = bbox_transform_inv(anchors, bbox_deltas)

# 2. clip predicted boxes to image
proposals = clip_boxes(proposals, im_info, batch_size) # (B, 9*50*38, 4) [x1, y1, x2, y2]
# proposals = clip_boxes_batch(proposals, im_info, batch_size)

# assign the score to 0 if it is not kept.
@@ -150,7 +157,7 @@ def forward(self, input):

if post_nms_topN > 0:
keep_idx_i = keep_idx_i[:post_nms_topN]
proposals_single = proposals_single[keep_idx_i, :] # (post_nms_topN, 4)
scores_single = scores_single[keep_idx_i, :]

# padding 0 at the end.
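For a 600x800 input the RPN sees a 38x50 feature map, so it scores K*A = 38*50*9 = 17,100 anchors per image; the layer keeps the pre_nms_topN highest-scoring decoded boxes, suppresses overlaps with NMS at nms_thresh, and keeps at most post_nms_topN of the survivors, padding the batch tensor with zeros when fewer remain (as the trailing comment notes). A per-image sketch of that selection as a hypothetical select_proposals helper, using torchvision's NMS (the repository ships its own NMS kernel; this only illustrates the logic):

import torch
from torchvision.ops import nms

def select_proposals(proposals, scores, pre_nms_topN=6000, post_nms_topN=300, nms_thresh=0.7):
    # proposals: (N, 4) decoded, clipped boxes for one image; scores: (N,) foreground scores
    order = scores.argsort(descending=True)
    order = order[:pre_nms_topN]                   # keep the top-scoring boxes before NMS
    proposals, scores = proposals[order], scores[order]

    keep = nms(proposals, scores, nms_thresh)      # indices that survive IoU suppression
    keep = keep[:post_nms_topN]                    # cap the number of proposals
    return proposals[keep], scores[keep]
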
36 changes: 21 additions & 15 deletions lib/model/rpn/rpn.py
@@ -20,20 +20,20 @@ def __init__(self, din):
super(_RPN, self).__init__()

self.din = din # get depth of input feature map, e.g., 512
self.anchor_scales = cfg.ANCHOR_SCALES # [8,16,32]
self.anchor_ratios = cfg.ANCHOR_RATIOS # [0.5,1,2]
self.feat_stride = cfg.FEAT_STRIDE[0] # 16

# define the convrelu layers processing input feature map
self.RPN_Conv = nn.Conv2d(self.din, 512, 3, 1, 1, bias=True)

# define bg(background)/fg(foreground) classification score layer
self.nc_score_out = len(self.anchor_scales) * len(self.anchor_ratios) * 2 # 18 = 9 (anchors) * 2 (bg/fg)
self.RPN_cls_score = nn.Conv2d(512, self.nc_score_out, 1, 1, 0) # (B, C, h, w)->(B, 18, h, w)

# define anchor box offset prediction layer
self.nc_bbox_out = len(self.anchor_scales) * len(self.anchor_ratios) * 4 # 36 = 9 (anchors) * 4 (box regression deltas: dx, dy, dw, dh)
self.RPN_bbox_pred = nn.Conv2d(512, self.nc_bbox_out, 1, 1, 0) # (B, C, h, w)->(B, 36, h, w)

# define proposal layer
self.RPN_proposal = _ProposalLayer(self.feat_stride, self.anchor_scales, self.anchor_ratios)
@@ -56,24 +56,30 @@ def reshape(x, d):
return x

def forward(self, base_feat, im_info, gt_boxes, num_boxes):

"""
:param base_feat: torch.size(B,C,h,w)
:param im_info:
:param gt_boxes:
:param num_boxes:
:return:
"""
batch_size = base_feat.size(0)

# return feature map after convrelu layer
rpn_conv1 = F.relu(self.RPN_Conv(base_feat), inplace=True)
# get rpn classification score
rpn_cls_score = self.RPN_cls_score(rpn_conv1) # (B, 18, h, w)

rpn_cls_score_reshape = self.reshape(rpn_cls_score, 2) # (B, 2, 9*h, w)
rpn_cls_prob_reshape = F.softmax(rpn_cls_score_reshape, 1)
rpn_cls_prob = self.reshape(rpn_cls_prob_reshape, self.nc_score_out) # (B, 18, h, w)

# get rpn offsets to the anchor boxes
rpn_bbox_pred = self.RPN_bbox_pred(rpn_conv1) # (B, 36, h, w)

# proposal layer
cfg_key = 'TRAIN' if self.training else 'TEST'

# rois: (B, post_nms_topN, 5) 5: [B_index, x1, y1, x2, y2] after NMS
rois = self.RPN_proposal((rpn_cls_prob.data, rpn_bbox_pred.data,
im_info, cfg_key))

@@ -87,7 +93,7 @@ def forward(self, base_feat, im_info, gt_boxes, num_boxes):
rpn_data = self.RPN_anchor_target((rpn_cls_score.data, gt_boxes, im_info, num_boxes))

# compute classification loss
rpn_cls_score = rpn_cls_score_reshape.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, 2) # (B, 9*h*w, 2)
rpn_label = rpn_data[0].view(batch_size, -1)

rpn_keep = Variable(rpn_label.view(-1).ne(-1).nonzero().view(-1))
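The reshape in forward is what lets a 2-way softmax run over the 18 score channels: (B, 18, h, w) is viewed as (B, 2, 9*h, w) so that dim 1 holds the bg/fg pair for each anchor and location, softmax is applied over that dim, and the result is viewed back to (B, 18, h, w). A minimal sketch of the same round trip, with toy shapes and independent of the module:

import torch
import torch.nn.functional as F

B, A, h, w = 2, 9, 38, 50
rpn_cls_score = torch.randn(B, 2 * A, h, w)    # (B, 18, h, w) raw scores

x = rpn_cls_score.view(B, 2, A * h, w)         # (B, 2, 9*h, w): dim 1 is the bg/fg pair
x = F.softmax(x, dim=1)                        # probabilities sum to 1 over bg/fg
rpn_cls_prob = x.view(B, 2 * A, h, w)          # back to (B, 18, h, w)
print(rpn_cls_prob.shape)                      # torch.Size([2, 18, 38, 50])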