@@ -1,6 +1,7 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import logging
import math
+import numpy as np
from typing import List
import torch
from fvcore.nn import sigmoid_focal_loss_jit, smooth_l1_loss
@@ -77,6 +78,9 @@ def __init__(self, cfg):
        self.topk_candidates = cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST
        self.nms_threshold = cfg.MODEL.RETINANET.NMS_THRESH_TEST
        self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
+        # Vis parameters
+        self.vis_period = cfg.VIS_PERIOD
+        self.input_format = cfg.INPUT.FORMAT
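+        # A VIS_PERIOD of 0 keeps visualization disabled; INPUT.FORMAT is
+        # consulted in visualize_training() to recover an RGB image for drawing.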
        # fmt: on

        self.backbone = build_backbone(cfg)
@@ -108,6 +112,44 @@ def __init__(self, cfg):
        self.loss_normalizer = 100  # initialize with any reasonable #fg that's not too small
        self.loss_normalizer_momentum = 0.9

+    def visualize_training(self, batched_inputs, results):
+        """
+        A function used to visualize ground truth images and final network predictions.
+        It shows the ground truth bounding boxes on the original image, and up to 20
+        of the highest-scoring predicted boxes on the same image.
+
+        Args:
+            batched_inputs (list): a list that contains the inputs to the model.
+            results (List[Instances]): a list of #images elements, one set of
+                predicted instances per image.
+        """
+        from detectron2.utils.visualizer import Visualizer
+
+        assert len(batched_inputs) == len(
+            results
+        ), "Cannot visualize inputs and results of different sizes"
+        storage = get_event_storage()
+        max_boxes = 20
+
+        image_index = 0  # only visualize a single image
+        img = batched_inputs[image_index]["image"].cpu().numpy()
+        assert img.shape[0] == 3, "Images should have 3 channels."
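+        # Visualizer draws on an RGB, channels-last image: flip the channel axis
+        # if the model consumes BGR input, then transpose CHW -> HWC.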
+        if self.input_format == "BGR":
+            img = img[::-1, :, :]
+        img = img.transpose(1, 2, 0)
+        v_gt = Visualizer(img, None)
+        v_gt = v_gt.overlay_instances(boxes=batched_inputs[image_index]["instances"].gt_boxes)
+        anno_img = v_gt.get_image()
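+        # Rescale the predictions to the (height, width) of the image being drawn.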
+        processed_results = detector_postprocess(results[image_index], img.shape[0], img.shape[1])
+        predicted_boxes = processed_results.pred_boxes.tensor.detach().cpu().numpy()
+
+        v_pred = Visualizer(img, None)
+        v_pred = v_pred.overlay_instances(boxes=predicted_boxes[0:max_boxes])
+        prop_img = v_pred.get_image()
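+        # Stack the two renderings vertically (GT on top, predictions below) and
+        # convert HWC -> CHW, which is what EventStorage.put_image() expects.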
+        vis_img = np.vstack((anno_img, prop_img))
+        vis_img = vis_img.transpose(2, 0, 1)
+        vis_name = f"Top: GT bounding boxes; Bottom: {max_boxes} Highest Scoring Results"
+        storage.put_image(vis_name, vis_img)
+
    def forward(self, batched_inputs):
        """
        Args:
@@ -144,7 +186,15 @@ def forward(self, batched_inputs):

        if self.training:
            gt_classes, gt_anchors_reg_deltas = self.get_ground_truth(anchors, gt_instances)
-            return self.losses(gt_classes, gt_anchors_reg_deltas, box_cls, box_delta)
+            losses = self.losses(gt_classes, gt_anchors_reg_deltas, box_cls, box_delta)
+
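+            # Every vis_period iterations, decode the current predictions and
+            # store a stacked GT/prediction image in the EventStorage.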
+            if self.vis_period > 0:
+                storage = get_event_storage()
+                if storage.iter % self.vis_period == 0:
+                    results = self.inference(box_cls, box_delta, anchors, images.image_sizes)
+                    self.visualize_training(batched_inputs, results)
+
+            return losses
        else:
            results = self.inference(box_cls, box_delta, anchors, images.image_sizes)
            processed_results = []