Add depth loss for 3D Gaussians

ingra14m · ingra14m · commit 179d7688e958 · 2023-08-01T11:30:35.000+08:00
diff --git a/SIBR_viewers b/SIBR_viewers
@@ -1 +1 @@
-Subproject commit 440bd4c75b2029d0d5f60f6c5859a273e7ae651b
+Subproject commit 29dd2f3a5dc866664b8e0f04bce34dc81e5e6088
diff --git a/scene/cameras.py b/scene/cameras.py
@@ -17,7 +17,7 @@
 class Camera(nn.Module):
     def __init__(self, colmap_id, R, T, FoVx, FoVy, image, gt_alpha_mask,
                  image_name, uid,
-                 trans=np.array([0.0, 0.0, 0.0]), scale=1.0, data_device = "cuda"
+                 trans=np.array([0.0, 0.0, 0.0]), scale=1.0, data_device = "cuda", gt_depth=None
                  ):
         super(Camera, self).__init__()
 
@@ -37,6 +37,7 @@ def __init__(self, colmap_id, R, T, FoVx, FoVy, image, gt_alpha_mask,
             self.data_device = torch.device("cuda")
 
         self.original_image = image.clamp(0.0, 1.0).to(self.data_device)
+        self.depth = gt_depth.to(self.data_device) if gt_depth is not None else None
         self.image_width = self.original_image.shape[2]
         self.image_height = self.original_image.shape[1]
 
diff --git a/scene/dataset_readers.py b/scene/dataset_readers.py
@@ -34,6 +34,7 @@ class CameraInfo(NamedTuple):
     image_name: str
     width: int
     height: int
+    depth: np.array = None  # depth image for this view, or None when no depth map is available
 
 class SceneInfo(NamedTuple):
     point_cloud: BasicPointCloud
@@ -183,10 +184,15 @@ def readCamerasFromTransforms(path, transformsfile, white_background, extension=
         contents = json.load(json_file)
         fovx = contents["camera_angle_x"]
 
+        is_test = 'test' in transformsfile
+
         frames = contents["frames"]
         for idx, frame in enumerate(frames):
             cam_name = os.path.join(path, frame["file_path"] + extension)
 
+            if is_test:
+                depth_name = os.path.join(path, frame["file_path"] + "_depth_0001" + extension)
+
             matrix = np.linalg.inv(np.array(frame["transform_matrix"]))
             R = -np.transpose(matrix[:3,:3])
             R[:,0] = -R[:,0]
@@ -195,6 +201,7 @@ def readCamerasFromTransforms(path, transformsfile, white_background, extension=
             image_path = os.path.join(path, cam_name)
             image_name = Path(cam_name).stem
             image = Image.open(image_path)
+            depth = Image.open(depth_name).convert('RGBA') if is_test else None
 
             im_data = np.array(image.convert("RGBA"))
 
@@ -209,15 +216,15 @@ def readCamerasFromTransforms(path, transformsfile, white_background, extension=
             FovX = fovy
 
             cam_infos.append(CameraInfo(uid=idx, R=R, T=T, FovY=FovY, FovX=FovX, image=image,
-                            image_path=image_path, image_name=image_name, width=image.size[0], height=image.size[1]))
+                            image_path=image_path, image_name=image_name, width=image.size[0], height=image.size[1], depth=depth))
             
     return cam_infos
 
 def readNerfSyntheticInfo(path, white_background, eval, extension=".png"):
     print("Reading Training Transforms")
-    train_cam_infos = readCamerasFromTransforms(path, "transforms_train.json", white_background, extension)
+    train_cam_infos = readCamerasFromTransforms(path, "transforms_test.json", white_background, extension)
     print("Reading Test Transforms")
-    test_cam_infos = readCamerasFromTransforms(path, "transforms_test.json", white_background, extension)
+    test_cam_infos = readCamerasFromTransforms(path, "transforms_train.json", white_background, extension)
     
     if not eval:
         train_cam_infos.extend(test_cam_infos)
diff --git a/submodules/diff-gaussian-rasterization b/submodules/diff-gaussian-rasterization
@@ -1 +1 @@
-Subproject commit fc0cfe904a7870245437d9bfe17f819d9260281d
+Subproject commit 4c6d25016042507dd17a73b5e43c062aa981e90c
diff --git a/train.py b/train.py
@@ -1,4 +1,4 @@
-#
+#
 # Copyright (C) 2023, Inria
 # GRAPHDECO research group, https://team.inria.fr/graphdeco
 # All rights reserved.
@@ -69,16 +69,20 @@ def training(dataset, opt, pipe, testing_iterations, saving_iterations):
         # Pick a random Camera
         if not viewpoint_stack:
             viewpoint_stack = scene.getTrainCameras().copy()
-        viewpoint_cam = viewpoint_stack.pop(randint(0, len(viewpoint_stack)-1))
 
+        viewpoint_cam = viewpoint_stack.pop(randint(0, len(viewpoint_stack)-1))
+        gt_depth = viewpoint_cam.depth.unsqueeze(0) if viewpoint_cam.depth is not None else None  # depth is None for cameras loaded without a depth map
         # Render
         render_pkg = render(viewpoint_cam, gaussians, pipe, background)
         image, viewspace_point_tensor, visibility_filter, radii = render_pkg["render"], render_pkg["viewspace_points"], render_pkg["visibility_filter"], render_pkg["radii"]
+        depth = render_pkg["depth"]

         # Loss
         gt_image = viewpoint_cam.original_image.cuda()
         Ll1 = l1_loss(image, gt_image)
         loss = (1.0 - opt.lambda_dssim) * Ll1 + opt.lambda_dssim * (1.0 - ssim(image, gt_image))
+        depth_loss = l1_loss(depth, gt_depth) * 0.1 if gt_depth is not None else 0.0  # skip depth supervision when no ground-truth depth exists
+        loss = loss + depth_loss
         loss.backward()
 
         iter_end.record()
@@ -199,7 +203,7 @@ def training_report(tb_writer, iteration, Ll1, loss, l1_loss, elapsed, testing_i
     safe_state(args.quiet)
 
     # Start GUI server, configure and run training
-    network_gui.init(args.ip, args.port)
+    # network_gui.init(args.ip, args.port)
     torch.autograd.set_detect_anomaly(args.detect_anomaly)
     training(lp.extract(args), op.extract(args), pp.extract(args), args.test_iterations, args.save_iterations)
 
diff --git a/utils/camera_utils.py b/utils/camera_utils.py
@@ -39,8 +39,16 @@ def loadCam(args, id, cam_info, resolution_scale):
         resolution = (int(orig_w / scale), int(orig_h / scale))
 
     resized_image_rgb = PILtoTorch(cam_info.image, resolution)
+    resized_depth_rgb = PILtoTorch(cam_info.depth, resolution) if cam_info.depth is not None else None
 
     gt_image = resized_image_rgb[:3, ...]
+    if resized_depth_rgb is not None:
+        depth_mask = resized_depth_rgb[3, ...] > 0
+        gt_depth = resized_depth_rgb[0, ...]
+        gt_depth[depth_mask] = 2. + 6. * (1 - gt_depth[depth_mask])
+    else:
+        gt_depth = None
+
     loaded_mask = None
 
     if resized_image_rgb.shape[1] == 4:
@@ -49,7 +57,7 @@ def loadCam(args, id, cam_info, resolution_scale):
     return Camera(colmap_id=cam_info.uid, R=cam_info.R, T=cam_info.T, 
                   FoVx=cam_info.FovX, FoVy=cam_info.FovY, 
                   image=gt_image, gt_alpha_mask=loaded_mask,
-                  image_name=cam_info.image_name, uid=id, data_device=args.data_device)
+                  image_name=cam_info.image_name, uid=id, data_device=args.data_device, gt_depth=gt_depth)
 
 def cameraList_from_camInfos(cam_infos, resolution_scale, args):
     camera_list = []