threestudio-project · Adamdad · Jan 2, 2024 · DSaurus · Jan 3, 2024 · Adamdad
diff --git a/configs/stable-zero123-diffusers.yaml b/configs/stable-zero123-diffusers.yaml
@@ -0,0 +1,146 @@
+name: "zero123-sai"
+tag: "${data.random_camera.height}_${rmspace:${basename:${data.image_path}},_}"
+exp_root_dir: "outputs"
+seed: 0
+
+data_type: "single-image-datamodule"
+data: # threestudio/data/image.py -> SingleImageDataModuleConfig
+  image_path: ./load/images/hamburger_rgba.png
+  height: [128, 256, 512]
+  width: [128, 256, 512]
+  resolution_milestones: [200, 300]
+  default_elevation_deg: 5.0
+  default_azimuth_deg: 0.0
+  default_camera_distance: 3.8
+  default_fovy_deg: 20.0
+  requires_depth: ${cmaxgt0orcmaxgt0:${system.loss.lambda_depth},${system.loss.lambda_depth_rel}}
+  requires_normal: ${cmaxgt0:${system.loss.lambda_normal}}
+  random_camera: # threestudio/data/uncond.py -> RandomCameraDataModuleConfig
+    height: [64, 128, 256]
+    width: [64, 128, 256]
+    batch_size: [12, 8, 4]
+    resolution_milestones: [200, 300]
+    eval_height: 512
+    eval_width: 512
+    eval_batch_size: 1
+    elevation_range: [-10, 80]
+    azimuth_range: [-180, 180]
+    camera_distance_range: [3.8, 3.8]
+    fovy_range: [20.0, 20.0] # Zero123 has fixed fovy
+    progressive_until: 0
+    camera_perturb: 0.0
+    center_perturb: 0.0
+    up_perturb: 0.0
+    light_position_perturb: 1.0
+    light_distance_range: [7.5, 10.0]
+    eval_elevation_deg: ${data.default_elevation_deg}
+    eval_camera_distance: ${data.default_camera_distance}
+    eval_fovy_deg: ${data.default_fovy_deg}
+    light_sample_strategy: "dreamfusion"
+    batch_uniform_azimuth: False
+    n_val_views: 30
+    n_test_views: 120
+
+system_type: "zero123-system"
+system:
+  geometry_type: "implicit-volume"
+  geometry:
+    radius: 2.0
+    normal_type: "analytic"
+
+    # use the Magic3D-style blob density initialization (density_bias below)
+    density_bias: "blob_magic3d"
+    density_activation: softplus
+    density_blob_scale: 10.
+    density_blob_std: 0.5
+
+    # coarse to fine hash grid encoding
+    # to ensure smooth analytic normals
+    pos_encoding_config:
+      otype: HashGrid
+      n_levels: 16
+      n_features_per_level: 2
+      log2_hashmap_size: 19
+      base_resolution: 16
+      per_level_scale: 1.447269237440378 # max resolution 4096
+    mlp_network_config:
+      otype: "VanillaMLP"
+      activation: "ReLU"
+      output_activation: "none"
+      n_neurons: 64
+      n_hidden_layers: 2
+
+  material_type: "diffuse-with-point-light-material"
+  material:
+    ambient_only_steps: 100000
+    textureless_prob: 0.05
+    albedo_activation: sigmoid
+
+  background_type: "solid-color-background" # unused
+
+  renderer_type: "nerf-volume-renderer"
+  renderer:
+    radius: ${system.geometry.radius}
+    num_samples_per_ray: 512
+    return_comp_normal: ${cmaxgt0:${system.loss.lambda_normal_smooth}}
+    return_normal_perturb: ${cmaxgt0:${system.loss.lambda_3d_normal_smooth}}
+
+  prompt_processor_type: "dummy-prompt-processor" # Zero123 doesn't use prompts
+  prompt_processor:
+    pretrained_model_name_or_path: ""
+    prompt: ""
+
+  guidance_type: "zero123-unified-guidance"
+  guidance:
+    pretrained_model_name_or_path: "ashawkey/stable-zero123-diffusers"
+
+    cond_image_path: ${data.image_path}
+    cond_elevation_deg: ${data.default_elevation_deg}
+    cond_azimuth_deg: ${data.default_azimuth_deg}
+    cond_camera_distance: ${data.default_camera_distance}
+    guidance_scale: 3.0
+    min_step_percent: [50, 0.7, 0.3, 200]  # (start_iter, start_val, end_val, end_iter)
+    max_step_percent: [50, 0.98, 0.8, 200]
+
+  freq:
+    ref_only_steps: 0
+    guidance_eval: 0
+
+  loggers:
+    wandb:
+      enable: false
+      project: "threestudio"
+      name: None
+
+  loss:
+    lambda_sds: 0.1
+    lambda_rgb: [100, 500., 1000., 400]
+    lambda_mask: 50.
+    lambda_depth: 0. # 0.05
+    lambda_depth_rel: 0. # [0, 0, 0.05, 100]
+    lambda_normal: 0. # [0, 0, 0.05, 100]
+    lambda_normal_smooth: [100, 7.0, 5.0, 150, 10.0, 200]
+    lambda_3d_normal_smooth: [100, 7.0, 5.0, 150, 10.0, 200]
+    lambda_orient: 1.0
+    lambda_sparsity: 0.5 # should be tweaked for every model
+    lambda_opaque: 0.5
+
+  optimizer:
+    name: Adam
+    args:
+      lr: 0.01
+      betas: [0.9, 0.99]
+      eps: 1.e-8
+
+trainer:
+  max_steps: 600
+  log_every_n_steps: 1
+  num_sanity_val_steps: 0
+  val_check_interval: 100
+  enable_progress_bar: true
+  precision: 32
+
+checkpoint:
+  save_last: true # save at each validation time
+  save_top_k: -1
+  every_n_train_steps: 100 # alternative: ${trainer.max_steps} to checkpoint only at the end of training
diff --git a/threestudio/models/guidance/zero123_unified_guidance.py b/threestudio/models/guidance/zero123_unified_guidance.py
@@ -82,17 +82,29 @@ class NonTrainableModules:
             torch.float16 if self.cfg.half_precision_weights else torch.float32
         )
 
-        threestudio.info(f"Loading Zero123 ...")
+        self.use_stable_zero123 = 'stable' in self.cfg.pretrained_model_name_or_path
+        if self.use_stable_zero123:
+            # stable-zero123 has a different camera embedding
+            threestudio.info(f"Loaded Stable Zero123!")
+            pipe_kwargs = {
+                "safety_checker": None,
+                "requires_safety_checker": False,
+                "variant": "fp16" if self.cfg.half_precision_weights else None,
+                "torch_dtype": self.weights_dtype,
+            }
+        else:
+            pipe_kwargs = {
+                "safety_checker": None,
+                "requires_safety_checker": False,
+                "trust_remote_code": True,
+                # "variant": "fp16" if self.cfg.half_precision_weights else None,
+                "torch_dtype": self.weights_dtype,
+            }
+            threestudio.info(f"Loading Zero123 ...")
 
         # need to make sure the pipeline file is in path
         sys.path.append("extern/")
 
-        pipe_kwargs = {
-            "safety_checker": None,
-            "requires_safety_checker": False,
-            "variant": "fp16" if self.cfg.half_precision_weights else None,
-            "torch_dtype": self.weights_dtype,
-        }
         pipe = Zero123Pipeline.from_pretrained(
             self.cfg.pretrained_model_name_or_path,
             **pipe_kwargs,
@@ -262,15 +274,31 @@ def get_image_camera_embeddings(
         camera_distances: Float[Tensor, "B"],
     ) -> Float[Tensor, "B 1 D"]:
         batch_size = elevation_deg.shape[0]
-        camera_embeddings: Float[Tensor, "B 1 4"] = torch.stack(
-            [
-                torch.deg2rad(self.cfg.cond_elevation_deg - elevation_deg),
-                torch.sin(torch.deg2rad(azimuth_deg - self.cfg.cond_azimuth_deg)),
-                torch.cos(torch.deg2rad(azimuth_deg - self.cfg.cond_azimuth_deg)),
-                camera_distances - self.cfg.cond_camera_distance,
-            ],
-            dim=-1,
-        )[:, None, :]
+        if self.use_stable_zero123:
+            camera_embeddings: Float[Tensor, "B 1 4"] = torch.stack(
+                [
+                    torch.deg2rad(
+                        (90 - elevation_deg) - (90 - self.cfg.cond_elevation_deg)
+                    ),  # Zero123 polar is 90-elevation
+                    torch.sin(torch.deg2rad(azimuth_deg - self.cfg.cond_azimuth_deg)),
+                    torch.cos(torch.deg2rad(azimuth_deg - self.cfg.cond_azimuth_deg)),
+                    torch.deg2rad(
+                        90 - torch.full_like(elevation_deg, self.cfg.cond_elevation_deg)
+                    ),
+                ],
+                dim=-1,
+            )[:, None, :].to(self.device)
+        else:
+            # original zero123 camera embedding
+            camera_embeddings: Float[Tensor, "B 1 4"] = torch.stack(
+                [
+                    torch.deg2rad(self.cfg.cond_elevation_deg - elevation_deg),
+                    torch.sin(torch.deg2rad(azimuth_deg - self.cfg.cond_azimuth_deg)),
+                    torch.cos(torch.deg2rad(azimuth_deg - self.cfg.cond_azimuth_deg)),
+                    camera_distances - self.cfg.cond_camera_distance,
+                ],
+                dim=-1,
+            )[:, None, :]
 
         image_camera_embeddings = self.pipe.clip_camera_projection(
             torch.cat(
@@ -638,6 +666,7 @@ def forward(
         loss_sd = 0.5 * F.mse_loss(latents, target, reduction="sum") / batch_size
 
         guidance_out = {
+            "loss_sds": loss_sd,
             "loss_sd": loss_sd,
             "grad_norm": grad.norm(),
             "timesteps": t,