
Commit 16715d4

committed Dec 14, 2024
[Add] data pipeline w/ llm rephrase & geocalib support
1 parent b10a202 commit 16715d4

File tree: 9 files changed, +909 −20 lines
 

‎dataset_pipeline/README.md

+51 −10

@@ -5,14 +5,11 @@
 #### Environment
 
 ```sh
-conda create -n osd_pipeline anaconda python=3.10
+conda create -n osd_pipeline python=3.10 -y
 conda activate osd_pipeline
 
 ##### Install Pytorch according to your own setup #####
-# For example, if you have a GPU with CUDA 12.1
 pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121
-# This is optional if you prefer to system built-in nvcc.
-conda install -c nvidia cuda-toolkit -y
 
 # We use mmengine for config management
 pip install -U openmim
@@ -25,19 +22,35 @@ pip install https://github.com/zju3dv/Wis3D/releases/download/2.0.0/wis3d-2.0.0-
 pip install 'git+https://github.com/facebookresearch/detectron2.git'
 
 # Some other libraries
-pip install iopath pyequilib==0.3.0 albumentations einops
+pip install iopath pyequilib==0.3.0 albumentations einops open3d imageio
 pip install mmcv==2.0.0 -f https://download.openmmlab.com/mmcv/dist/cu116/torch1.13/index.html
-
 ```
 
 #### Install Grounded-SAM package
 
-```
-mkdir external && cd osdsynth/external
+```sh
+mkdir osdsynth/external && cd osdsynth/external
 git clone https://github.com/IDEA-Research/Grounded-Segment-Anything.git
 ```
 
-Follow the instructions on the original [repo](https://github.com/IDEA-Research/Grounded-Segment-Anything#install-without-docker). Our pipeline has been tested with the codebase at this [commit](https://github.com/open-mmlab/mmengine/commit/85c83ba61689907fb1775713622b1b146d82277b). Grounded-SAM codebase at later commits may require some adaptations. If you encounter problems installing the RAM package, try upgrade your `setuptools` version to the latest version.
+Follow the instructions on the original [repo](https://github.com/IDEA-Research/Grounded-Segment-Anything#install-without-docker) to build Segment Anything, Grounding DINO, and RAM, respectively. Our pipeline has been tested with the codebase at this [commit](https://github.com/IDEA-Research/Grounded-Segment-Anything/tree/126abe633ffe333e16e4a0a4e946bc1003caf757).
+
+```sh
+cd Grounded-Segment-Anything/
+
+# Install Segment Anything
+python -m pip install -e segment_anything
+
+# Install Grounding DINO
+pip install --no-build-isolation -e GroundingDINO
+
+# Install RAM
+git clone https://github.com/xinyu1205/recognize-anything.git
+pip install -r ./recognize-anything/requirements.txt
+pip install setuptools --upgrade
+pip install -e ./recognize-anything/
+```
+
 
 #### Install Perspective Fields package
 
@@ -56,10 +69,11 @@ sh ./scripts/download_all_weights.sh
 
 ### Inference
 
+#### Template-based QA
 To specify the folder containing the images for testing, use the `--input` argument. You can also adjust the settings in `configs/v2.py` to better suit your images, like modifying the SAM thresholds or tweaking the DBSCAN hyperparameters.
 
 ```sh
-python run.py --config configs/v2.py --input PATH_TO_INPUT --vis
+python run_template_qa.py --config configs/v2.py --input PATH_TO_INPUT --vis True
 ```
 
 The results are saved in two formats. One is in JSON, where the Open3D bounding boxes are serialized. If you'd like to recreate the Open3D bounding box object for each detection, you can use the following code:
@@ -73,6 +87,33 @@ bbox = o3d.geometry.AxisAlignedBoundingBox(
 
 The other format is compatible with Wis3D point clouds. You can use the instructions below to visualize these results.
 
+
+#### LLM-rephrased QA
+
+**Step 1:** Generate template-based descriptions with the following command; this saves a `llm_prompts.json` file in the output JSON folder.
+
+```sh
+python run_template_facts.py --config configs/v2.py --input PATH_TO_INPUT --vis True
+```
+
+**Step 2:** Prepare a clean environment and install sglang.
+```sh
+conda create -n sglang python=3.10 -y
+conda activate sglang
+
+pip install --upgrade pip
+pip install "sglang[all]"
+
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
+```
+
+**Step 3:** Run the LLM rephrasing; the script currently uses Llama-3.1-70B.
+```sh
+export HF_TOKEN=<key>
+python run_llm.py --llm-prompts-path /PATH/SAMPLE_llm_prompts.json --port 3000 --gpus 8
+
+```
+
 ### Wis3D Visualization
 
 ```sh

‎dataset_pipeline/configs/v2.py

+2

@@ -94,3 +94,5 @@
 wid3d_interval = 1
 
 use_clip = False
+
+perspective_model_variant = "perspective_fields"
New file: +98 lines

@@ -0,0 +1,98 @@
# Class related params
class_set = "ram"
add_bg_classes = False
accumu_classes = False
exp_suffix = None
rm_bg_classes = True

add_classes = []
remove_classes = [
    "room",
    "kitchen",
    "office",
    "house",
    "home",
    "building",
    "corner",
    "shadow",
    "carpet",
    "photo",
    "sea",
    "shade",
    "stall",
    "space",
    "aquarium",
    "apartment",
    "image",
    "city",
    "blue",
    "skylight",
    "hallway",
    "bureau",
    "modern",
    "salon",
    "doorway",
    "wall lamp",
    "scene",
    "sun",
    "sky",
    "smile",
    "cloudy",
    "comfort",
    "white",
    "black",
    "red",
    "green",
    "blue",
    "yellow",
    "purple",
    "pink",
    "stand",
    "wear",
    "area",
    "shine",
    "lay",
    "walk",
    "lead",
    "bite",
    "sing",
]
bg_classes = ["wall", "floor", "ceiling"]

# Sam related params
sam_variant = "sam-hq"

# Tag2text related params
specified_tags = "None"

# Grounding Dino related params
box_threshold = 0.25
text_threshold = 0.2
nms_threshold = 0.5

# LLaVa related params
masking_option = "none"

# Selection criteria on the 2D masks
mask_area_threshold = 25  # masks with a pixel area smaller than this will be skipped
mask_conf_threshold = 0.3  # masks with a lower confidence score will be skipped (default 0.2)
max_bbox_area_ratio = 0.75  # boxes with a larger area ratio than this will be skipped
skip_bg = False
min_points_threshold = 16  # projected and sampled point clouds with fewer points will be skipped
min_points_threshold_after_denoise = 10

# Point cloud processing
downsample_voxel_size = 0.025
dbscan_remove_noise = True
dbscan_eps = 0.2  # v1 used 0.2
dbscan_min_points = 10

# Bounding-box related
spatial_sim_type = "overlap"  # "iou", "giou", "overlap"

save_interval = 1
wid3d_interval = 1

use_clip = False

perspective_model_variant = "geo_calib"
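
Both configs are consumed through mmengine; a minimal usage sketch (not part of the commit) of how the run scripts read these values, assuming mmengine is installed and the configs above are on disk:

```python
# Minimal sketch: Config.fromfile exposes every top-level assignment as an attribute,
# which is how the pipeline scripts pick up e.g. the perspective model variant.
from mmengine import Config

cfg = Config.fromfile("configs/v2.py")
print(cfg.use_clip)                   # False
print(cfg.perspective_model_variant)  # "perspective_fields" here, "geo_calib" in the new config above
```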
New file: +308 lines

@@ -0,0 +1,308 @@
import random
import numpy as np
from itertools import combinations
import json

from osdsynth.processor.instruction_template import *
from osdsynth.processor.prompt_utils import *
from osdsynth.processor.pointcloud import human_like_distance, calculate_distances_between_point_clouds


def left_predicate(A, B):
    true_responses = left_true_responses
    false_responses = left_false_responses

    A_desc, A_cloud = A["caption"], A["pcd"]
    B_desc, B_cloud = B["caption"], B["pcd"]
    A_desc, B_desc = A_desc.lower(), B_desc.lower()

    A_pos = A_cloud.get_center()
    B_pos = B_cloud.get_center()

    is_left = A_pos[0] > B_pos[0]  # Compare X coordinates

    response_template = random.choice(true_responses if is_left else false_responses)
    answer = response_template.replace("[A]", A_desc).replace("[B]", B_desc)

    return answer


def below_predicate(A, B):
    true_responses = below_true_responses
    false_responses = below_false_responses

    A_desc, A_cloud = A["caption"], A["pcd"]
    B_desc, B_cloud = B["caption"], B["pcd"]
    A_desc, B_desc = A_desc.lower(), B_desc.lower()

    A_pos = A_cloud.get_center()
    B_pos = B_cloud.get_center()

    is_below = A_pos[1] < B_pos[1]

    response_template = random.choice(true_responses if is_below else false_responses)

    answer = response_template.replace("[A]", A_desc).replace("[B]", B_desc)

    return answer


def short_predicate(A, B):
    true_responses = short_true_responses
    false_responses = short_false_responses

    A_desc, A_cloud = A["caption"], A["pcd"]
    B_desc, B_cloud = B["caption"], B["pcd"]
    A_desc, B_desc = A_desc.lower(), B_desc.lower()

    height_A = A_cloud.get_axis_aligned_bounding_box().get_extent()[1]
    height_B = B_cloud.get_axis_aligned_bounding_box().get_extent()[1]

    is_shorter = height_A < height_B

    response_template = random.choice(true_responses if is_shorter else false_responses)

    answer = response_template.replace("[A]", A_desc).replace("[B]", B_desc)

    return answer


def thin_predicate(A, B):
    true_responses = thin_true_responses
    false_responses = thin_false_responses

    A_desc, A_cloud = A["caption"], A["pcd"]
    B_desc, B_cloud = B["caption"], B["pcd"]
    A_desc, B_desc = A_desc.lower(), B_desc.lower()

    width_A = A_cloud.get_axis_aligned_bounding_box().get_extent()[0]
    width_B = B_cloud.get_axis_aligned_bounding_box().get_extent()[0]

    is_thinner = width_A < width_B

    response_template = random.choice(true_responses if is_thinner else false_responses)

    answer = response_template.replace("[A]", A_desc).replace("[B]", B_desc)

    return answer


def small_predicate(A, B):
    true_responses = small_true_responses
    false_responses = small_false_responses

    A_desc, A_cloud = A["caption"], A["pcd"]
    B_desc, B_cloud = B["caption"], B["pcd"]
    A_desc, B_desc = A_desc.lower(), B_desc.lower()

    extent_A = A_cloud.get_axis_aligned_bounding_box().get_extent()
    volume_A = extent_A[0] * extent_A[1] * extent_A[2]

    extent_B = B_cloud.get_axis_aligned_bounding_box().get_extent()
    volume_B = extent_B[0] * extent_B[1] * extent_B[2]

    is_smaller = volume_A < volume_B

    response_template = random.choice(true_responses if is_smaller else false_responses)

    answer = response_template.replace("[A]", A_desc).replace("[B]", B_desc)

    return answer


def front_predicate(A, B):
    true_responses = front_true
    false_responses = front_false

    A_desc, A_cloud = A["caption"], A["pcd"]
    B_desc, B_cloud = B["caption"], B["pcd"]
    A_desc, B_desc = A_desc.lower(), B_desc.lower()

    # Calculate the minimum z-value for both A and B
    A_min_z = A_cloud.get_min_bound()[2]
    B_min_z = B_cloud.get_min_bound()[2]
    # Determine if A is in front of B based on the minimum z-value
    is_in_front = A_min_z < B_min_z

    response_template = random.choice(true_responses if is_in_front else false_responses)

    answer = response_template.replace("[A]", A_desc).replace("[B]", B_desc)

    return answer


# Distance prompts


def generate_spatial_reasoning_data(A, B, human_readable_dist, template_answers):
    A_desc, B_desc = A["caption"].lower(), B["caption"].lower()

    answer_template = random.choice(template_answers)

    # Replace placeholders with actual values
    answer = answer_template.replace("[A]", A_desc).replace("[B]", B_desc).replace("[X]", human_readable_dist)

    # Add to the dataset
    return answer


def vertical_distance_data(A, B, use_center=True):
    template_answers = vertical_distance_answers

    # Get the bounding boxes for both A and B
    A_box = A["pcd"].get_axis_aligned_bounding_box()
    B_box = B["pcd"].get_axis_aligned_bounding_box()

    if use_center:
        A_center = A_box.get_axis_aligned_bounding_box().get_center()
        B_center = B_box.get_axis_aligned_bounding_box().get_center()
        vertical_distance = abs(A_center[1] - B_center[1])
    else:
        # Determine the highest and lowest points (in terms of y-value) of each object
        A_min_y, A_max_y = A_box.get_min_bound()[1], A_box.get_max_bound()[1]
        B_min_y, B_max_y = B_box.get_min_bound()[1], B_box.get_max_bound()[1]

        # Make A the higher object; swap if it is the other way around
        if A_min_y < B_min_y:
            # This means B is above A, swap the values
            A_min_y, A_max_y, B_min_y, B_max_y = B_min_y, B_max_y, A_min_y, A_max_y

        # The vertical distance is the gap between the lowest point of the higher object (A_min_y)
        # and the highest point of the lower object (B_max_y); zero if the extents overlap.
        vertical_distance = A_min_y - B_max_y if A_min_y > B_max_y else 0

    human_readable_dist = human_like_distance(vertical_distance)

    return generate_spatial_reasoning_data(A, B, human_readable_dist, template_answers)


def distance(A, B):
    template_answers = distance_template_answers
    distance = calculate_distances_between_point_clouds(A["pcd"], B["pcd"])
    return generate_spatial_reasoning_data(
        A,
        B,
        distance,
        template_answers,
    )


def horizontal_distance_data(A, B, use_center=True):
    template_answers = horizontal_distance_answers

    # Extract bounding boxes for A and B
    A_box = A["pcd"].get_axis_aligned_bounding_box()
    B_box = B["pcd"].get_axis_aligned_bounding_box()

    if use_center:
        A_center = A_box.get_center()
        B_center = B_box.get_center()
        horizontal_distance = np.sqrt((A_center[0] - B_center[0]) ** 2)
    else:
        # Extract min and max bounds for A and B on the x axis
        A_min, A_max = A_box.get_min_bound(), A_box.get_max_bound()
        B_min, B_max = B_box.get_min_bound(), B_box.get_max_bound()

        # Calculate the shortest horizontal (x-axis) distance between the two boxes
        horizontal_distance = max(A_min[0] - B_max[0], B_min[0] - A_max[0], 0)

    human_readable_dist = human_like_distance(horizontal_distance)

    return generate_spatial_reasoning_data(A, B, human_readable_dist, template_answers)


def width_data(A, B=None):
    A_desc = A["caption"].lower()

    template_answers = width_answers

    width = A["pcd"].get_axis_aligned_bounding_box().get_extent()[0]

    human_readable_width = human_like_distance(width)
    answer_template = random.choice(template_answers)

    answer = answer_template.replace("[A]", A_desc).replace("[X]", human_readable_width)

    return answer


def height_data(A, B=None):
    A_desc = A["caption"].lower()

    template_answers = height_answers

    height = A["pcd"].get_axis_aligned_bounding_box().get_extent()[1]

    human_readable_height = human_like_distance(height)
    answer_template = random.choice(template_answers)

    answer = answer_template.replace("[A]", A_desc).replace("[X]", human_readable_height)

    return answer


def direction(A, B):
    template_responses = direction_responses

    A_desc, A_cloud = A["caption"], A["pcd"]
    B_desc, B_cloud = B["caption"], B["pcd"]
    A_desc, B_desc = A_desc.lower(), B_desc.lower()

    A_pos = (A_cloud.get_center()[0], A_cloud.get_center()[2])  # Only x, z
    B_pos = (B_cloud.get_center()[0], B_cloud.get_center()[2])  # Only x, z

    clock_position = calculate_angle_clockwise(A_pos, B_pos)

    answer_template = random.choice(template_responses)

    answer = answer_template.replace("[X]", str(int(clock_position))).replace("[A]", A_desc).replace("[B]", B_desc)

    return answer


class PromptGenerator:
    def __init__(self, cfg, logger, device):
        """Initialize the class."""
        self.cfg = cfg
        self.logger = logger
        self.device = device
        self.vis = True

    def evaluate_predicates_on_pairs(self, detections):

        all_combinations = list(combinations(range(len(detections)), 2))
        random.shuffle(all_combinations)
        selected_combinations = all_combinations[:3]
        object_pairs = [(detections[i], detections[j]) for i, j in selected_combinations]

        all_prompt_variants = [
            # direction,
            left_predicate,
            thin_predicate,
            small_predicate,
            front_predicate,
            below_predicate,
            short_predicate,
            vertical_distance_data,
            horizontal_distance_data,
            width_data,
            height_data,
            distance,
        ]

        results = []

        for A, B in object_pairs:

            to_remove = set()  # A set to hold items to remove

            # Remove all items in `to_remove` from `all_prompt_variants`, if present
            all_prompt_variants = [item for item in all_prompt_variants if item not in to_remove]

            # selected_predicates_choices = all_prompt_variants
            selected_predicates_choices = random.sample(all_prompt_variants, 3)

            for prompt_func in selected_predicates_choices:
                results.append(prompt_func(A, B))

        return results
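
A hypothetical usage sketch of the prompt generator above (not part of the commit), assuming the `osdsynth` package and Open3D are importable. The detection dicts only need a `caption` string and an Open3D point cloud under `pcd`; `cfg`, `logger`, and `device` are stored but not used by `evaluate_predicates_on_pairs`.

```python
# Hypothetical sketch: build two fake detections and sample template facts from them.
import numpy as np
import open3d as o3d

from osdsynth.processor.instruction import PromptGenerator


def fake_detection(caption, center):
    """Return a detection dict with a small random point blob around `center`."""
    pcd = o3d.geometry.PointCloud()
    points = np.random.rand(100, 3) * 0.3 + np.asarray(center)
    pcd.points = o3d.utility.Vector3dVector(points)
    return {"caption": caption, "pcd": pcd}


detections = [
    fake_detection("<region0>", (0.0, 0.0, 2.0)),
    fake_detection("<region1>", (0.8, 0.4, 3.0)),
]

prompter = PromptGenerator(cfg=None, logger=None, device="cpu")
facts = prompter.evaluate_predicates_on_pairs(detections)  # three randomly chosen facts per pair
for fact in facts:
    print(fact)
```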
New file: +115 lines

@@ -0,0 +1,115 @@
direction_responses = [
    "[B] is roughly at [X] o'clock from [A].",
    "From [A], [B] is around the [X] o'clock direction.",
]

height_answers = [
    "The height of [A] is [X].",
    "[A] is [X] tall.",
    "[A] is [X] in height.",
]

width_answers = [
    "The width of [A] is [X].",
    "[A] is [X] wide.",
    "[A] is [X] in width.",
]

horizontal_distance_answers = [
    "[A] and [B] are [X] apart horizontally.",
    "[A] is [X] away from [B] horizontally.",
    "A horizontal distance of [X] exists between [A] and [B].",
    "[A] is [X] from [B] horizontally.",
    "Horizontally, [A] and [B] are [X] apart.",
    "[A] and [B] are [X] apart horizontally from each other.",
    "The horizontal distance of [A] from [B] is [X].",
]

vertical_distance_answers = [
    "[A] and [B] are [X] apart vertically.",
    "[A] is [X] away from [B] vertically.",
    "A vertical distance of [X] exists between [A] and [B].",
    "[A] is [X] from [B] vertically.",
    "[A] and [B] are [X] apart vertically from each other.",
    "Vertically, [A] and [B] are [X] apart.",
    "The vertical distance of [A] from [B] is [X].",
]

front_true = [
    "[A] is closer to the viewer than [B].",
    "[A] is in front of [B].",
]

front_false = [
    "[A] is further from the viewer than [B].",
    "[A] is behind [B].",
]

small_true_responses = [
    "[A] is smaller than [B].",
    "[A] has a smaller size compared to [B].",
    "[A] occupies less space than [B].",
]

small_false_responses = [
    "[A] is bigger than [B].",
    "[A] has a larger size compared to [B].",
    "[A] is larger in size than [B].",
]

thin_true_responses = [
    "[A] is thinner than [B].",
    "[A] has a lesser width compared to [B].",
    "[A]'s width is less than [B]'s.",
]

thin_false_responses = [
    "[A] is wider than [B].",
    "[A]'s width surpasses [B]'s width.",
    "[A]'s width is larger than [B]'s.",
]

short_true_responses = [
    "[A] is shorter than [B].",
    "[A] has a lesser height compared to [B].",
    "[A] is not as tall as [B].",
]

short_false_responses = [
    "[A] is taller than [B].",
    "[A] has a greater height compared to [B].",
    "[A] is much taller than [B].",
]

below_true_responses = [
    "[A] is below [B].",
    "[A] is positioned under [B].",
    "[A] is located below [B].",
]

below_false_responses = [
    "[A] is above [B].",
    "[A] is positioned over [B].",
    "[A] is located above [B].",
]

left_true_responses = [
    "[A] is to the left of [B].",
    "[A] is positioned on the left side of [B].",
    "You'll find [A] to the left of [B].",
]

left_false_responses = [
    "[A] is to the right of [B].",
    "[A] is positioned on the right side of [B].",
    "You'll find [A] to the right of [B].",
]

distance_template_answers = [
    "[A] and [B] are [X] apart.",
    "[A] is [X] away from [B].",
    "A distance of [X] exists between [A] and [B].",
    "[A] is [X] from [B].",
    "[A] and [B] are [X] apart from each other.",
    "The distance of [A] from [B] is [X].",
]

‎dataset_pipeline/osdsynth/processor/pointcloud.py

+30 −10

@@ -29,7 +29,16 @@ def __init__(self, cfg, logger, device, init_models=True):
 
         if init_models:
             # Initialize the perspective_fields_model
-            self.perspective_fields_model = get_perspective_fields_model(cfg, device)
+            if self.cfg.perspective_model_variant == "perspective_fields":
+                print("Using Perspective Fields")
+                self.perspective_fields_model = get_perspective_fields_model(cfg, device)
+            elif self.cfg.perspective_model_variant == "geo_calib":
+                from geocalib import GeoCalib
+
+                print("Using GeoCalib")
+                self.perspective_fields_model = GeoCalib(weights="distorted").to(device)
+            else:
+                raise ValueError(f"perspective_model_variant: {self.cfg.perspective_model_variant} not implemented")
 
             # Initialize the Camera Intrinsics Model
             self.wilde_camera_model = torch.hub.load("ShngJZ/WildCamera", "WildCamera", pretrained=True).to(device)
@@ -45,16 +54,27 @@ def process(self, filename, image_bgr, detections_list):
         image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
         image_rgb_pil = Image.fromarray(image_rgb)
 
-        # Run Perspective Fields, this returns the pitch, roll
-        (
-            vis_perspective_fields,
-            perspective_fields,
-        ) = run_perspective_fields_model(self.perspective_fields_model, image_bgr)
+        if self.cfg.perspective_model_variant == "perspective_fields":
+            # Run Perspective Fields, which returns the pitch and roll
+            (
+                vis_perspective_fields,
+                perspective_fields,
+            ) = run_perspective_fields_model(self.perspective_fields_model, image_bgr)
+            roll, pitch = perspective_fields["roll"], perspective_fields["pitch"]
+
+        elif self.cfg.perspective_model_variant == "geo_calib":
+            from geocalib.utils import rad2deg
+
+            # Load the image as a tensor in range [0, 1] with shape [C, H, W]
+            image_geo = torch.tensor((image_rgb.transpose((2, 0, 1))) / 255.0, dtype=torch.float).to(self.device)
+            geo_results = self.perspective_fields_model.calibrate(image_geo, camera_model="simple_radial")
+            roll, pitch = rad2deg(geo_results["gravity"].rp).unbind(-1)
+            roll, pitch = roll.item(), pitch.item()
 
-        # Perspective Fields to Rotation Matrix
+        # Perspective to Rotation Matrix
         perspective_R = create_rotation_matrix(
-            roll=perspective_fields["roll"],
-            pitch=perspective_fields["pitch"],
+            roll=roll,
+            pitch=pitch,
             yaw=0,
             degrees=True,
         )
@@ -79,7 +99,7 @@ def process(self, filename, image_bgr, detections_list):
 
         if self.vis:
             wis3d = Wis3D(self.cfg.wis3d_folder, filename)
-            # wis3d.add_point_cloud(vertices=pts3d.reshape((-1, 3)), colors=image_rgb.reshape(-1, 3), name="pts3d")
+            wis3d.add_point_cloud(vertices=pts3d.reshape((-1, 3)), colors=image_rgb.reshape(-1, 3), name="pts3d")
             wis3d.add_point_cloud(
                 vertices=cano_pts3d.reshape((-1, 3)), colors=image_rgb.reshape(-1, 3), name="cano_pts3d"
             )
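
For reference, a standalone sketch of the GeoCalib branch added above (illustrative only, assuming the `geocalib` package is installed); it reuses the calls from the diff but feeds a random placeholder tensor instead of a real frame:

```python
# Estimate roll/pitch in degrees from a single RGB tensor with GeoCalib.
import torch
from geocalib import GeoCalib
from geocalib.utils import rad2deg

device = "cuda" if torch.cuda.is_available() else "cpu"
model = GeoCalib(weights="distorted").to(device)

image = torch.rand(3, 480, 640, device=device)  # placeholder RGB image in [0, 1], shape [C, H, W]
results = model.calibrate(image, camera_model="simple_radial")
roll, pitch = rad2deg(results["gravity"].rp).unbind(-1)
print(f"roll={roll.item():.2f} deg, pitch={pitch.item():.2f} deg")
```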

‎dataset_pipeline/run_llm.py

+148

@@ -0,0 +1,148 @@
import argparse
import time
import warnings
import re
import json

from sglang import function, system, gen, set_default_backend, RuntimeEndpoint
from sglang.utils import (
    execute_shell_command,
    wait_for_server,
)

# Suppress all warnings
warnings.filterwarnings("ignore")


response_regex = r"\{" + r' "Question": "[\w\d\s<>?,.!]{1,512}",' + r' "Answer": "[\w\d\s<>?,.!]{1,512}"' + r"\}"


@function
def rephrase_qa(s, question_1):
    s += system(
        r"""
You are a helpful assistant tasked with generating spatial reasoning-based questions and answers from provided descriptions of scenes.
Always craft a question without directly revealing specific details from the description.
Always generate questions related to the description using <regionX>.
The description should always be used to answer and not leak into the question.
When mentioning the objects or regions, use <regionX> instead of the objects or regions.
Speak from the observer's perspective.
Always make sure all the description objects or regions are mentioned with <regionX> in the question.
Only mention each <regionX> once.

Here are several examples:

[Objects]: <region4> sofa, <region1> chair. [Description]: The path between the <region4> and <region1> is 1.5 meters.
"Question": You are a cleaning robot that is 1 meter wide. Now you are standing in a living room and see the image; you want to move from here to the door that leads to the backyard. Do you think I can go through the path between the <region4> and <region1>?
"Answer": The path between <region4> and <region1> is 1.5 meters, so yes, the robot can go through the path between <region4> and <region1> since it is wider than the robot's width.

[Objects]: <region2> apple, <region3> orange. [Description]: <region2> is positioned on the left side of <region3>.
"Question": You see two fruits, an apple in <region2> and an orange in <region3>. Which one is more on the left side?
"Answer": The apple in <region2> is more on the left.

[Objects]: <region3> desk, <region6> bed. [Description]: <region3> is further to the viewer than <region6>.
"Question": You are exploring a bedroom and walking towards <region3> and <region6>. Which one will you reach first?
"Answer": You will reach the bed first because it is closer to you than the desk, which is further away.

[Objects]: <region0> book. [Description]: <region0> is 50 cm in width.
"Question": You are a librarian currently standing in front of a 40 cm width bookshelf, and you see <region0> that you want to place on the shelf. Can you determine if <region0> will fit on the shelf?
"Answer": <region0> is 50 cm in width, so the shelf is not wide enough to hold a book of that size. Please find a larger shelf.

Now it's your turn!

"""
    )
    s += question_1
    s += gen("json_output", max_tokens=1024, regex=response_regex)


def process_prompt(prompt, rephrase_qa, max_retries=5):
    for attempt in range(max_retries):
        try:
            llama_response = rephrase_qa.run(prompt, temperature=0.2)
            response_string = llama_response["json_output"]

            # Clean and parse the response
            cleaned_string = response_string.strip()
            cleaned_string = "".join(char for char in cleaned_string if ord(char) >= 32 or char == "\n")
            cleaned_string = re.sub(r"\s+", " ", cleaned_string)
            cleaned_string = cleaned_string.replace("'", '"')
            json_response = json.loads(cleaned_string)

            question, answer = json_response["Question"], json_response["Answer"]

            # Clean up question/answer
            question = question[2:] if question and question[:2] == ". " else question
            answer = answer[2:] if answer and answer[:2] == ". " else answer

            # Validate region tags
            prompt_tags = {x for x in prompt.split() if x.startswith("<region") and x.endswith(">")}
            question_tags = {x for x in question.split() if x.startswith("<region") and x.endswith(">")}
            answer_tags = {x for x in answer.split() if x.startswith("<region") and x.endswith(">")}

            # Check if all validations pass
            if prompt_tags == question_tags and prompt_tags == answer_tags:
                if all(question.count(tag) == 1 for tag in prompt_tags):
                    print(f"Prompt: {prompt}")
                    print(f"Question: {question}")
                    print(f"Answer: {answer}")
                    print("---------------")
                    return True, question, answer
                else:
                    print(f"Attempt {attempt + 1}: skipping because a <regionX> tag appeared more than once in the question")
            else:
                print(f"Attempt {attempt + 1}: skipping because the <regionX> tags are mismatched between prompt and question/answer")

        except Exception as e:
            print(f"Attempt {attempt + 1} failed with error: {str(e)}")

    print(f"Failed to get valid output after {max_retries} attempts")
    return False, None, None


def main(args):
    """Main function to control the flow of the program."""

    # Launch the sglang backend
    server_process = execute_shell_command(
        f"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-70B-Instruct --port {args.port} --host 0.0.0.0 --tp {args.gpus}"
    )
    wait_for_server(f"http://localhost:{args.port}")
    set_default_backend(RuntimeEndpoint(f"http://localhost:{args.port}"))

    # Read the llm_prompts json
    with open(args.llm_prompts_path, "r") as f:
        llm_prompts = json.load(f)

    conversations = []
    for prompt in llm_prompts:
        success, question, answer = process_prompt(prompt, rephrase_qa)
        if success:
            conversations.append((question, answer))

    for sample in conversations:
        print(f"Q: {sample[0]}")
        print(f"A: {sample[1]}")
        print("-----------------------")


def parse_args():
    """Command-line argument parser."""
    parser = argparse.ArgumentParser(description="Rephrase template-based facts into QA pairs with an LLM.")
    parser.add_argument("--config", default="configs/v2.py", help="Annotation config file path.")
    parser.add_argument("--port", default=3000, help="Port for the sglang server.")
    parser.add_argument("--gpus", default=8, help="Number of GPUs (tensor-parallel size).")
    parser.add_argument(
        "--llm-prompts-path",
        default="./demo_out/20241125_175649/json/indoor_llm_prompts.json",
        help="Path to the llm_prompts JSON file.",
    )

    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    timestamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
    args.timestamp = timestamp
    main(args)
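
Illustrative only: `response_regex` above constrains the model to a single flat JSON object with `Question` and `Answer` fields, which `process_prompt` then parses and validates against the prompt's `<regionX>` tags; a hypothetical, regex-conformant response could look like this:

```python
# Hypothetical model output (made-up objects and region indices).
import json

raw = (
    '{ "Question": "You see a mug in <region1> and a kettle in <region5> nearby, '
    'which one is closer to you?",'
    ' "Answer": "The mug in <region1> is closer to the viewer than the kettle in <region5> is." }'
)
parsed = json.loads(raw)
print(parsed["Question"])
print(parsed["Answer"])
```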
New file: +157 lines

@@ -0,0 +1,157 @@
import argparse
import glob
import os
import random
import time
import warnings
import re
import json

import cv2
import numpy as np
from mmengine import Config
from osdsynth.processor.captions import CaptionImage
from osdsynth.processor.pointcloud import PointCloudReconstruction
from osdsynth.processor.instruction import PromptGenerator

# from osdsynth.processor.filter import FilterImage
from osdsynth.processor.segment import SegmentImage
from osdsynth.utils.logger import SkipImageException, save_detection_list_to_json, setup_logger
from tqdm import tqdm

# Suppress all warnings
warnings.filterwarnings("ignore")


def main(args):
    """Main function to control the flow of the program."""
    # Parse arguments
    cfg = Config.fromfile(args.config)
    exp_name = args.name if args.name else args.timestamp

    # Create log folder
    cfg.log_folder = os.path.join(args.log_dir, exp_name)
    os.makedirs(os.path.abspath(cfg.log_folder), exist_ok=True)

    # Create Wis3D folder
    cfg.vis = args.vis
    cfg.wis3d_folder = os.path.join(args.log_dir, "Wis3D")
    os.makedirs(os.path.abspath(cfg.wis3d_folder), exist_ok=True)

    # Init the logger and log some basic info
    cfg.log_file = os.path.join(cfg.log_folder, f"{exp_name}_{args.timestamp}.log")
    logger = setup_logger()  # cfg.log_file
    logger.info(f"Config:\n{cfg.pretty_text}")

    # Dump config to log
    cfg.dump(os.path.join(cfg.log_folder, os.path.basename(args.config)))

    # Create output folder
    cfg.exp_dir = os.path.join(args.output_dir, exp_name)
    os.makedirs(os.path.abspath(cfg.exp_dir), exist_ok=True)

    # Create folder for output json
    cfg.json_folder = os.path.join(cfg.exp_dir, "json")
    os.makedirs(os.path.abspath(cfg.json_folder), exist_ok=True)

    global_data = glob.glob(f"{args.input}/*.jpg") + glob.glob(f"{args.input}/*.png")
    device = "cuda"

    annotate(cfg, global_data, logger, device)


def annotate(cfg, global_data, logger, device):

    random.shuffle(global_data)

    segmenter = SegmentImage(cfg, logger, device)
    reconstructor = PointCloudReconstruction(cfg, logger, device)
    captioner = CaptionImage(cfg, logger, device)
    prompter = PromptGenerator(cfg, logger, device)

    for i, filepath in tqdm(enumerate(global_data), ncols=25):
        filename = filepath.split("/")[-1].split(".")[0]
        print(f"Processing file: {filename}")

        progress_file_path = os.path.join(cfg.log_folder, f"{filename}.progress")
        if os.path.exists(progress_file_path) and cfg.check_exist:
            continue

        image_bgr = cv2.imread(filepath)
        image_bgr = cv2.resize(image_bgr, (int(640 / (image_bgr.shape[0]) * (image_bgr.shape[1])), 640))

        try:

            # Run the tagging model and get open-world detections
            vis_som, detection_list = segmenter.process(image_bgr)

            # Lift 2D to 3D; 3D bbox information is added to detection_list
            detection_list = reconstructor.process(filename, image_bgr, detection_list)

            # Get a LLaVA local caption for each region; currently a <region> placeholder is used instead
            detection_list = captioner.process_local_caption(detection_list)

            # Save detection list to json
            detection_list_path = os.path.join(cfg.json_folder, f"{filename}.json")
            save_detection_list_to_json(detection_list, detection_list_path)

            # Generate instructions (facts) based on templates
            facts = prompter.evaluate_predicates_on_pairs(detection_list)

            batched_llm_prompts = prepare_llm_prompts(facts, detection_list)

            llm_prompts_path = os.path.join(cfg.json_folder, f"{filename}_llm_prompts.json")
            with open(llm_prompts_path, "w") as f:
                json.dump(batched_llm_prompts, f, indent=2)

            for llm_prompt in batched_llm_prompts:
                print(f"{llm_prompt}")
                print("-----------------------")

        except SkipImageException as e:
            # A skip-image condition was met
            logger.info(f"Skipping processing {filename}: {e}.")
            continue


def prepare_llm_prompts(facts, detection_list):
    region_to_tag_list = []
    batched_instructions = []
    for qa_idx, instruction in enumerate(facts):
        i_regions = re.findall(r"<region(\d+)>", instruction)
        region_to_tag = {int(region): detection_list[int(region)]["class_name"] for region in i_regions}
        region_to_tag_list.append(region_to_tag)

        object_reference = []
        for r_id, (region, tag) in enumerate(region_to_tag.items()):
            object_reference.append(f"<region{region}> {tag}")
        object_reference = ", ".join(object_reference)

        new_instruction = f"[Objects]: {object_reference}. [Description]: {instruction}"
        batched_instructions.append(new_instruction)

    return batched_instructions


def parse_args():
    """Command-line argument parser."""
    parser = argparse.ArgumentParser(description="Generate 3D SceneGraph for an image.")
    parser.add_argument("--config", default="configs/v2.py", help="Annotation config file path.")
    parser.add_argument(
        "--input",
        default="./demo_images",
        help="Path to the input: a JSON file or a folder of images.",
    )
    parser.add_argument("--output-dir", default="./demo_out", help="Path to save the scene-graph JSON files.")
    parser.add_argument("--name", required=False, default=None, help="Experiment name; otherwise the timestamp is used.")
    parser.add_argument("--log-dir", default="./demo_out/log", help="Path to save logs and visualization results.")
    parser.add_argument("--vis", required=False, default=True, help="Wis3D visualization for reconstructed point clouds.")
    parser.add_argument("--overwrite", required=False, action="store_true", help="Overwrite previous results.")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    timestamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
    args.timestamp = timestamp
    main(args)
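
For reference, `prepare_llm_prompts` above writes `{filename}_llm_prompts.json` as a flat JSON list of "[Objects]: ... [Description]: ..." strings, which `run_llm.py` loads and rephrases one by one. A hypothetical two-entry file (object names, regions, and measurements are made up) could be produced like this:

```python
# Hypothetical contents of a *_llm_prompts.json file, mirroring the format built above.
import json

example_llm_prompts = [
    "[Objects]: <region2> chair, <region7> table. "
    "[Description]: <region2> is positioned on the left side of <region7>.",
    "[Objects]: <region0> lamp. [Description]: The height of <region0> is 1.2 meters.",
]

with open("indoor_llm_prompts.json", "w") as f:
    json.dump(example_llm_prompts, f, indent=2)
```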
File renamed without changes.
