550 changes: 550 additions & 0 deletions orby/data/analyse_uground.py

Large diffs are not rendered by default.

45 changes: 40 additions & 5 deletions orby/data/convert_osatlas.py
@@ -70,17 +70,31 @@
PROCESSOR = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)


def get_resized_wh(image):
def to_rgb(pil_image: Image.Image) -> Image.Image:
if pil_image.mode == 'RGBA':
white_background = Image.new("RGB", pil_image.size, (255, 255, 255))
white_background.paste(pil_image, mask=pil_image.split()[3]) # Use alpha channel as mask
return white_background
else:
return pil_image.convert("RGB")

def get_resized_hw(image, max_pixels=None):
"""
Get the resized width and height of the image.
"""

# if max_pixels is not set, use the max pixels of the image processor
if not max_pixels:
print("Max pixels not set, using the max pixels of the image processor", flush=True)
max_pixels = PROCESSOR.image_processor.max_pixels

resized_height, resized_width = smart_resize(
image.height,
image.width,
height=image.height,
width=image.width,
factor=PROCESSOR.image_processor.patch_size
* PROCESSOR.image_processor.merge_size,
min_pixels=PROCESSOR.image_processor.min_pixels,
max_pixels=PROCESSOR.image_processor.max_pixels,
max_pixels=max_pixels,
)

return resized_height, resized_width
@@ -102,6 +116,17 @@ def save_in_chunks(
for dataset_chunk in all_data:
if len(dataset_chunk) == 0:
continue

# Remove width and height columns if they exist
columns_to_remove = []
if "width" in dataset_chunk.column_names:
columns_to_remove.append("width")
if "height" in dataset_chunk.column_names:
columns_to_remove.append("height")

if columns_to_remove:
dataset_chunk = dataset_chunk.remove_columns(columns_to_remove)
print(f"Removed columns: {columns_to_remove}", flush=True)

# Save the chunk as-is (remove the splitting logic)
output_file = os.path.join(
@@ -245,6 +270,12 @@ def process_in_chunks(dataset, chunk_size):
parser.add_argument(
"--image_dir", default="/root/data/os_atlas/desktop_domain/merged_images/", help="Path to the directory containing images"
)
parser.add_argument(
"--max_pixels",
type=int,
default=None,
help="Maximum number of pixels in the image",
)


args = parser.parse_args()
@@ -274,7 +305,10 @@ def process_fn(example, idx):
# Get image and resize ratios
if isinstance(image, bytes):
image = Image.open(io.BytesIO(image))
resized_height, resized_width = get_resized_wh(image)
# Convert image to RGB if it's RGBA
image = to_rgb(image)
# Get the resized width and height of the image.
resized_height, resized_width = get_resized_hw(image, args.max_pixels)


bbox = [
@@ -309,6 +343,7 @@ def process_fn(example, idx):
"index": idx,
"question": instruction,
"bounding_box": bbox,
"max_pixels": args.max_pixels,
},
"response": answer
}
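Note (not part of the diff): a minimal usage sketch of the two helpers this file now defines, `to_rgb` and `get_resized_hw`. It assumes the module is importable as `orby.data.convert_osatlas`; the input path and the 1,000,000-pixel budget are illustrative only.

```python
# Sketch: flatten an RGBA screenshot onto a white background, then compute the
# height/width that smart_resize would produce under an explicit pixel budget.
import io

from PIL import Image

from orby.data.convert_osatlas import get_resized_hw, to_rgb  # added in this PR

with open("screenshot.png", "rb") as f:          # hypothetical input file
    image = Image.open(io.BytesIO(f.read()))

image = to_rgb(image)                            # RGBA -> RGB on a white canvas
resized_height, resized_width = get_resized_hw(image, max_pixels=1_000_000)
print(f"{image.width}x{image.height} -> {resized_width}x{resized_height}")
```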
13 changes: 12 additions & 1 deletion orby/data/convert_screenspot.py
@@ -45,6 +45,14 @@
"macos": "desktop",
}

def to_rgb(pil_image: Image.Image) -> Image.Image:
if pil_image.mode == 'RGBA':
white_background = Image.new("RGB", pil_image.size, (255, 255, 255))
white_background.paste(pil_image, mask=pil_image.split()[3]) # Use alpha channel as mask
return white_background
else:
return pil_image.convert("RGB")


def get_resized_wh(image):
"""
@@ -56,7 +64,7 @@ def get_resized_wh(image):
factor=PROCESSOR.image_processor.patch_size
* PROCESSOR.image_processor.merge_size,
min_pixels=PROCESSOR.image_processor.min_pixels,
max_pixels=PROCESSOR.image_processor.max_pixels,
max_pixels=1e6,  # PROCESSOR.image_processor.max_pixels,
)

return resized_height, resized_width
@@ -100,8 +108,11 @@ def process_fn(example, idx):
# Get image and resize ratios
if isinstance(image, bytes):
image = Image.open(io.BytesIO(image))
image = to_rgb(image)
resized_height, resized_width = get_resized_wh(image)

image = image.resize((resized_width, resized_height))

# Adjust bbox based on resize ratios
bbox = [
bbox[0] * resized_width,
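Note (not part of the diff): this converter now resizes the image to the `smart_resize` dimensions before scaling the box, so both end up in the same coordinate space. A hedged sketch of that ordering, assuming an `(x1, y1, x2, y2)` box normalized to `[0, 1]` as the surrounding multiplications imply; the image path and sizes are made up.

```python
# Sketch: resize the image to the dimensions returned by get_resized_wh, then
# map a [0, 1]-normalized (x1, y1, x2, y2) box into pixels of the resized image.
from PIL import Image

resized_width, resized_height = 1036, 588        # e.g. returned by get_resized_wh
image = Image.open("example.png").convert("RGB") # illustrative input
image = image.resize((resized_width, resized_height))

bbox = [0.10, 0.20, 0.30, 0.40]                  # normalized box
bbox = [
    bbox[0] * resized_width,
    bbox[1] * resized_height,
    bbox[2] * resized_width,
    bbox[3] * resized_height,
]
print(bbox)                                      # [103.6, 117.6, 310.8, 235.2]
```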
15 changes: 13 additions & 2 deletions orby/data/convert_screenspot_pro.py
@@ -66,14 +66,21 @@ def get_resized_ratio(image):
factor=PROCESSOR.image_processor.patch_size
* PROCESSOR.image_processor.merge_size,
min_pixels=PROCESSOR.image_processor.min_pixels,
max_pixels=PROCESSOR.image_processor.max_pixels,
max_pixels=1e6,  # PROCESSOR.image_processor.max_pixels,
)

height_ratio = resized_height / image.height
width_ratio = resized_width / image.width

return height_ratio, width_ratio

def to_rgb(pil_image: Image.Image) -> Image.Image:
if pil_image.mode == 'RGBA':
white_background = Image.new("RGB", pil_image.size, (255, 255, 255))
white_background.paste(pil_image, mask=pil_image.split()[3]) # Use alpha channel as mask
return white_background
else:
return pil_image.convert("RGB")

def process_json_file(json_path, image_dir, split, prompt_format="thinking"):
"""
@@ -99,14 +106,18 @@ def process_json_file(json_path, image_dir, split, prompt_format="thinking"):
image = Image.open(img_path)
# Convert PIL Image to bytes
img_byte_arr = io.BytesIO()
image = to_rgb(image)
height_ratio, width_ratio = get_resized_ratio(image)
resized_height, resized_width = image.height * height_ratio, image.width * width_ratio
image = image.resize((int(resized_width), int(resized_height))) # Convert to integers
image.save(img_byte_arr, format=image.format or "PNG")
img_byte_arr = img_byte_arr.getvalue()
except Exception as e:
logging.warning(f"Failed to load image {img_path}: {e}")
continue

# Get image resize ratios
height_ratio, width_ratio = get_resized_ratio(image)


# Adjust bbox based on resize ratios
bbox = example["bbox"]
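Note (not part of the diff): `convert_screenspot_pro.py` works with ratios rather than target sizes, so the image and its pixel-space box are scaled by the same factors. A small sketch of that bookkeeping; the ratios, file name, and coordinates are invented for illustration.

```python
# Sketch: apply identical height/width ratios to the image and to a pixel-space
# (x1, y1, x2, y2) box so the annotation stays aligned with the resized image.
from PIL import Image

height_ratio, width_ratio = 0.5, 0.5             # e.g. from get_resized_ratio
image = Image.open("screenspot_pro_example.png").convert("RGB")  # illustrative
image = image.resize((int(image.width * width_ratio), int(image.height * height_ratio)))

bbox = [1200, 340, 1310, 392]                     # pixels in the original image
bbox = [
    bbox[0] * width_ratio,
    bbox[1] * height_ratio,
    bbox[2] * width_ratio,
    bbox[3] * height_ratio,
]
print(bbox)                                       # [600.0, 170.0, 655.0, 196.0]
```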
47 changes: 41 additions & 6 deletions orby/data/convert_uground.py
@@ -42,18 +42,30 @@
MODEL_PATH = "Qwen/Qwen2.5-VL-7B-Instruct"
PROCESSOR = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)

def to_rgb(pil_image: Image.Image) -> Image.Image:
if pil_image.mode == 'RGBA':
white_background = Image.new("RGB", pil_image.size, (255, 255, 255))
white_background.paste(pil_image, mask=pil_image.split()[3]) # Use alpha channel as mask
return white_background
else:
return pil_image.convert("RGB")

def get_resized_wh(image):
def get_resized_hw(image, max_pixels=None):
"""
Get the resized width and height of the image.
"""
# if max_pixels is not set, use the max pixels of the image processor
if not max_pixels:
print("Max pixels not set, using the max pixels of the image processor", flush=True)
max_pixels = PROCESSOR.image_processor.max_pixels

resized_height, resized_width = smart_resize(
image.height,
image.width,
height=image.height,
width=image.width,
factor=PROCESSOR.image_processor.patch_size
* PROCESSOR.image_processor.merge_size,
min_pixels=PROCESSOR.image_processor.min_pixels,
max_pixels=PROCESSOR.image_processor.max_pixels,
max_pixels=max_pixels,
)

return resized_height, resized_width
@@ -76,6 +88,17 @@ def save_in_chunks(
if len(dataset_chunk) == 0:
continue

# Remove width and height columns if they exist
columns_to_remove = []
if "width" in dataset_chunk.column_names:
columns_to_remove.append("width")
if "height" in dataset_chunk.column_names:
columns_to_remove.append("height")

if columns_to_remove:
dataset_chunk = dataset_chunk.remove_columns(columns_to_remove)
print(f"Removed columns: {columns_to_remove}", flush=True)

# Save the chunk as-is (remove the splitting logic)
output_file = os.path.join(
output_dir, f"{prefix}_part_{file_counter:04d}.parquet"
@@ -165,6 +188,13 @@ def process_in_chunks(streaming_dataset, chunk_size):
help="Maximum number of examples to process (for testing)",
)

parser.add_argument(
"--max_pixels",
type=int,
default=None,
help="Maximum number of pixels in the image",
)


args = parser.parse_args()

@@ -209,9 +239,13 @@ def process_fn(example, idx):
# Get image and resize ratios
if isinstance(image, bytes):
image = Image.open(io.BytesIO(image))
resized_height, resized_width = get_resized_wh(image)
# Convert image to RGB if it's RGBA
image = to_rgb(image)
# Get the resized width and height of the image.
resized_height, resized_width = get_resized_hw(image, args.max_pixels)

# Adjust bbox based on resize ratios. Uground labels range from
image = image.resize((resized_width, resized_height))
# Adjust bbox based on resize ratios. Uground labels range from
# [0, 999]
bbox = [
bbox[0] * resized_width / 1000.0,
@@ -244,6 +278,7 @@ def process_fn(example, idx):
"index": idx,
"question": instruction,
"bounding_box": bbox,
"max_pixels": args.max_pixels,
},
"response": answer
}
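Note (not part of the diff): UGround labels are normalized to a 0–999 grid, so the conversion is label / 1000 × resized dimension, mirroring the arithmetic in `process_fn`. A short worked sketch with invented numbers:

```python
# Sketch: map UGround's 0-999 normalized (x1, y1, x2, y2) labels into pixel
# coordinates of the image after it has been resized by get_resized_hw.
def uground_bbox_to_pixels(bbox, resized_width, resized_height):
    x1, y1, x2, y2 = bbox
    return [
        x1 * resized_width / 1000.0,
        y1 * resized_height / 1000.0,
        x2 * resized_width / 1000.0,
        y2 * resized_height / 1000.0,
    ]


# A label of (100, 250, 400, 300) on an image resized to 1288x728 (hypothetical):
print(uground_bbox_to_pixels([100, 250, 400, 300], 1288, 728))
# -> [128.8, 182.0, 515.2, 218.4]
```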