550 changes: 550 additions & 0 deletions orby/data/analyse_uground.py

Large diffs are not rendered by default.

45 changes: 40 additions & 5 deletions orby/data/convert_osatlas.py
@@ -70,17 +70,31 @@
PROCESSOR = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)


def get_resized_wh(image):
def to_rgb(pil_image: Image.Image) -> Image.Image:
if pil_image.mode == 'RGBA':
white_background = Image.new("RGB", pil_image.size, (255, 255, 255))
white_background.paste(pil_image, mask=pil_image.split()[3]) # Use alpha channel as mask
return white_background
else:
return pil_image.convert("RGB")

def get_resized_hw(image, max_pixels=None):
"""
Get the resized width and height of the image.
"""

# if max_pixels is not set, use the max pixels of the image processor
if not max_pixels:
print("Max pixels not set, using the max pixels of the image processor", flush=True)
max_pixels = PROCESSOR.image_processor.max_pixels

resized_height, resized_width = smart_resize(
image.height,
image.width,
height=image.height,
width=image.width,
factor=PROCESSOR.image_processor.patch_size
* PROCESSOR.image_processor.merge_size,
min_pixels=PROCESSOR.image_processor.min_pixels,
max_pixels=PROCESSOR.image_processor.max_pixels,
max_pixels=max_pixels,
)

return resized_height, resized_width
@@ -102,6 +116,17 @@ def save_in_chunks(
for dataset_chunk in all_data:
if len(dataset_chunk) == 0:
continue

# Remove width and height columns if they exist
columns_to_remove = []
if "width" in dataset_chunk.column_names:
columns_to_remove.append("width")
if "height" in dataset_chunk.column_names:
columns_to_remove.append("height")

if columns_to_remove:
dataset_chunk = dataset_chunk.remove_columns(columns_to_remove)
print(f"Removed columns: {columns_to_remove}", flush=True)

# Save the chunk as-is (remove the splitting logic)
output_file = os.path.join(
@@ -245,6 +270,12 @@ def process_in_chunks(dataset, chunk_size):
parser.add_argument(
"--image_dir", default="/root/data/os_atlas/desktop_domain/merged_images/", help="Path to the directory containing images"
)
parser.add_argument(
"--max_pixels",
type=int,
default=None,
help="Maximum number of pixels in the image",
)


args = parser.parse_args()
@@ -274,7 +305,10 @@ def process_fn(example, idx):
# Get image and resize ratios
if isinstance(image, bytes):
image = Image.open(io.BytesIO(image))
resized_height, resized_width = get_resized_wh(image)
# Convert image to RGB if it's RGBA
image = to_rgb(image)
# Get the resized width and height of the image.
resized_height, resized_width = get_resized_hw(image, args.max_pixels)


bbox = [
@@ -309,6 +343,7 @@ def process_fn(example, idx):
"index": idx,
"question": instruction,
"bounding_box": bbox,
"max_pixels": args.max_pixels,
},
"response": answer
}
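Note (not part of the diff): a minimal usage sketch of the two helpers this file now defines, `to_rgb` and `get_resized_hw`. It assumes the module is importable as `orby.data.convert_osatlas`; the input path and the 1,000,000-pixel budget are illustrative only.

```python
# Sketch: flatten an RGBA screenshot onto a white background, then compute the
# height/width that smart_resize would produce under an explicit pixel budget.
import io

from PIL import Image

from orby.data.convert_osatlas import get_resized_hw, to_rgb  # added in this PR

with open("screenshot.png", "rb") as f:          # hypothetical input file
    image = Image.open(io.BytesIO(f.read()))

image = to_rgb(image)                            # RGBA -> RGB on a white canvas
resized_height, resized_width = get_resized_hw(image, max_pixels=1_000_000)
print(f"{image.width}x{image.height} -> {resized_width}x{resized_height}")
```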
13 changes: 12 additions & 1 deletion orby/data/convert_screenspot.py
@@ -45,6 +45,14 @@
"macos": "desktop",
}

def to_rgb(pil_image: Image.Image) -> Image.Image:
if pil_image.mode == 'RGBA':
white_background = Image.new("RGB", pil_image.size, (255, 255, 255))
white_background.paste(pil_image, mask=pil_image.split()[3]) # Use alpha channel as mask
return white_background
else:
return pil_image.convert("RGB")


def get_resized_wh(image):
"""
@@ -56,7 +64,7 @@ def get_resized_wh(image):
factor=PROCESSOR.image_processor.patch_size
* PROCESSOR.image_processor.merge_size,
min_pixels=PROCESSOR.image_processor.min_pixels,
max_pixels=PROCESSOR.image_processor.max_pixels,
max_pixels=1e6,  # PROCESSOR.image_processor.max_pixels,
)

return resized_height, resized_width
@@ -100,8 +108,11 @@ def process_fn(example, idx):
# Get image and resize ratios
if isinstance(image, bytes):
image = Image.open(io.BytesIO(image))
image = to_rgb(image)
resized_height, resized_width = get_resized_wh(image)

image = image.resize((resized_width, resized_height))

# Adjust bbox based on resize ratios
bbox = [
bbox[0] * resized_width,
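Note (not part of the diff): this converter now resizes the image to the `smart_resize` dimensions before scaling the box, so both end up in the same coordinate space. A hedged sketch of that ordering, assuming an `(x1, y1, x2, y2)` box normalized to `[0, 1]` as the surrounding multiplications imply; the image path and sizes are made up.

```python
# Sketch: resize the image to the dimensions returned by get_resized_wh, then
# map a [0, 1]-normalized (x1, y1, x2, y2) box into pixels of the resized image.
from PIL import Image

resized_width, resized_height = 1036, 588        # e.g. returned by get_resized_wh
image = Image.open("example.png").convert("RGB") # illustrative input
image = image.resize((resized_width, resized_height))

bbox = [0.10, 0.20, 0.30, 0.40]                  # normalized box
bbox = [
    bbox[0] * resized_width,
    bbox[1] * resized_height,
    bbox[2] * resized_width,
    bbox[3] * resized_height,
]
print(bbox)                                      # [103.6, 117.6, 310.8, 235.2]
```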
15 changes: 13 additions & 2 deletions orby/data/convert_screenspot_pro.py
@@ -66,14 +66,21 @@ def get_resized_ratio(image):
factor=PROCESSOR.image_processor.patch_size
* PROCESSOR.image_processor.merge_size,
min_pixels=PROCESSOR.image_processor.min_pixels,
max_pixels=PROCESSOR.image_processor.max_pixels,
max_pixels=1e6,  # PROCESSOR.image_processor.max_pixels,
)

height_ratio = resized_height / image.height
width_ratio = resized_width / image.width

return height_ratio, width_ratio

def to_rgb(pil_image: Image.Image) -> Image.Image:
if pil_image.mode == 'RGBA':
white_background = Image.new("RGB", pil_image.size, (255, 255, 255))
white_background.paste(pil_image, mask=pil_image.split()[3]) # Use alpha channel as mask
return white_background
else:
return pil_image.convert("RGB")

def process_json_file(json_path, image_dir, split, prompt_format="thinking"):
"""
@@ -99,14 +106,18 @@ def process_json_file(json_path, image_dir, split, prompt_format="thinking"):
image = Image.open(img_path)
# Convert PIL Image to bytes
img_byte_arr = io.BytesIO()
image = to_rgb(image)
height_ratio, width_ratio = get_resized_ratio(image)
resized_height, resized_width = image.height * height_ratio, image.width * width_ratio
image = image.resize((int(resized_width), int(resized_height))) # Convert to integers
image.save(img_byte_arr, format=image.format or "PNG")
img_byte_arr = img_byte_arr.getvalue()
except Exception as e:
logging.warning(f"Failed to load image {img_path}: {e}")
continue

# Get image resize ratios
height_ratio, width_ratio = get_resized_ratio(image)


# Adjust bbox based on resize ratios
bbox = example["bbox"]
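Note (not part of the diff): `convert_screenspot_pro.py` works with ratios rather than target sizes, so the image and its pixel-space box are scaled by the same factors. A small sketch of that bookkeeping; the ratios, file name, and coordinates are invented for illustration.

```python
# Sketch: apply identical height/width ratios to the image and to a pixel-space
# (x1, y1, x2, y2) box so the annotation stays aligned with the resized image.
from PIL import Image

height_ratio, width_ratio = 0.5, 0.5             # e.g. from get_resized_ratio
image = Image.open("screenspot_pro_example.png").convert("RGB")  # illustrative
image = image.resize((int(image.width * width_ratio), int(image.height * height_ratio)))

bbox = [1200, 340, 1310, 392]                     # pixels in the original image
bbox = [
    bbox[0] * width_ratio,
    bbox[1] * height_ratio,
    bbox[2] * width_ratio,
    bbox[3] * height_ratio,
]
print(bbox)                                       # [600.0, 170.0, 655.0, 196.0]
```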
47 changes: 41 additions & 6 deletions orby/data/convert_uground.py
@@ -42,18 +42,30 @@
MODEL_PATH = "Qwen/Qwen2.5-VL-7B-Instruct"
PROCESSOR = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)

def to_rgb(pil_image: Image.Image) -> Image.Image:
if pil_image.mode == 'RGBA':
white_background = Image.new("RGB", pil_image.size, (255, 255, 255))
white_background.paste(pil_image, mask=pil_image.split()[3]) # Use alpha channel as mask
return white_background
else:
return pil_image.convert("RGB")

def get_resized_wh(image):
def get_resized_hw(image, max_pixels=None):
"""
Get the resized width and height of the image.
"""
# if max_pixels is not set, use the max pixels of the image processor
if not max_pixels:
print("Max pixels not set, using the max pixels of the image processor", flush=True)
max_pixels = PROCESSOR.image_processor.max_pixels

resized_height, resized_width = smart_resize(
image.height,
image.width,
height=image.height,
width=image.width,
factor=PROCESSOR.image_processor.patch_size
* PROCESSOR.image_processor.merge_size,
min_pixels=PROCESSOR.image_processor.min_pixels,
max_pixels=PROCESSOR.image_processor.max_pixels,
max_pixels=max_pixels,
)

return resized_height, resized_width
@@ -76,6 +88,17 @@ def save_in_chunks(
if len(dataset_chunk) == 0:
continue

# Remove width and height columns if they exist
columns_to_remove = []
if "width" in dataset_chunk.column_names:
columns_to_remove.append("width")
if "height" in dataset_chunk.column_names:
columns_to_remove.append("height")

if columns_to_remove:
dataset_chunk = dataset_chunk.remove_columns(columns_to_remove)
print(f"Removed columns: {columns_to_remove}", flush=True)

# Save the chunk as-is (remove the splitting logic)
output_file = os.path.join(
output_dir, f"{prefix}_part_{file_counter:04d}.parquet"
@@ -165,6 +188,13 @@ def process_in_chunks(streaming_dataset, chunk_size):
help="Maximum number of examples to process (for testing)",
)

parser.add_argument(
"--max_pixels",
type=int,
default=None,
help="Maximum number of pixels in the image",
)


args = parser.parse_args()

@@ -209,9 +239,13 @@ def process_fn(example, idx):
# Get image and resize ratios
if isinstance(image, bytes):
image = Image.open(io.BytesIO(image))
resized_height, resized_width = get_resized_wh(image)
# Convert image to RGB if it's RGBA
image = to_rgb(image)
# Get the resized width and height of the image.
resized_height, resized_width = get_resized_hw(image, args.max_pixels)

# Adjust bbox based on resize ratios. Uground labels range from
image = image.resize((resized_width, resized_height))
# Adjust bbox based on resize ratios. Uground labels range from
# [0, 999]
bbox = [
bbox[0] * resized_width / 1000.0,
@@ -244,6 +278,7 @@ def process_fn(example, idx):
"index": idx,
"question": instruction,
"bounding_box": bbox,
"max_pixels": args.max_pixels,
},
"response": answer
}
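Note (not part of the diff): UGround labels are normalized to a 0–999 grid, so the conversion is label / 1000 × resized dimension, mirroring the arithmetic in `process_fn`. A short worked sketch with invented numbers:

```python
# Sketch: map UGround's 0-999 normalized (x1, y1, x2, y2) labels into pixel
# coordinates of the image after it has been resized by get_resized_hw.
def uground_bbox_to_pixels(bbox, resized_width, resized_height):
    x1, y1, x2, y2 = bbox
    return [
        x1 * resized_width / 1000.0,
        y1 * resized_height / 1000.0,
        x2 * resized_width / 1000.0,
        y2 * resized_height / 1000.0,
    ]


# A label of (100, 250, 400, 300) on an image resized to 1288x728 (hypothetical):
print(uground_bbox_to_pixels([100, 250, 400, 300], 1288, 728))
# -> [128.8, 182.0, 515.2, 218.4]
```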