From e40010cc2bff6c54122d3e57c4f7066674629127 Mon Sep 17 00:00:00 2001 From: liuzh Date: Sun, 27 Apr 2025 21:01:30 +0800 Subject: [PATCH 1/3] add support of CAPability --- .../tasks/capability/_default_template_yaml | 52 ++ lmms_eval/tasks/capability/capability.yaml | 15 + .../tasks/capability/capability_OCR.yaml | 4 + .../tasks/capability/capability_action.yaml | 4 + .../capability/capability_camera_angle.yaml | 4 + .../capability_camera_movement.yaml | 4 + .../capability_character_identification.yaml | 4 + .../capability_dynamic_object_number.yaml | 4 + .../tasks/capability/capability_event.yaml | 4 + .../capability_object_category.yaml | 4 + .../capability/capability_object_color.yaml | 4 + .../capability/capability_object_number.yaml | 4 + .../tasks/capability/capability_scene.yaml | 4 + .../capability_spatial_relation.yaml | 4 + .../tasks/capability/capability_style.yaml | 4 + lmms_eval/tasks/capability/prompt.py | 238 +++++++ lmms_eval/tasks/capability/utils.py | 622 ++++++++++++++++++ 17 files changed, 979 insertions(+) create mode 100644 lmms_eval/tasks/capability/_default_template_yaml create mode 100644 lmms_eval/tasks/capability/capability.yaml create mode 100644 lmms_eval/tasks/capability/capability_OCR.yaml create mode 100644 lmms_eval/tasks/capability/capability_action.yaml create mode 100644 lmms_eval/tasks/capability/capability_camera_angle.yaml create mode 100644 lmms_eval/tasks/capability/capability_camera_movement.yaml create mode 100644 lmms_eval/tasks/capability/capability_character_identification.yaml create mode 100644 lmms_eval/tasks/capability/capability_dynamic_object_number.yaml create mode 100644 lmms_eval/tasks/capability/capability_event.yaml create mode 100644 lmms_eval/tasks/capability/capability_object_category.yaml create mode 100644 lmms_eval/tasks/capability/capability_object_color.yaml create mode 100644 lmms_eval/tasks/capability/capability_object_number.yaml create mode 100644 lmms_eval/tasks/capability/capability_scene.yaml create mode 100644 lmms_eval/tasks/capability/capability_spatial_relation.yaml create mode 100644 lmms_eval/tasks/capability/capability_style.yaml create mode 100644 lmms_eval/tasks/capability/prompt.py create mode 100644 lmms_eval/tasks/capability/utils.py diff --git a/lmms_eval/tasks/capability/_default_template_yaml b/lmms_eval/tasks/capability/_default_template_yaml new file mode 100644 index 00000000..afde0ead --- /dev/null +++ b/lmms_eval/tasks/capability/_default_template_yaml @@ -0,0 +1,52 @@ +dataset_path: lntzm/CAPability +dataset_kwargs: + token: True + cache_dir: capability + video: True + +generation_kwargs: + max_new_tokens: 4096 + temperature: 0 + top_p: 1.0 + num_beams: 1 + do_sample: false + +test_split: test +output_type: generate_until + +lmms_eval_specific_kwargs: + default: + # image_prompt: "Please describe the image in detail." + # video_prompt: "Please describe the video in detail." + image_prompt: "Please describe the image in detail. 
Your description should follow these rules:\na) You should describe each object in the image in detail, including its name, number, color, and spatial relationship between objects.\nb) You should describe the scene of the image.\nc) You should describe the camera angle when shooting this image, such as level angle, high angle, low angle, or dutch angle.\nd) You should describe the style of the image, such as realistic, animated, special-effect, old-fashioned and so on.\ne) If there are any texts in the image, you should describe the text content.\nf) If you know the character in the image, you should tell his or her name.\nDirectly output your detailed description in an elaborate paragraph, instead of itemizing them in list form. Your description: " + video_prompt: "Please describe the video in detail. Your description should follow these rules:\na) You should describe each event in the video in order, especially focusing on the behavior and actions of characters, including people and animals.\nb) You should describe each object in the video in detail, including its name, number, color, and spatial relationship between objects.\nc) You should describe the scene of the video.\nd) You should describe the camera movement when shooting this video, especially the direction, such as pan left, track right, tilt up, boom down, zoom in, dolly out, and so on.\ne) You should describe the style of the video, such as realistic, animated, special-effect, old-fashioned and so on.\nf) If there are any texts in the video, you should describe the text content.\ng) If you know the character in the video, you should tell his or her name.\nDirectly output your detailed description in an elaborate paragraph, instead of itemizing them in list form. Your description: " + +doc_to_visual: !function utils.capability_doc_to_visual +doc_to_text: !function utils.capability_doc_to_text +doc_to_target: "annotation" +# The return value of process_results will be used by metrics +process_results: !function utils.capability_process_results + +metric_list: + - metric: capability_inference_result + aggregation: !function utils.capability_aggregate_inference_result + higher_is_better: null + - metric: capability_precision + aggregation: !function utils.capability_aggregate_precision + higher_is_better: true + - metric: capability_recall + aggregation: !function utils.capability_aggregate_recall + higher_is_better: true + - metric: capability_f1_score + aggregation: !function utils.capability_aggregate_f1score + higher_is_better: true + +metadata: + version: 0.1 + eval_save_root: null + eval_model_name: "gpt-4.1-2025-04-14" + eval_num_process: 20 + eval_max_allow_missing: 5 + eval_max_retry_times: 10 + eval_auto_resume: true + eval_strict_match: false diff --git a/lmms_eval/tasks/capability/capability.yaml b/lmms_eval/tasks/capability/capability.yaml new file mode 100644 index 00000000..726802c4 --- /dev/null +++ b/lmms_eval/tasks/capability/capability.yaml @@ -0,0 +1,15 @@ +group: capability +task: + - capability_object_category + - capability_object_number + - capability_object_color + - capability_spatial_relation + - capability_scene + - capability_camera_angle + - capability_OCR + - capability_style + - capability_character_identification + - capability_dynamic_object_number + - capability_action + - capability_camera_movement + - capability_event diff --git a/lmms_eval/tasks/capability/capability_OCR.yaml b/lmms_eval/tasks/capability/capability_OCR.yaml new file mode 100644 index 00000000..f2f4a99e --- /dev/null +++ 
b/lmms_eval/tasks/capability/capability_OCR.yaml @@ -0,0 +1,4 @@ +include: _default_template_yaml + +task: capability_OCR +dataset_name: OCR diff --git a/lmms_eval/tasks/capability/capability_action.yaml b/lmms_eval/tasks/capability/capability_action.yaml new file mode 100644 index 00000000..9ea7f333 --- /dev/null +++ b/lmms_eval/tasks/capability/capability_action.yaml @@ -0,0 +1,4 @@ +include: _default_template_yaml + +task: capability_action +dataset_name: action diff --git a/lmms_eval/tasks/capability/capability_camera_angle.yaml b/lmms_eval/tasks/capability/capability_camera_angle.yaml new file mode 100644 index 00000000..cda5aba2 --- /dev/null +++ b/lmms_eval/tasks/capability/capability_camera_angle.yaml @@ -0,0 +1,4 @@ +include: _default_template_yaml + +task: capability_camera_angle +dataset_name: camera_angle diff --git a/lmms_eval/tasks/capability/capability_camera_movement.yaml b/lmms_eval/tasks/capability/capability_camera_movement.yaml new file mode 100644 index 00000000..dcf130f3 --- /dev/null +++ b/lmms_eval/tasks/capability/capability_camera_movement.yaml @@ -0,0 +1,4 @@ +include: _default_template_yaml + +task: capability_camera_movement +dataset_name: camera_movement diff --git a/lmms_eval/tasks/capability/capability_character_identification.yaml b/lmms_eval/tasks/capability/capability_character_identification.yaml new file mode 100644 index 00000000..073455c5 --- /dev/null +++ b/lmms_eval/tasks/capability/capability_character_identification.yaml @@ -0,0 +1,4 @@ +include: _default_template_yaml + +task: capability_character_identification +dataset_name: character_identification diff --git a/lmms_eval/tasks/capability/capability_dynamic_object_number.yaml b/lmms_eval/tasks/capability/capability_dynamic_object_number.yaml new file mode 100644 index 00000000..e2137430 --- /dev/null +++ b/lmms_eval/tasks/capability/capability_dynamic_object_number.yaml @@ -0,0 +1,4 @@ +include: _default_template_yaml + +task: capability_dynamic_object_number +dataset_name: dynamic_object_number diff --git a/lmms_eval/tasks/capability/capability_event.yaml b/lmms_eval/tasks/capability/capability_event.yaml new file mode 100644 index 00000000..09e3cb0b --- /dev/null +++ b/lmms_eval/tasks/capability/capability_event.yaml @@ -0,0 +1,4 @@ +include: _default_template_yaml + +task: capability_event +dataset_name: event diff --git a/lmms_eval/tasks/capability/capability_object_category.yaml b/lmms_eval/tasks/capability/capability_object_category.yaml new file mode 100644 index 00000000..3b953de6 --- /dev/null +++ b/lmms_eval/tasks/capability/capability_object_category.yaml @@ -0,0 +1,4 @@ +include: _default_template_yaml + +task: capability_object_category +dataset_name: object_category diff --git a/lmms_eval/tasks/capability/capability_object_color.yaml b/lmms_eval/tasks/capability/capability_object_color.yaml new file mode 100644 index 00000000..11094013 --- /dev/null +++ b/lmms_eval/tasks/capability/capability_object_color.yaml @@ -0,0 +1,4 @@ +include: _default_template_yaml + +task: capability_object_color +dataset_name: object_color diff --git a/lmms_eval/tasks/capability/capability_object_number.yaml b/lmms_eval/tasks/capability/capability_object_number.yaml new file mode 100644 index 00000000..85fed479 --- /dev/null +++ b/lmms_eval/tasks/capability/capability_object_number.yaml @@ -0,0 +1,4 @@ +include: _default_template_yaml + +task: capability_object_number +dataset_name: object_number diff --git a/lmms_eval/tasks/capability/capability_scene.yaml 
b/lmms_eval/tasks/capability/capability_scene.yaml new file mode 100644 index 00000000..4cb3236c --- /dev/null +++ b/lmms_eval/tasks/capability/capability_scene.yaml @@ -0,0 +1,4 @@ +include: _default_template_yaml + +task: capability_scene +dataset_name: scene diff --git a/lmms_eval/tasks/capability/capability_spatial_relation.yaml b/lmms_eval/tasks/capability/capability_spatial_relation.yaml new file mode 100644 index 00000000..302fdb9b --- /dev/null +++ b/lmms_eval/tasks/capability/capability_spatial_relation.yaml @@ -0,0 +1,4 @@ +include: _default_template_yaml + +task: capability_spatial_relation +dataset_name: spatial_relation diff --git a/lmms_eval/tasks/capability/capability_style.yaml b/lmms_eval/tasks/capability/capability_style.yaml new file mode 100644 index 00000000..5f142e4a --- /dev/null +++ b/lmms_eval/tasks/capability/capability_style.yaml @@ -0,0 +1,4 @@ +include: _default_template_yaml + +task: capability_style +dataset_name: style diff --git a/lmms_eval/tasks/capability/prompt.py b/lmms_eval/tasks/capability/prompt.py new file mode 100644 index 00000000..da800668 --- /dev/null +++ b/lmms_eval/tasks/capability/prompt.py @@ -0,0 +1,238 @@ + +class Prompts: + def __init__(self): + self.event_system_prompt = "You are a video analysis expert specializing in evaluating the accuracy of video captions, particularly the descriptions of the events in a video. Please carefully analyze the user-provided caption and compare it to each provided event. Determine whether the caption contains the event." + + self.action_system_prompt = "You are a video analysis expert specializing in evaluating the accuracy of video captions, particularly the descriptions of actions in a video. Please carefully analyze the user-provided caption, compare it to the provided action and complete the task." + + self.object_category_system_prompt = "You are an image analysis expert specializing in evaluating the accuracy of image captions, particularly the descriptions of objects in an image. Please carefully analyze the user-provided caption, compare it to the provided object and complete the task." + + self.object_color_system_prompt = "You are an image analysis expert specializing in evaluating the accuracy of image captions, particularly the descriptions of the color of objects in an image. Please carefully analyze the user-provided caption, compare it to the provided object color and complete the task." + + self.object_number_system_prompt = "You are an image analysis expert specializing in evaluating the accuracy of image captions, particularly the descriptions of the number of objects in an image. Please carefully analyze the user-provided caption, compare it to the provided object number and complete the task." + + self.dynamic_object_number_system_prompt = "You are a video analysis expert specializing in evaluating the accuracy of video captions, particularly the descriptions of the number of objects in a video. Please carefully analyze the user-provided caption, compare it to the provided object number and complete the task." + + self.spatial_relation_system_prompt = "You are an image analysis expert specializing in evaluating the accuracy of image captions, particularly the descriptions of the spatial relationship between objects in an image. Please carefully analyze the user-provided caption, compare it to the provided spatial relationship between objects and complete the task." 
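        # Editor's note (illustration only, not part of the upstream prompt set): each capability
        # dimension pairs its system prompt with a user prompt built by the matching
        # get_*_prompts() method below, and the judge model must reply with a single JSON object.
        # Most dimensions use a {1, 0, -1} score (1 = described correctly, 0 = not mentioned,
        # -1 = described incorrectly); the classification-style dimensions (camera angle,
        # camera movement, style) instead return a "pred" list of categories. A hypothetical
        # verdict for the spatial-relation task could look like:
        #   {"spatial_relation": "the cup is to the left of the laptop", "score": "1",
        #    "reason": "the caption states the cup sits left of the laptop"}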
+ + self.scene_system_prompt = "You are an image analysis expert specializing in evaluating the accuracy of image captions, particularly the descriptions of the scene in an image. Please carefully analyze the user-provided caption, compare it to the provided scene and complete the task." + + self.camera_angle_system_prompt = "You are an image analysis expert specializing in evaluating the accuracy of image captions, particularly the descriptions of camera angle in an image. Please carefully analyze the user-provided caption and complete the classification task." + self.camera_angle_category_explains = [ + "level angle: Horizontal shooting of the subject (flat shot)", + "high angle: Shooting from above the subject (overhead shot)", + "low angle: Shooting from below the subject (upward shot)", + "dutch angle: The lens has a certain angle of deflection along the central axis, making the horizon crooked", + ] + self.camera_angle_categories = [c.split(":")[0] for c in self.camera_angle_category_explains] + + self.camera_movement_system_prompt = "You are a video analysis expert specializing in evaluating the accuracy of video captions, particularly the descriptions of camera movements in the videos. Please carefully analyze the user-provided caption and complete the classification task." + self.camera_movement_category_explains = [ + "left: the camera angle swings left (pan left), or the camera moves left (track left)", + "right: the camera angle swings right (pan right), or the camera moves right (track right)", + "up: the camera angle swings up (tilt up), or the camera moves up (boom up)", + "down: the camera angle swings down (tilt down), or the camera moves down (boom down)", + "in: camera pushes toward the subject (dolly in), or enlarges the frame (zoom in)", + "out: camera moves away the subject (dolly out), or expands the visible area, makeing the subject appear smaller (zoom out)", + "fixed: camera is almost fixed and does not change", + ] + self.camera_movement_categories = [c.split(":")[0] for c in self.camera_movement_category_explains] + + self.OCR_system_prompt = "You are an image analysis expert specializing in evaluating the accuracy of image captions, particularly the descriptions of the OCR texts in an image. Please carefully analyze the user-provided caption, compare it to the provided OCR texts and complete the task." + + self.style_system_prompt = "You are an image analysis expert specializing in evaluating the accuracy of image captions, particularly the descriptions of the image style. Please carefully analyze the user-provided caption and complete the classification task." + self.style_category_explains = [ + "realistic: Represents subjects truthfully with lifelike detail and accuracy.", + "animated: Created using 2D images or 3D computer-generated imagery (CGI), e.g., cartoon, anime", + "special effect: Creates illusions through practical or digital techniques to enhance visuals.", + "old-fashioned: Emulates historical aesthetics like vintage or classical artistic styles.", + "pixel art: Retro digital art using blocky pixels for a nostalgic, low-res look.", + "sketch art: Rough, expressive drawings emphasizing line work and spontaneity.", + "abstract art: Non-representational art focused on shapes, colors, and emotions over realism.", + "impressionism art: Captures fleeting light/moments with visible brushstrokes and vibrant color dabs.", + "cubism art: Depicts subjects through fragmented geometric planes and multiple perspectives." 
+ ] + self.style_categories = [c.split(":")[0] for c in self.style_category_explains] + + self.character_identification_system_prompt = "You are an image analysis expert specializing in evaluating the accuracy of image captions, particularly the descriptions of person/character identification in an image. Please carefully analyze the user-provided caption, compare it to each provided name of the person/character and complete the task." + + def get_prompts_by_task(self, task, caption, anno): + if task == "event": + return self.get_event_prompts(caption, anno) + if task == "action": + return self.get_action_prompts(caption, anno) + if task == "object_category": + return self.get_object_category_prompts(caption, anno) + elif task == "object_number": + return self.get_object_number_prompts(caption, anno) + elif task == "dynamic_object_number": + return self.get_dynamic_object_number_prompts(caption, anno) + elif task == "object_color": + return self.get_object_color_prompts(caption, anno) + elif task == "spatial_relation": + return self.get_spatial_relation_prompts(caption, anno) + elif task == "scene": + return self.get_scene_prompts(caption, anno) + elif task == "camera_angle": + return self.get_camera_angle_prompts(caption) + elif task == "camera_movement": + return self.get_camera_movement_prompts(caption) + elif task == "OCR": + return self.get_OCR_prompts(caption, anno) + elif task == "style": + return self.get_style_prompts(caption) + elif task == "character_identification": + return self.get_character_identification_prompts(caption, anno) + else: + raise ValueError(f"Wrong task type: {task}") + + def get_event_prompts(self, caption, event): + event_user_prompt = "Given a video caption and an event as follows:\n"\ + f"Video Caption: {caption}\n"\ + f"Event: {event}\n"\ + "Please analyze the video caption. Determine whether the provided event is described in the caption, and explain why. Note it can be considered mentioned as long as the caption contains an expression with a similar meaning to the event provided.\n"\ + "Give score of 0 if the caption is totally irrelative to the provided event. Give score of 1 if the caption mentions the provided event correctly. Give score of -1 if the caption mentions the relative event give a wrong description.\n"\ + "Output a JSON formed as:\n"\ + "{\"event\": \"copy provided event here\", \"score\": \"put your score here\", \"reason\": \"give your reason here\"}\n"\ + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only output the JSON. Do not add Markdown syntax. Output:" + return self.event_system_prompt, event_user_prompt + + def get_action_prompts(self, caption, action): + action_user_prompt = "Given a video caption and an action as follows:\n"\ + f"Video Caption: {caption}\n"\ + f"Action: {action}\n"\ + "Please analyze the video caption. Determine whether the provided action is mentioned in the caption, and explain why. Note it can be considered mentioned as long as the caption contains an expression with a similar meaning to the action provided.\n"\ + "Give score of 0 if the caption does not mention ANY actions (including the provided action and any other action description). Give score of 1 if the caption mentions the provided action. Give score of -1 if the provided action is not mentioned in the caption.\n"\ + "Output a JSON formed as:\n"\ + "{\"action\": \"copy provided action here\", \"score\": \"put your score here\", \"reason\": \"give your reason here\"}\n"\ + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. 
Only output the JSON. Do not add Markdown syntax. Output:" + return self.action_system_prompt, action_user_prompt + + def get_object_category_prompts(self, caption, category): + object_category_user_prompt = "Given an image caption and an object as follows:\n"\ + f"Image Caption: {caption}\n"\ + f"Object: {category}\n"\ + "Please analyze the image caption. Determine whether the provided object is mentioned in the caption, and explain why. Note it can be considered mentioned as long as the caption contains an expression with a similar meaning to the object provided.\n"\ + "Give score of 0 if the caption does not mention ANY objects (including the provided object and any other objects). Give score of 1 if the caption mentions the provided object. Give score of -1 if the object is not mentioned in the caption.\n"\ + "Output a JSON formed as:\n"\ + "{\"object_category\": \"copy provided object here\", \"score\": \"put your score here\", \"reason\": \"give your reason here\"}\n"\ + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only output the JSON. Do not add Markdown syntax. Output:" + return self.object_category_system_prompt, object_category_user_prompt + + def get_object_number_prompts(self, caption, number): + object_category, object_number = list(number.items())[0] + object_number_user_prompt = "Given an image caption and the number of an object with format {object: number} as follows:\n"\ + f"Image Caption: {caption}\n"\ + f"Object Number: {{{object_category}: {object_number}}}\n"\ + f"Please analyze the image caption. Determine whether the provided object number is correctly described in the caption, and explain why. You may need to count in the caption to determine how many the provided objects it describes.\n"\ + "Give score of 0 if the caption does not mention the specific number of provided object (including the use of words such as 'some' and 'various' in the caption rather than giving specific numbers) or not mention the provided object. Give score of 1 if the caption counts the provided object correctly. Give score only of -1 if the caption counts the wrong number of the provided object.\n"\ + "Output a JSON formed as:\n"\ + "{\"object_number\": \"copy the provided {object: number} here\", \"score\": \"put your score here\", \"reason\": \"give your reason here\"}\n"\ + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only output the JSON. Do not add Markdown syntax. Output:" + return self.object_number_system_prompt, object_number_user_prompt + + def get_dynamic_object_number_prompts(self, caption, number): + dynamic_object_number_user_prompts = [] + for object_category, object_number in number.items(): + dynamic_object_number_user_prompt = "Given a video caption and the number of an object with format {object: number} as follows:\n"\ + f"Image Caption: {caption}\n"\ + f"Object Number: {{{object_category}: {object_number}}}\n"\ + f"Please analyze the video caption. Determine whether the provided object number is correctly described in the caption, and explain why. You may need to count in the caption to determine how many the provided objects it describes. Note you can never infer the number if the caption only gives 'some', 'several' without specific numbers.\n"\ + "Give score of 0 if the caption does not mention the specific number of provided object (including the use of words such as 'some' and 'various' in the caption rather than giving specific numbers) or not mention the provided object. Give score of 1 if the caption counts the provided object correctly. 
Give score only of -1 if the caption counts the wrong number of the provided object.\n"\ + "Output a JSON formed as:\n"\ + "{\"object_number\": \"copy the provided {object: number} here\", \"score\": \"put your score here\", \"reason\": \"give your reason here\"}\n"\ + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only output the JSON. Do not add Markdown syntax. Output:" + dynamic_object_number_user_prompts.append(dynamic_object_number_user_prompt) + return self.dynamic_object_number_system_prompt, dynamic_object_number_user_prompts + + def get_object_color_prompts(self, caption, color): + object_category, object_color = list(color.items())[0] + object_color_user_prompt = "Given an image caption and the color of an object with format {object: color} as follows:\n"\ + f"Image Caption: {caption}\n"\ + f"Object Color: {{{object_category}: {object_color}}}\n"\ + "Please analyze the image caption. Determine whether the provided object color is correctly described in the caption, and explain why.\n"\ + "Give score of 0 for the following two situations:\n"\ + "1) The provided object is not mentioned in the caption. Note it can be considered mentioned as long as the caption contains an expression with a similar meaning to the object provided.\n"\ + "2) The caption does not mention the specific color of provided object\n"\ + "Give score of 1 if the caption describes the object color correctly. Give score of -1 only if the caption gives the wrong color. Note it can be considered correct if the caption contains an expression with a similar meaning to the provided color.\n"\ + "Output a JSON formed as:\n"\ + "{\"object_color\": \"copy the provided {object: color} here\", \"score\": \"put your score here\", \"reason\": \"give your reason here\"}\n"\ + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only output the JSON. Do not add Markdown syntax. Output:" + return self.object_color_system_prompt, object_color_user_prompt + + def get_spatial_relation_prompts(self, caption, spatial_relation): + spatial_relation_user_prompt = "Given an image caption and the spatial relationship between two objects as follows:\n"\ + f"Image Caption: {caption}\n"\ + f"Spatial Relationship: {spatial_relation}\n"\ + "Please analyze the image caption. Determine whether the provided spatial relationship is correctly decribed in caption, and explain why.\n"\ + "Give score of 0 if the caption does not mention the spatial relationship between objects or not mention the objects. Give score of 1 if the caption describes the spatial relationship correctly. Give score of -1 only if the caption describes the wrong spatial relationship.\n"\ + "Output a JSON formed as:\n"\ + "{\"spatial_relation\": \"copy the provided spatial relationship here\", \"score\": \"put your score here\", \"reason\": \"give your reason here\"}\n"\ + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only output the JSON. Do not add Markdown syntax. Output:" + return self.spatial_relation_system_prompt, spatial_relation_user_prompt + + def get_scene_prompts(self, caption, scene): + scene_user_prompt = "Given an image caption and a scene as follows:\n"\ + f"Image Caption: {caption}\n"\ + f"Scene: {scene}\n"\ + "Please analyze the image caption. Determine whether the provided scene is included in the caption, and explain why.\n"\ + "Give score of 0 if the caption does not mention ANY scene information (including the provided scene and any other scenes). Give score of 1 if the caption mentions the provided scene. 
Give score of -1 only if the scene is not mentioned in the caption.\n"\ + "Output a JSON formed as:\n"\ + "{\"scene\": \"copy the provided scene here\", \"score\": \"put your score here\", \"reason\": \"give your reason here\"}\n"\ + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only output the JSON. Do not add Markdown syntax. Output:" + return self.scene_system_prompt, scene_user_prompt + + def get_camera_angle_prompts(self, caption): + camera_angle_user_prompt = "Given an image caption, your task is to determine which kind of camera angles is included in the caption.\n"\ + f"Image Caption: {caption}\n"\ + f"Please analyze the image caption and classify the descriptions of camera angles into the following categories: {self.camera_angle_categories}\n"\ + "Here are the explanations of each category: " + '\n'.join(self.camera_angle_category_explains) + "\n"\ + "If the caption explicitly mentions one or some of the above camera angle categories, write the result of the categories with a python list format into the 'pred' value of the json string. You should only search the descriptions about the camera angle. If there is no description of the camera angle in the image caption or the description does not belong to any of the above categories, write 'N/A' into the 'pred' value of the json string.\n"\ + "Output a JSON formed as:\n"\ + "{\"pred\": \"put your predicted category as a python list here\", \"reason\": \"give your reason here\"}\n"\ + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only output the JSON. Do not add Markdown syntax. Output:" + return self.camera_angle_system_prompt, camera_angle_user_prompt + + def get_camera_movement_prompts(self, caption): + camera_movement_user_prompt = "Given a video caption, your task is to determine which kind of camera movement is included in the caption.\n"\ + f"Video Caption: {caption}\n"\ + f"Please analyze the video caption and classify the descriptions of camera movement into the following categories: {self.camera_movement_categories}\n"\ + f"Here are the explanations of each category: " + '\n'.join(self.camera_movement_category_explains) + "\n"\ + "If the caption explicitly mentions one or some of the above camera movement categories, write the result of the categories with a python list format into the 'pred' value of the json string. Note do not infer the camera movement categories from the whole caption. You should only search the descriptions about the camera movement. If there is no description of the camera movement in the video caption or the description does not belong to any of the above categories, write 'N/A' into the 'pred' value of the json string.\n"\ + "Output a JSON formed as:\n"\ + "{\"pred\": \"put your predicted category as a python list here\", \"reason\": \"give your reason here\"}\n"\ + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only output the JSON. Do not add Markdown syntax. Output:" + return self.camera_movement_system_prompt, camera_movement_user_prompt + + def get_OCR_prompts(self, caption, OCR_text): + OCR_user_prompt = "Given an image caption and an OCR text as follows:\n"\ + f"Image Caption: {caption}\n"\ + f"OCR Text: {OCR_text}\n"\ + f"Please analyze the image caption. Determine whether the provided text is described correctly in the caption, and explain why.\n"\ + "Give score of 0 if there is no description about the provided OCR text in the caption. Give score of 1 if the caption refers the text and recognizes correctly. 
Give score of -1 if the recognization result is wrong in the caption.\n"\ + "Output a JSON formed as:\n"\ + "{\"OCR\": \"copy the provided real OCR text here\", \"score\": put your score here, \"reason\": \"give your reason here\"},\n"\ + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only output the JSON. Do not add Markdown syntax. Output:" + return self.OCR_system_prompt, OCR_user_prompt + + def get_style_prompts(self, caption): + style_user_prompt = "Given an image caption, your task is to determine which category of image style is included in the caption.\n"\ + f"Image Caption: {caption}\n"\ + f"Please analyze the image caption and classify the descriptions of the image style into the following categories: {self.style_categories}\n"\ + f"Here are the explanations of each category: " + '\n'.join(self.style_category_explains) + "\n"\ + "If the description of the image style belongs to one or some of the above categories, write the result of the categories with a python list format into the 'pred' value of the json string. Focus more on the artistic style part in the caption. If there is no description of the image style in the image caption or the description does not belong to any of the above categories, write 'N/A' into the 'pred' value of the json string.\n"\ + "Output a JSON formed as:\n"\ + "{\"pred\": \"put your predicted category as a python list here\", \"reason\": \"give your reason here\"}\n"\ + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only output the JSON. Do not add Markdown syntax. Output:" + return self.style_system_prompt, style_user_prompt + + def get_character_identification_prompts(self, caption, character_identification): + character_identification_user_prompt = "Given an image caption and the name of a person/character as follows:\n"\ + f"Image Caption: {caption}\n"\ + f"name: {character_identification}\n"\ + "Please analyze the image caption. Determine whether the provided name of person/character is included in the caption, and explain why.\n"\ + "Give score of 0 if the caption does not mention any names. Give score of 1 if the caption mentions the provided name correctly. Give score of -1 if the name in the caption gives a wrong name.\n"\ + "Output a JSON formed as:\n"\ + "{\"character_identification\": \"copy the provided name here\", \"score\": \"put your score here\", \"reason\": \"give your reason here\"}\n"\ + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only output the JSON. Do not add Markdown syntax. 
Output:" + return self.character_identification_system_prompt, character_identification_user_prompt diff --git a/lmms_eval/tasks/capability/utils.py b/lmms_eval/tasks/capability/utils.py new file mode 100644 index 00000000..228b1552 --- /dev/null +++ b/lmms_eval/tasks/capability/utils.py @@ -0,0 +1,622 @@ +import os +import ast +import yaml +import json +import numpy as np +from PIL import Image +from tqdm import tqdm +from pathlib import Path +from openai import OpenAI +from loguru import logger as eval_logger +from lmms_eval.tasks.capability.prompt import Prompts +from concurrent.futures import ThreadPoolExecutor, as_completed +from lmms_eval.tasks._task_utils.file_utils import generate_submission_file + + +with open(Path(__file__).parent / "_default_template_yaml", "r") as f: + raw_data = f.readlines() + safe_data = [] + for i, line in enumerate(raw_data): + # remove function definition since yaml load cannot handle it + if "!function" not in line: + safe_data.append(line) +config = yaml.safe_load("".join(safe_data)) + +OPENAI_API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions") +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY") + +HF_HOME = os.getenv("HF_HOME", "~/.cache/huggingface") +HF_HOME = os.path.expanduser(HF_HOME) +cache_dir = os.path.join(HF_HOME, config["dataset_kwargs"]["cache_dir"]) + + +def capability_doc_to_visual(doc, lmms_eval_specific_kwargs=None): + data_type = doc['data_type'] + file_path = doc['file_path'][5:] + file_path = os.path.join(cache_dir, file_path) + if not os.path.exists(file_path): + eval_logger.error(f"File path: {file_path} does not exist, please check.") + + if data_type == 'image': + return [Image.open(file_path).convert('RGB')] + else: # video + return [file_path] + + +def capability_doc_to_text(doc, lmms_eval_specific_kwargs=None): + data_type = doc['data_type'] + return lmms_eval_specific_kwargs[f"{data_type}_prompt"] + + +def capability_process_results(doc, results): + """ + Args: + doc: a instance of the eval dataset + results: [pred] + Returns: + a dictionary with key: metric name (in this case capability_perception_score), value: metric value + """ + if isinstance(doc["annotation"], dict): + annotation = {k: v for k, v in doc["annotation"].items() if v is not None} + else: + annotation = doc["annotation"] + + response = { + "file_id": doc["file_id"], + "caption": results[0].strip(), + "annotation": annotation, + "task": doc["task"], + } + return { + "capability_inference_result": response, + "capability_precision": response, + "capability_recall": response, + "capability_f1_score": response, + + } + + +def capability_aggregate_inference_result(results, args): + task = results[0]['task'] + if 'eval_save_root' in config['metadata'] and config['metadata']['eval_save_root'] is not None: + save_path = os.path.join(config['metadata']['eval_save_root'], f"inference/{task}.jsonl") + else: + suffix = args.model if args.log_samples_suffix == "model_outputs" else args.log_samples_suffix + save_path = generate_submission_file( + file_name=f"{task}.jsonl", + args=args, + subpath=f"capability_results/{suffix}/inference" + ) + + # delete the invalid evaluation results as lmms-eval do not support auto-resume inference + # to ensure re-run evaluation if re-run inference + eval_save_path = os.path.join(os.path.dirname(save_path), f"../evaluation/{task}.jsonl") + if os.path.exists(eval_save_path): + os.remove(eval_save_path) + + with open(save_path, 'w') as f: + for result in results: + f.write(json.dumps(result) + 
'\n') + return None + + +def capability_aggregate_results(results, args): + """ + Args: + results: a list of values returned by process_results + Returns: + A score + """ + # results: [{"file_id": doc["file_id"], "caption": results[0].strip(), "annotation": doc["annotation"], "task": doc["task"]},] + task = results[0]['task'] + if 'eval_save_root' in config['metadata'] and config['metadata']['eval_save_root'] is not None: + save_path = os.path.join(config['metadata']['eval_save_root'], f"evaluation/{task}.jsonl") + else: + suffix = args.model if args.log_samples_suffix == "model_outputs" else args.log_samples_suffix + save_path = generate_submission_file( + file_name=f"{task}.jsonl", + args=args, + subpath=f"capability_results/{suffix}/evaluation" + ) + eval_model = config['metadata']['eval_model_name'] + num_process = config['metadata']['eval_num_process'] + max_allow_missing = config['metadata']['eval_max_allow_missing'] + max_retry_times = config['metadata']['eval_max_retry_times'] + auto_resume = config['metadata']['eval_auto_resume'] + strict_match = config['metadata']['eval_strict_match'] + evaluator = Evaluator( + task, results, save_path, + eval_model, num_process, + max_allow_missing, max_retry_times, + auto_resume, strict_match + ) + score_dict = evaluator.evaluate_scores() + metrics = evaluator.calculate_metric(score_dict) + return metrics + + +def capability_aggregate_precision(results, args): + metrics = capability_aggregate_results(results, args) + task = results[0]['task'] + precision = metrics['precision'] + eval_logger.info(f"[{task}] precision: {precision:.1f}") + return precision + + +def capability_aggregate_recall(results, args): + metrics = capability_aggregate_results(results, args) + task = results[0]['task'] + recall = metrics['recall'] + eval_logger.info(f"[{task}] recall: {recall:.1f}") + return recall + + +def capability_aggregate_f1score(results, args): + metrics = capability_aggregate_results(results, args) + task = results[0]['task'] + f1_score = metrics['f1_score'] + eval_logger.info(f"[{task}] f1_score: {f1_score:.1f}") + return f1_score + + +class Evaluator: + def __init__( + self, task, results, save_path, + eval_model, num_process=0, + max_allow_missing=5, max_retry_times=10, + auto_resume=True, strict_match=True, + ): + self.task = task + self.results = results + self.save_path = save_path + self.eval_model = eval_model + self.num_process = num_process + self.max_allow_missing = max_allow_missing + self.max_retry_times = max_retry_times + self.auto_resume = auto_resume + self.strict_match = strict_match + self.prompts = Prompts() + + self.post_validate_format_func = eval(f"self.post_validate_format_{task}") + self.post_process_func = eval(f"self.post_process_{task}") + + self.file2anno = {r['file_id']: r['annotation'] for r in self.results} + + def post_validate_format_event(self, response, anno): + # "{\"action\": \"copy provided action here\", \"score\": \"put your score here\", \"reason\": \"give your reason here\"}\n"\ + assert isinstance(response, dict) + if self.strict_match: + assert response["event"].strip() == anno.strip() + if response["score"] in ["-1", "0", "1"]: + response["score"] = int(response["score"]) + assert response["score"] in [1, 0, -1] + + def post_process_event(self, response, anno): + return response["score"] + + def post_validate_format_action(self, response, anno): + # "{\"action\": \"copy provided action here\", \"score\": \"put your score here\", \"reason\": \"give your reason here\"}\n"\ + assert isinstance(response, dict) 
+ + if self.strict_match: + assert response["action"].strip() == anno.strip() + if response["score"] in ["-1", "0", "1"]: + response["score"] = int(response["score"]) + assert response["score"] in [1, 0, -1] + + def post_process_action(self, response, anno): + return response["score"] + + def post_validate_format_object_category(self, response, anno): + # "{\"object_category\": \"copy provided object here\", \"score\": \"put your score here\", \"reason\": \"give your reason here\"}\n"\ + assert isinstance(response, dict) + if self.strict_match: + assert response["object_category"].strip() == anno.strip() + if response["score"] in ["-1", "0", "1"]: + response["score"] = int(response["score"]) + assert response["score"] in [1, 0, -1] + + def post_process_object_category(self, response, anno): + return response["score"] + + def post_validate_format_object_number(self, response, anno): + # "{\"object_number\": \"copy the provided {object: number} here\", \"score\": \"put your score here\", \"reason\": \"give your reason here\"}\n"\ + assert isinstance(response, dict) + if isinstance(response['object_number'], str): + # assert response['object_number'].startswith("{") and response['object_number'].endswith("}") + assert ':' in response['object_number'] + object_category, object_number = response['object_number'].lstrip('{').rstrip('}').split(":") + object_number = int(object_number.strip()) + elif isinstance(response['object_number'], dict): + object_category, object_number = list(response['object_number'].items())[0] + object_number = int(object_number.strip()) + else: + raise ValueError("Invalid object_number format") + if self.strict_match: + assert object_number == list(anno.values())[0] + if response["score"] in ["-1", "0", "1"]: + response["score"] = int(response["score"]) + assert response["score"] in [1, 0, -1] + + def post_process_object_number(self, response, anno): + return response["score"] + + def post_validate_format_dynamic_object_number(self, response, anno): + # "{\"object_number\": \"copy the provided {object: number} here\", \"score\": \"put your score here\", \"reason\": \"give your reason here\"}\n"\ + assert isinstance(response, dict) + assert 'response' in response + for i, r in enumerate(response['response']): + if isinstance(r['object_number'], str): + # assert response['object_number'].startswith("{") and response['object_number'].endswith("}") + assert ':' in r['object_number'] + object_category, object_number = r['object_number'].lstrip('{').rstrip('}').split(":") + object_number = int(object_number.strip()) + elif isinstance(r['object_number'], dict): + object_category, object_number = list(r['object_number'].items())[0] + object_number = int(object_number.strip()) + else: + raise ValueError("Invalid object_number format") + if self.strict_match: + assert object_number == list(anno.values())[i] + if r["score"] in ["-1", "0", "1"]: + r["score"] = int(r["score"]) + assert r["score"] in [1, 0, -1] + + def post_process_dynamic_object_number(self, response, anno): + scores = [] + for r in response['response']: + scores.append(r['score']) + return scores + + def post_validate_format_object_color(self, response, anno): + # "{\"object_color\": \"copy the provided {object: color} here\", \"score\": \"put your score here\", \"reason\": \"give your reason here\"}\n"\ + assert isinstance(response, dict) + if isinstance(response['object_color'], str): + # assert response['object_color'].startswith("{") and response['object_color'].endswith("}") + assert ':' in 
response['object_color'] + unpacked = response['object_color'].lstrip('{').rstrip('}').split(":") + if len(unpacked) > 2: + object_category, object_color = ":".join(unpacked[:-1]), unpacked[-1] + else: + object_category, object_color = unpacked + object_color = object_color.strip() + elif isinstance(response['object_color'], dict): + object_category, object_color = list(response['object_color'].items())[0] + object_color = object_color.strip() + else: + raise ValueError("Invalid object_color format") + if self.strict_match: + assert object_color == list(anno.values())[0] + if response["score"] in ["-1", "0", "1"]: + response["score"] = int(response["score"]) + assert response["score"] in [1, 0, -1] + + def post_process_object_color(self, response, anno): + return response["score"] + + def post_validate_format_spatial_relation(self, response, anno): + # "{\"spatial_relation\": \"copy the provided spatial relationship here\", \"score\": \"put your score here\", \"reason\": \"give your reason here\"}\n"\ + assert isinstance(response, dict) + if self.strict_match: + assert response["spatial_relation"].strip() == anno.strip() + if response["score"] in ["-1", "0", "1"]: + response["score"] = int(response["score"]) + assert response["score"] in [1, 0, -1] + + def post_process_spatial_relation(self, response, anno): + return response["score"] + + def post_validate_format_scene(self, response, anno): + # "{\"scene\": \"copy the provided scene here\", \"score\": \"put your score here\", \"reason\": \"give your reason here\"}\n"\ + assert isinstance(response, dict) + if self.strict_match: + assert response["scene"].strip() == anno.strip() + if response["score"] in ["-1", "0", "1"]: + response["score"] = int(response["score"]) + assert response["score"] in [1, 0, -1] + + def post_process_scene(self, response, anno): + return response["score"] + + def post_validate_format_camera_angle(self, response, anno): + # "{\"pred\": \"put your predicted category here\", \"reason\": \"give your reason here\"}\n"\ + assert isinstance(response, dict) + assert "pred" in response + if response["pred"] == "N/A" or "N/A" in response["pred"]: + response["pred"] = ["N/A"] + if isinstance(response["pred"], str): + response["pred"] = ast.literal_eval(response['pred']) + assert isinstance(response["pred"], list) + for i in range(len(response["pred"])): + if response["pred"][i] in self.prompts.camera_angle_category_explains: + response["pred"][i] = response["pred"][i].split(":")[0].lower() + assert response["pred"][i] == "N/A" or response["pred"][i] in self.prompts.camera_angle_categories + + def post_process_camera_angle(self, response, anno): + if len(response["pred"]) == 1 and response["pred"][0] == "N/A": + return 0 + elif anno in response["pred"]: + return 1 + else: + return -1 + + def post_validate_format_camera_movement(self, response, anno): + # "{\"pred\": \"put your predicted category here\", \"reason\": \"give your reason here\"}\n"\ + assert isinstance(response, dict) + assert "pred" in response + if response["pred"] == "N/A" or "N/A" in response["pred"]: + response["pred"] = ["N/A"] + if isinstance(response["pred"], str): + response["pred"] = ast.literal_eval(response['pred']) + assert isinstance(response["pred"], list) + for i in range(len(response["pred"])): + if response["pred"][i] in self.prompts.camera_movement_category_explains: + response["pred"][i] = response["pred"][i].split(":")[0].lower() + assert response["pred"][i] == "N/A" or response["pred"][i] in self.prompts.camera_movement_categories + + def 
post_process_camera_movement(self, response, anno): + if len(response["pred"]) == 1 and response["pred"][0] == "N/A": + return 0 + elif anno in response["pred"]: + return 1 + else: + return -1 + + def post_validate_format_OCR(self, response, anno): + # "{\"OCR\": \"copy the provided real OCR text here\", \"score\": put your score here, \"reason\": \"give your reason here\"},\n"\ + assert isinstance(response, dict) + if self.strict_match: + assert response['OCR'].strip() == anno.strip() + if response["score"] in ["-1", "0", "1"]: + response["score"] = int(response["score"]) + assert response["score"] in [1, 0, -1] + + def post_process_OCR(self, response, anno): + return response['score'] + + def post_validate_format_style(self, response, anno): + # "{\"pred\": \"put your predicted category here\", \"reason\": \"give your reason here\"}\n"\ + assert isinstance(response, dict) + assert "pred" in response + if response["pred"] == "N/A" or "N/A" in response["pred"]: + response["pred"] = ["N/A"] + if isinstance(response["pred"], str): + response["pred"] = ast.literal_eval(response['pred']) + assert isinstance(response["pred"], list) + for i in range(len(response["pred"])): + if response["pred"][i] in self.prompts.style_category_explains: + response["pred"][i] = response["pred"][i].split(":")[0].lower() + assert response["pred"][i] == "N/A" or response["pred"][i] in self.prompts.style_categories + + def post_process_style(self, response, anno): + if len(response["pred"]) == 1 and response["pred"][0] == "N/A": + return 0 + elif anno in response["pred"]: + return 1 + else: + return -1 + + def post_validate_format_character_identification(self, response, anno): + # "{\"name\": \"copy the provided name here\", \"score\": \"put your score here\", \"reason\": \"give your reason here\"}\n"\ + assert isinstance(response, dict) + if self.strict_match: + assert response["character_identification"].strip() == anno.strip() + if response["score"] in ["-1", "0", "1"]: + response["score"] = int(response["score"]) + assert response["score"] in [1, 0, -1] + + def post_process_character_identification(self, response, anno): + return response["score"] + + def load_saved_records(self): + if os.path.exists(self.save_path): + with open(self.save_path, 'r') as f: + saved_responses = [json.loads(l.strip('\n')) for l in f.readlines()] + else: + saved_responses = [] + return saved_responses + + def call_gpt(self, system_prompt, user_prompt): + messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt}, + ] + try: + client = OpenAI( + api_key=OPENAI_API_KEY, + base_url=OPENAI_API_URL, + timeout=120 + ) + response = client.chat.completions.create( + model=self.eval_model, + messages=messages, + ) + except Exception as e: + eval_logger.info(f"Error calling {self.eval_model}: {e}") + return None + + try: + response_message = response.choices[0].message.content + return response_message + except Exception as e: + eval_logger.info(f"Error parsing {self.eval_model} response: {e}\nResponse: {response}") + return None + + def call_and_parse_single_meaasge(self, file, system_prompt, user_prompt): + response_message = self.call_gpt(system_prompt, user_prompt) + if response_message is None: + return None + + try: + if '```json' in response_message: + response_message = response_message.split('```json')[-1].split('```')[0].strip() + if '```python' in response_message: + response_message = response_message.split('```python')[-1].split('```')[0].strip() + elif '```' in response_message: + 
response_message = response_message.split('```')[1].strip() + response = ast.literal_eval(response_message) + return response + except (SyntaxError, ValueError) as e: + eval_logger.info(f"Invalid response format for {file}: {response_message}") + return None + + def evaluate_sample_worker(self, args): + file, anno, system_prompt, user_prompt = args + if isinstance(user_prompt, list): + response = {'response': []} + for prompt in user_prompt: + single_response = self.call_and_parse_single_meaasge(file, system_prompt, prompt) + if single_response is None: + return None + response['response'].append(single_response) + + else: + response = self.call_and_parse_single_meaasge(file, system_prompt, user_prompt) + if response is None: + return None + + try: + self.post_validate_format_func(response, anno) + except Exception as e: + eval_logger.info(f"Format validation failed for {file}: {e}, anno: {anno}, response: {response}") + return None + + response['file_id'] = file + return response + + def evaluate_scores(self): + score_dict = {} + # Load saved records for resuming evaluation + if self.auto_resume: + saved_responses = self.load_saved_records() + eval_logger.info(f"[{self.task}] Loaded {len(saved_responses)} records") + else: + saved_responses = [] + + buffer = [] + buffer_size = 100 + try: + # Evaluate remaining + for retry_count in range(self.max_retry_times + 1): + saved_files = [r['file_id'] for r in saved_responses] + if len(saved_files) == len(self.results): + break + if len(self.results) - len(saved_files) <= self.max_allow_missing: + break + + remaining_results = [r for r in self.results if r['file_id'] not in saved_files] + if retry_count != 0: + print(f"\nRetrying {retry_count} times") + + process_args = [] + for res in remaining_results: + file = res['file_id'] + caption = res['caption'] + anno = res['annotation'] + system_prompt, user_prompt = self.prompts.get_prompts_by_task(self.task, caption, anno) + args = (file, anno, system_prompt, user_prompt) + process_args.append(args) + + if self.num_process == 0: + for args in tqdm(process_args, desc=f"Evaluating {self.task}"): + response = self.evaluate_sample_worker(args) + if response is not None: + with open(self.save_path, 'a') as f: + f.write(json.dumps(response) + '\n') + saved_responses.append(response) + else: + with ThreadPoolExecutor(max_workers=self.num_process) as executor: + futures = {executor.submit(self.evaluate_sample_worker, arg): arg for arg in process_args} + buffer_counter = 0 + for future in tqdm(as_completed(futures), total=len(remaining_results), desc=f"Evaluating {self.task}"): + result = future.result() + if result is not None: + buffer.append(json.dumps(result) + '\n') + buffer_counter += 1 + if buffer_counter >= buffer_size: + with open(self.save_path, 'a') as f: + f.writelines(buffer) + buffer.clear() + buffer_counter = 0 + + saved_responses.append(result) + + if len(buffer) > 0: + with open(self.save_path, 'a') as f: + f.writelines(buffer) + buffer.clear() + + finally: + if len(buffer) > 0: + with open(self.save_path, 'a') as f: + f.writelines(buffer) + + + for response in tqdm(saved_responses, desc=f"Calculating {self.task} scores"): + file = response['file_id'] + score_dict[file] = self.post_process_func(response, self.file2anno[file]) + + return score_dict + + def calculate_metric(self, score_dict): + all_scores = [] + for file_id, scores in score_dict.items(): + if isinstance(scores, list): + all_scores += scores + else: + all_scores.append(scores) + all_scores = np.array(all_scores) + sum_count = 
len(all_scores) + hit_count = np.count_nonzero(all_scores != 0) + correct_count = np.count_nonzero(all_scores == 1) + precision = 0 if hit_count == 0 else 100 * correct_count / hit_count + recall = 100 * correct_count / sum_count + hit_rate = 100 * hit_count / sum_count + f1_score = 0 if precision == 0 else 2 * precision * recall / (precision + recall) + eval_logger.info(f"[{self.task}] all: {sum_count}, hit: {hit_count}, correct: {correct_count}") + return { + "precision": precision, + "recall": recall, + "hit_rate": hit_rate, + "f1_score": f1_score + } + + +# Directly run this file to evaluate existing inference record +if __name__ == "__main__": + results_dir = "logs/capability_results/llava_onevision_7b/inference" + save_dir = "logs/capability_results/llava_onevision_7b/evaluation" + os.makedirs(save_dir, exist_ok=True) + + tasks = ["object_category", "object_number", "object_color", "spatial_relation", + "scene", "camera_angle", "OCR", "style", "character_identification", + "dynamic_object_number", "action", "camera_movement", "event"] + + metrics = [] + for task in tasks: + with open(os.path.join(results_dir, f"{task}.jsonl"), 'r') as f: + result = [json.loads(l.strip()) for l in f.readlines()] + save_path = os.path.join(save_dir, f"{task}.jsonl") + eval_model = config['metadata']['eval_model_name'] + num_process = config['metadata']['eval_num_process'] + max_allow_missing = config['metadata']['eval_max_allow_missing'] + max_retry_times = config['metadata']['eval_max_retry_times'] + auto_resume = config['metadata']['eval_auto_resume'] + strict_match = config['metadata']['eval_strict_match'] + evaluator = Evaluator( + task, result, save_path, + eval_model, num_process, + max_allow_missing, max_retry_times, + auto_resume, strict_match + ) + score_dict = evaluator.evaluate_scores() + metric = evaluator.calculate_metric(score_dict) + metrics.append(metric) + eval_logger.info(f"[{task}] " + ", ".join([f"{k}: {v:.1f}" for k, v in metric.items()])) + + # summarize metrics + eval_logger.info("Summarized Results:") + avg_precision = np.mean([m["precision"] for m in metrics]) + avg_recall = np.mean([m["recall"] for m in metrics]) + avg_hit_rate = np.mean([m["hit_rate"] for m in metrics]) + avg_f1_score = np.mean([m["f1_score"] for m in metrics]) + eval_logger.info(f"Average precision: {avg_precision:.3f}, recall: {avg_recall:.3f}, f1_score: {avg_f1_score:.3f}, hit_rate: {avg_hit_rate:.3f}") From ad4528c4b1e88f065184a0d193031a08bda9ca43 Mon Sep 17 00:00:00 2001 From: liuzh Date: Sat, 3 May 2025 16:13:37 +0800 Subject: [PATCH 2/3] add support of azure api --- lmms_eval/tasks/capability/utils.py | 52 ++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/lmms_eval/tasks/capability/utils.py b/lmms_eval/tasks/capability/utils.py index 228b1552..06dbdce4 100644 --- a/lmms_eval/tasks/capability/utils.py +++ b/lmms_eval/tasks/capability/utils.py @@ -2,6 +2,7 @@ import ast import yaml import json +import requests import numpy as np from PIL import Image from tqdm import tqdm @@ -22,8 +23,29 @@ safe_data.append(line) config = yaml.safe_load("".join(safe_data)) -OPENAI_API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions") -OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY") +API_TYPE = os.getenv("API_TYPE", "openai") + +if API_TYPE == "openai": + API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions") + API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY") + headers = { + "Authorization": f"Bearer 
{API_KEY}", + "Content-Type": "application/json", + } +elif API_TYPE == "azure": + API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken") + API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY") + headers = { + "api-key": API_KEY, + "Content-Type": "application/json", + } +else: + API_URL = "YOUR_API_URL" + API_KEY = "YOUR_API_KEY" + headers = { + "Authorization": f"Bearer {API_KEY}", + "Content-Type": "application/json", + } HF_HOME = os.getenv("HF_HOME", "~/.cache/huggingface") HF_HOME = os.path.expanduser(HF_HOME) @@ -126,7 +148,7 @@ def capability_aggregate_results(results, args): strict_match = config['metadata']['eval_strict_match'] evaluator = Evaluator( task, results, save_path, - eval_model, num_process, + eval_model, headers, num_process, max_allow_missing, max_retry_times, auto_resume, strict_match ) @@ -162,7 +184,7 @@ def capability_aggregate_f1score(results, args): class Evaluator: def __init__( self, task, results, save_path, - eval_model, num_process=0, + eval_model, headers, num_process=0, max_allow_missing=5, max_retry_times=10, auto_resume=True, strict_match=True, ): @@ -170,6 +192,7 @@ def __init__( self.results = results self.save_path = save_path self.eval_model = eval_model + self.headers = headers self.num_process = num_process self.max_allow_missing = max_allow_missing self.max_retry_times = max_retry_times @@ -421,21 +444,19 @@ def call_gpt(self, system_prompt, user_prompt): {"role": "user", "content": user_prompt}, ] try: - client = OpenAI( - api_key=OPENAI_API_KEY, - base_url=OPENAI_API_URL, - timeout=120 - ) - response = client.chat.completions.create( - model=self.eval_model, - messages=messages, - ) + payload = { + "model": self.eval_model, + "messages": messages, + } + response = requests.post(API_URL, headers=self.headers, json=payload, timeout=60) + response.raise_for_status() + response = response.json() except Exception as e: eval_logger.info(f"Error calling {self.eval_model}: {e}") return None try: - response_message = response.choices[0].message.content + response_message = response["choices"][0]["message"]["content"].strip() return response_message except Exception as e: eval_logger.info(f"Error parsing {self.eval_model} response: {e}\nResponse: {response}") @@ -549,6 +570,7 @@ def evaluate_scores(self): if len(buffer) > 0: with open(self.save_path, 'a') as f: f.writelines(buffer) + buffer.clear() for response in tqdm(saved_responses, desc=f"Calculating {self.task} scores"): @@ -604,7 +626,7 @@ def calculate_metric(self, score_dict): strict_match = config['metadata']['eval_strict_match'] evaluator = Evaluator( task, result, save_path, - eval_model, num_process, + eval_model, headers, num_process, max_allow_missing, max_retry_times, auto_resume, strict_match ) From 01eac470320831b874654fa08f8efdf8d1e37ce6 Mon Sep 17 00:00:00 2001 From: liuzh Date: Sat, 3 May 2025 23:57:04 +0800 Subject: [PATCH 3/3] add warning of os.remove --- lmms_eval/tasks/capability/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lmms_eval/tasks/capability/utils.py b/lmms_eval/tasks/capability/utils.py index 06dbdce4..22f02fb8 100644 --- a/lmms_eval/tasks/capability/utils.py +++ b/lmms_eval/tasks/capability/utils.py @@ -114,6 +114,7 @@ def capability_aggregate_inference_result(results, args): # to ensure re-run evaluation if re-run inference eval_save_path = os.path.join(os.path.dirname(save_path), f"../evaluation/{task}.jsonl") if os.path.exists(eval_save_path): + eval_logger.warning(f"Found EXISTING evaluation records: 
{eval_save_path}, REMOVING it!") os.remove(eval_save_path) with open(save_path, 'w') as f:
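For reference, the scoring scheme implemented in Evaluator.calculate_metric above reduces to the short standalone sketch below. This is an editor's illustration, not part of the patch; the scores array is made up, with per-annotation judge scores of 1 = described correctly, 0 = not mentioned, -1 = described incorrectly.

import numpy as np

# Hypothetical judge scores for one task.
scores = np.array([1, 1, 0, -1, 1, 0])

total = len(scores)                      # number of annotation points
hit = np.count_nonzero(scores != 0)      # annotation points the caption mentions at all
correct = np.count_nonzero(scores == 1)  # annotation points the caption gets right

precision = 0.0 if hit == 0 else 100 * correct / hit  # correct among mentioned annotations
recall = 100 * correct / total                        # correct among all annotations
f1_score = 0.0 if precision == 0 else 2 * precision * recall / (precision + recall)
print(f"precision={precision:.1f}, recall={recall:.1f}, f1={f1_score:.1f}")
# -> precision=75.0, recall=50.0, f1=60.0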