Skip to content

[Task] Add new benchmark: CAPability #656

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions lmms_eval/tasks/capability/_default_template_yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Shared defaults for the CAPability tasks. Each capability_*.yaml task file
# pulls this template in via `include:` and adds only `task` / `dataset_name`.
dataset_path: lntzm/CAPability
dataset_kwargs:
  token: true
  cache_dir: capability
  video: true

# Deterministic decoding: sampling disabled, single beam, temperature 0.
generation_kwargs:
  max_new_tokens: 4096
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false

test_split: test
output_type: generate_until

lmms_eval_specific_kwargs:
  default:
    # Short prompt variants, kept commented out for reference:
    # image_prompt: "Please describe the image in detail."
    # video_prompt: "Please describe the video in detail."
    #
    # The active prompts below are runtime text sent to the model. Keep them
    # double-quoted: they rely on `\n` escapes and end with a significant
    # trailing space. NOTE(review): wording is left untouched on purpose;
    # editing it would presumably change benchmark results - confirm before
    # changing.
    image_prompt: "Please describe the image in detail. Your description should follow these rules:\na) You should describe each object in the image in detail, including its name, number, color, and spatial relationship between objects.\nb) You should describe the scene of the image.\nc) You should describe the camera angle when shooting this image, such as level angle, high angle, low angle, or dutch angle.\nd) You should describe the style of the image, such as realistic, animated, special-effect, old-fashioned and so on.\ne) If there are any texts in the image, you should describe the text content.\nf) If you know the character in the image, you should tell his or her name.\nDirectly output your detailed description in a elaborate paragraph, instead of itemizing them in list form. Your description: "
    video_prompt: "Please describe the video in detail. Your description should follow these rules:\na) You should describe each events in the video in order, especially focusing on the behavior and action of characters, including people, animals.\nb) You should describe each object in the video in detail, including its name, number, color, and spatial relationship between objects.\nc) You should describe the scene of the video.\nd) You should describe the camera movement when shooting this video, especially the direction, such as pan left, track right, tilt up, boom down, zoom in, dolly out, and so on.\ne) You should describe the style of the video, such as realistic, animated, special-effect, old-fashioned and so on.\nf) If there are any texts in the video, you should describe the text content.\ng) If you know the character in the video, you should tell his or her name.\nDirectly output your detailed description in a elaborate paragraph, instead of itemizing them in list form. Your description: "

# Hooks resolved through the `!function` tag against the task's `utils` module.
doc_to_visual: !function utils.capability_doc_to_visual
doc_to_text: !function utils.capability_doc_to_text
doc_to_target: "annotation"
# The return value of process_results will be used by metrics
process_results: !function utils.capability_process_results

# Each entry pairs a metric name with its aggregation hook.
metric_list:
  # No ranking direction is declared for this entry (higher_is_better: null).
  - metric: capability_inference_result
    aggregation: !function utils.capability_aggregate_inference_result
    higher_is_better: null
  - metric: capability_precision
    aggregation: !function utils.capability_aggregate_precision
    higher_is_better: true
  - metric: capability_recall
    aggregation: !function utils.capability_aggregate_recall
    higher_is_better: true
  - metric: capability_f1_score
    aggregation: !function utils.capability_aggregate_f1score
    higher_is_better: true

metadata:
  version: 0.1
  # NOTE(review): the eval_* keys are presumably read by the task's utils for
  # the judge-model evaluation step, and are assumed to live under `metadata` -
  # confirm against utils.py.
  eval_save_path: null
  eval_model_name: "gpt-4.1-2025-04-14"
  eval_num_process: 20
  eval_max_allow_missing: 5
  eval_max_retry_times: 10
  eval_auto_resume: true
  eval_strict_match: false
15 changes: 15 additions & 0 deletions lmms_eval/tasks/capability/capability.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Group definition: lists the CAPability sub-tasks under the single group
# name `capability`. Every entry must match the `task:` value of one of the
# capability_*.yaml files.
group: capability
task:
  - capability_object_category
  - capability_object_number
  - capability_object_color
  - capability_spatial_relation
  - capability_scene
  - capability_camera_angle
  - capability_OCR
  - capability_style
  - capability_character_identification
  - capability_dynamic_object_number
  - capability_action
  - capability_camera_movement
  - capability_event
4 changes: 4 additions & 0 deletions lmms_eval/tasks/capability/capability_OCR.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# CAPability sub-task: OCR. Shared settings come from the included template.
include: _default_template_yaml

task: capability_OCR
# Presumably selects the `OCR` subset of the dataset - confirm against the loader.
dataset_name: OCR
4 changes: 4 additions & 0 deletions lmms_eval/tasks/capability/capability_action.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# CAPability sub-task: action. Shared settings come from the included template.
include: _default_template_yaml

task: capability_action
# Presumably selects the `action` subset of the dataset - confirm against the loader.
dataset_name: action
4 changes: 4 additions & 0 deletions lmms_eval/tasks/capability/capability_camera_angle.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# CAPability sub-task: camera angle. Shared settings come from the included template.
include: _default_template_yaml

task: capability_camera_angle
# Presumably selects the `camera_angle` subset of the dataset - confirm against the loader.
dataset_name: camera_angle
4 changes: 4 additions & 0 deletions lmms_eval/tasks/capability/capability_camera_movement.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# CAPability sub-task: camera movement. Shared settings come from the included template.
include: _default_template_yaml

task: capability_camera_movement
# Presumably selects the `camera_movement` subset of the dataset - confirm against the loader.
dataset_name: camera_movement
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# CAPability sub-task: character identification. Shared settings come from the included template.
include: _default_template_yaml

task: capability_character_identification
# Presumably selects the `character_identification` subset of the dataset - confirm against the loader.
dataset_name: character_identification
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# CAPability sub-task: dynamic object number. Shared settings come from the included template.
include: _default_template_yaml

task: capability_dynamic_object_number
# Presumably selects the `dynamic_object_number` subset of the dataset - confirm against the loader.
dataset_name: dynamic_object_number
4 changes: 4 additions & 0 deletions lmms_eval/tasks/capability/capability_event.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# CAPability sub-task: event. Shared settings come from the included template.
include: _default_template_yaml

task: capability_event
# Presumably selects the `event` subset of the dataset - confirm against the loader.
dataset_name: event
4 changes: 4 additions & 0 deletions lmms_eval/tasks/capability/capability_object_category.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# CAPability sub-task: object category. Shared settings come from the included template.
include: _default_template_yaml

task: capability_object_category
# Presumably selects the `object_category` subset of the dataset - confirm against the loader.
dataset_name: object_category
4 changes: 4 additions & 0 deletions lmms_eval/tasks/capability/capability_object_color.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# CAPability sub-task: object color. Shared settings come from the included template.
include: _default_template_yaml

task: capability_object_color
# Presumably selects the `object_color` subset of the dataset - confirm against the loader.
dataset_name: object_color
4 changes: 4 additions & 0 deletions lmms_eval/tasks/capability/capability_object_number.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# CAPability sub-task: object number. Shared settings come from the included template.
include: _default_template_yaml

task: capability_object_number
# Presumably selects the `object_number` subset of the dataset - confirm against the loader.
dataset_name: object_number
4 changes: 4 additions & 0 deletions lmms_eval/tasks/capability/capability_scene.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# CAPability sub-task: scene. Shared settings come from the included template.
include: _default_template_yaml

task: capability_scene
# Presumably selects the `scene` subset of the dataset - confirm against the loader.
dataset_name: scene
4 changes: 4 additions & 0 deletions lmms_eval/tasks/capability/capability_spatial_relation.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# CAPability sub-task: spatial relation. Shared settings come from the included template.
include: _default_template_yaml

task: capability_spatial_relation
# Presumably selects the `spatial_relation` subset of the dataset - confirm against the loader.
dataset_name: spatial_relation
4 changes: 4 additions & 0 deletions lmms_eval/tasks/capability/capability_style.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# CAPability sub-task: style. Shared settings come from the included template.
include: _default_template_yaml

task: capability_style
# Presumably selects the `style` subset of the dataset - confirm against the loader.
dataset_name: style
Loading
Loading