We introduce ScoreRS, a quality assessment model trained on carefully curated large-scale remote sensing vision-language preference data. ScoreRS effectively scores and filters vision-language datasets, improving model performance by selecting high-quality data for training.
- Please check out the blog post for the chain of thought behind this project :).
- Our report is now available on arXiv!
- 🎉🎉🎉 We release our report here. Our data and models are also released!
- We release our ScoreRS model, the fine-tuned CLIP and Qwen2VL models, and our basic codebase!
This environment covers inference, the demos, and fine-tuning of CLIP and Qwen2VL; unless you have other specific requirements, use this environment.
conda create -n scorers python==3.10 -y
conda activate scorers
cd scorers # important! Make sure you are in the project root directory before running the following command.
bash basic_env_setup.sh
For vLLM-based inference, set up a separate environment:
conda create -n scorers_vllm python==3.10 -y
conda activate scorers_vllm
cd scorers # important! Make sure you are in the project root directory before running the following command.
bash vllm_env_setup.sh
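After either environment is set up, a quick sanity check (a minimal sketch; exact package versions depend on what the setup scripts install) confirms that PyTorch and Transformers import cleanly and that the GPU is visible:

# Optional sanity check for the freshly created environment.
import torch
import transformers

print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("CUDA available:", torch.cuda.is_available())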
import sys
sys.path.append("path_to_ScoreRS_code_root") # make sure to replace this line with your path to ScoreRS folder
import torch
from src.model.qwen_reward import Qwen2Reward
from transformers import AutoProcessor
from qwen_vl_utils import process_vision_info
from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLConfig
image_path = "./dummy_folder/your_image.png"
device = "cuda"
processor = AutoProcessor.from_pretrained("PumpkinCat/ScoreRS")
processor.tokenizer.padding_side = "right"
model_config = Qwen2VLConfig.from_pretrained("PumpkinCat/ScoreRS")
model_config.pad_token_id = processor.tokenizer.pad_token_id
model_config.ranked_candidate_num = 1
model = Qwen2Reward.from_pretrained(
"PumpkinCat/ScoreRS",
config=model_config,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
device_map=device
)
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": image_path,
},
{"type": "text", "text": "A satellite image airport"},
],
}
]
text = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
inputs = inputs.to(model.device)
with torch.no_grad(), torch.autocast("cuda", dtype=torch.float16):
outputs = model(**inputs, return_dict=True, return_loss=False)
scores = outputs.value
print(scores) # score
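Because the model returns a scalar quality score per image-text pair, it can be used directly to filter a candidate dataset. Below is a minimal sketch of that workflow; it reuses model, processor, and process_vision_info from the snippet above, and the score_pair helper and the 0.5 threshold are illustrative placeholders rather than part of the released code.

# Illustrative filtering loop; `score_pair` and the threshold are hypothetical.
def score_pair(image_path: str, caption: str) -> float:
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": caption},
            ],
        }
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    batch = processor(
        text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt"
    ).to(model.device)
    with torch.no_grad(), torch.autocast("cuda", dtype=torch.float16):
        return model(**batch, return_dict=True, return_loss=False).value.item()

candidates = [("./dummy_folder/your_image.png", "A satellite image of an airport")]
kept = [(img, cap) for img, cap in candidates if score_pair(img, cap) > 0.5]  # threshold is illustrative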
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel
device = "cuda"
torch_dtype = torch.float16
model = CLIPModel.from_pretrained("PumpkinCat/CLIP-ViT-L14-ScoreRS30", device_map=device, torch_dtype=torch_dtype)
processor = CLIPProcessor.from_pretrained("PumpkinCat/CLIP-ViT-L14-ScoreRS30")
image_path = "./dummy_folder/your_image.png"
image = Image.open(image_path)
inputs = processor(text=["a satellite image of airport", "a staellite image of sea"], images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)
logits_per_image = outputs.logits_per_image # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
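print(probs) then shows the per-caption probabilities, and the best-matching caption is simply the argmax:

print(probs)  # one row per image, one column per caption
best_idx = probs.argmax(dim=1).item()
print("best matching caption index:", best_idx)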
from transformers import AutoProcessor
from transformers.models.qwen2_vl.modeling_qwen2_vl import (
Qwen2VLForConditionalGeneration,
)
from qwen_vl_utils import process_vision_info
R1_SYSTEM_PROMPT = "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>"
use_thinking = True # whether you want to use Qwen2VL-RS-R1 or Qwen2VL-RS
model_path = (
"PumpkinCat/Qwen2VL-7B-RS-R1" if use_thinking else "PumpkinCat/Qwen2VL-7B-RS"
)
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_path, torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_path)
image_path = "./dummy_folder/your_image.png"
messages = []
if use_thinking:
messages.append(
{"role": "system", "content": [{"type": "text", "text": R1_SYSTEM_PROMPT}]}
)
messages.append(
{
"role": "user",
"content": [
{
"type": "image",
"image": image_path,
},
{"type": "text", "text": "Describe this image."},
],
}
)
# Preparation for inference
text = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
inputs = inputs.to(model.device)
# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
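When use_thinking is enabled, the response follows the <think> ... </think><answer> ... </answer> format requested by the system prompt, so the final answer can be pulled out with a small regex. This is a sketch that assumes the model sticks to the tag format:

import re

if use_thinking:
    match = re.search(r"<answer>(.*?)</answer>", output_text[0], re.DOTALL)
    final_answer = match.group(1).strip() if match else output_text[0]
    print(final_answer)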
from lmdeploy import (
GenerationConfig,
TurbomindEngineConfig,
pipeline,
ChatTemplateConfig,
)
from lmdeploy.vl import load_image
use_thinking = True # whether you want to use Qwen2VL-RS-R1 or Qwen2VL-RS
model_path = (
"PumpkinCat/Qwen2VL-7B-RS-R1" if use_thinking else "PumpkinCat/Qwen2VL-7B-RS"
)
if use_thinking:
reasoning_config = "./config/qwen2_thinking_template.json"
else:
reasoning_config = None
model = pipeline(
model_path,
backend_config=TurbomindEngineConfig(session_len=8192),
chat_template_config=(
ChatTemplateConfig.from_json(reasoning_config)
if reasoning_config
else None
),
)
generation_config = GenerationConfig(
max_new_tokens=256,
temperature=0.01,
top_p=0.9,
do_sample=True,
)
image_path = "./dummy_folder/your_image.png"
image = load_image(image_path)
response = model(('describe this image', image), gen_config=generation_config)
print(response)
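print(response) dumps the whole lmdeploy Response object; if you only need the generated string, it is carried in the text field:

print(response.text)  # generated text only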
from PIL import Image
from vllm import LLM, SamplingParams
from transformers import AutoProcessor
# Qwen2-VL
def run_qwen2_vl(question: str, processor: AutoProcessor):
placeholder = "<|image_pad|>"
prompt = (
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
stop_token_ids = None
return prompt, stop_token_ids
# Qwen2-VL-R1
def run_qwen2_vl_r1(question: str, processor: AutoProcessor):
SYSTEM_PROMPT = (
"A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant "
"first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning "
"process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., "
"<think> reasoning process here </think><answer> answer here </answer>"
)
placeholder = "<|image_pad|>"
prompt = (
f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
stop_token_ids = None
return prompt, stop_token_ids
device = "cuda"
use_thinking = True # whether you want to use Qwen2VL-RS-R1 or Qwen2VL-RS
model_path = (
"PumpkinCat/Qwen2VL-7B-RS-R1" if use_thinking else "PumpkinCat/Qwen2VL-7B-RS"
)
llm = LLM(
model=model_path,
mm_processor_kwargs={
"min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28,
},
disable_mm_preprocessor_cache=False,
dtype="float16",
device=device,
gpu_memory_utilization=0.6,
)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
sampling_param = SamplingParams(
temperature=0.01,
top_p=0.9,
max_tokens=16384,
)
run_func = run_qwen2_vl_r1 if use_thinking else run_qwen2_vl
image_path = "./dummy_folder/your_image.png"
prompt = "describe the image."
questions, stop_token_ids = run_func(prompt, processor)
sampling_param.stop_token_ids = stop_token_ids
inputs = [
{
"prompt": questions,
"multi_modal_data": {"image": image_path},
}
]
outputs = llm.generate(inputs, sampling_params=sampling_param)
print(outputs)
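llm.generate returns a list of RequestOutput objects, so print(outputs) shows the full structure; the generated text itself can be read out like this:

for output in outputs:
    print(output.outputs[0].text)  # first (and here only) completion for this prompt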
- Qwen2VL-RS
python ./python_script/web_demo.py -c PumpkinCat/Qwen2VL-7B-RS --flash-attn2
- Qwen2VL-RS-R1
python ./python_script/web_demo.py -c PumpkinCat/Qwen2VL-7B-RS-R1 --flash-attn2 --reasoning
- Classification: Please refer to this file for evaluating CLIP on classification tasks (a minimal zero-shot sketch is also shown after this list for orientation).
- Retrieval: Please refer to this file for evaluating CLIP on retrieval tasks.
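For orientation, zero-shot classification with the fine-tuned CLIP boils down to scoring each image against one text prompt per class name. The sketch below illustrates that pattern only; the class names and sample list are placeholders, and the evaluation script referenced above remains the source of truth.

# Illustrative zero-shot classification loop; class names and samples are placeholders.
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

clip = CLIPModel.from_pretrained("PumpkinCat/CLIP-ViT-L14-ScoreRS30", torch_dtype=torch.float16, device_map="cuda")
clip_processor = CLIPProcessor.from_pretrained("PumpkinCat/CLIP-ViT-L14-ScoreRS30")

class_names = ["airport", "harbor", "forest"]  # placeholder class names
prompts = [f"a satellite image of {name}" for name in class_names]
samples = [("./dummy_folder/your_image.png", 0)]  # (image path, label index) placeholders

correct = 0
for path, label in samples:
    batch = clip_processor(text=prompts, images=Image.open(path), return_tensors="pt", padding=True).to("cuda")
    with torch.no_grad(), torch.autocast("cuda", dtype=torch.float16):
        pred = clip(**batch).logits_per_image.argmax(dim=1).item()
    correct += int(pred == label)
print("accuracy:", correct / len(samples))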
- First, download and unzip this zip file from our Hugging Face repo.
- For evaluating our Qwen2VL-RS series (shell script):
- Qwen2VL-RS

SCRIPT_PATH=./python_script/evaluation/rs_evaluation.py
DATA_ROOT="Your path to the unzipped folder"
OUTPUT_DIR="Your path to the eval log file"
model_type=lmdeploy
MODEL_PATH=PumpkinCat/Qwen2VL-7B-RS

CUDA_VISIBLE_DEVICES=0 accelerate launch --num_processes 1 --mixed_precision bf16 $SCRIPT_PATH \
    --data_root $DATA_ROOT \
    --output_dir $OUTPUT_DIR \
    --model_type $model_type \
    --model_path $MODEL_PATH \
    --force_inference true \
    --task all
- Qwen2VL-RS-R1

... # same as above
MODEL_PATH=PumpkinCat/Qwen2VL-7B-RS-R1
REASONING_CONFIG=./python_script/evaluation/qwen2_thinking_template.json

CUDA_VISIBLE_DEVICES=0 accelerate launch --num_processes 1 --mixed_precision bf16 $SCRIPT_PATH \
    --data_root $DATA_ROOT \
    --output_dir $OUTPUT_DIR \
    --model_type $model_type \
    --model_path $MODEL_PATH \
    --force_inference true \
    --task all \
    --reasoning_config $REASONING_CONFIG
- Qwen2-VL, LLaVA-1.6, and InternVL-2.5

... # same as eval on Qwen2VL-RS
model_type=lmdeploy
MODEL_PATH=Qwen/Qwen2-VL-7B-Instruct # liuhaotian/llava-v1.6-vicuna-7b or OpenGVLab/InternVL2_5-8B
... # same as eval on Qwen2VL-RS
- GeoChat, VHM, and SkysenseGPT

... # same as eval on Qwen2VL-RS
model_type=geochat # vhm or skysensegpt
MODEL_PATH=MBZUAI/geochat-7B # FitzPC/vhm_7B or ll-13/SkySenseGPT-7B-CLIP-ViT
... # same as eval on Qwen2VL-RS
- LHRS-Bot-Nova
  - First, download the converted Huggingface-style checkpoint from here.

... # same as eval on Qwen2VL-RS
model_type=lhrs
MODEL_PATH="your_path_to FINAL.pt" # important! Must point to the FINAL.pt file, and make sure TextLoRA is in the same folder as FINAL.pt
... # same as eval on Qwen2VL-RS
- Our preference data can be found at Hugging Face.
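If you want to inspect the preference pairs programmatically, they can typically be loaded with the datasets library. This is a sketch only: the repo id below is a placeholder, so replace it with the actual dataset name from our Hugging Face page.

from datasets import load_dataset

# "PumpkinCat/ScoreRS-Preference" is a placeholder id; use the dataset name from our Hugging Face page.
preference_data = load_dataset("PumpkinCat/ScoreRS-Preference", split="train")
print(preference_data[0])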
- If you encounter an error related to "flash_attn" when using "Llama", install "flash_attn" with the following command (from here):
pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.8/flash_attn-2.5.8+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
- We are grateful to the wonderful works that provided the foundation for our project: LHRS-Bot, GeoChat, and SkysenseGPT.
- If you find our work useful, please give us a 🌟 on GitHub.
- License: MIT