From 7e33b4e7c012bb91dedfb9910d13e8e51b836f11 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Mon, 6 Jan 2025 11:58:16 -0800
Subject: [PATCH] [V1] Extend beyond image modality and support mixed-modality
 inference with Llava-OneVision (#11685)

Signed-off-by: Roger Wang
Signed-off-by: DarkLight1337
Co-authored-by: DarkLight1337
---
 docs/source/models/supported_models.md        |   2 +-
 tests/multimodal/test_utils.py                | 209 +++++++++++++++++-
 tests/v1/core/test_kv_cache_utils.py          |  18 +-
 tests/v1/core/test_prefix_caching.py          |  17 +-
 vllm/model_executor/models/interfaces.py      |   6 +-
 vllm/model_executor/models/llava_onevision.py |  65 +++---
 vllm/model_executor/models/molmo.py           |   3 -
 vllm/multimodal/__init__.py                   |   3 +
 vllm/multimodal/hasher.py                     | 100 +++++++++
 vllm/multimodal/inputs.py                     |   9 +-
 vllm/multimodal/processing.py                 |  92 +++-----
 vllm/multimodal/utils.py                      |  86 ++++++-
 vllm/v1/engine/__init__.py                    |  18 +-
 vllm/v1/engine/mm_input_mapper.py             |  67 ------
 vllm/v1/engine/processor.py                   | 101 ++++++---
 vllm/v1/request.py                            |  48 ++--
 vllm/v1/worker/gpu_model_runner.py            |  74 ++++---
 17 files changed, 636 insertions(+), 282 deletions(-)
 create mode 100644 vllm/multimodal/hasher.py

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 5a2778026192a..94a8849f7edcd 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -647,7 +647,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.
   -
   - ✅︎
-  -
+  - ✅︎
 * - `MiniCPMV`
   - MiniCPM-V
   - T + I<sup>E+</sup>
diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
index 6029f2e514772..198344e5bd88c 100644
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -2,16 +2,22 @@
 import mimetypes
 import os
 from tempfile import NamedTemporaryFile, TemporaryDirectory
-from typing import Dict, Tuple
+from typing import TYPE_CHECKING, Dict, NamedTuple, Optional, Tuple
 
 import numpy as np
 import pytest
 from PIL import Image, ImageChops
 from transformers import AutoConfig, AutoTokenizer
 
+from vllm.multimodal.inputs import PlaceholderRange
 from vllm.multimodal.utils import (MediaConnector,
+                                   merge_and_sort_multimodal_metadata,
                                    repeat_and_pad_placeholder_tokens)
 
+if TYPE_CHECKING:
+    from vllm.multimodal.hasher import MultiModalHashDict
+    from vllm.multimodal.inputs import MultiModalPlaceholderDict
+
 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
 TEST_IMAGE_URLS = [
     "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
@@ -191,3 +197,204 @@ def test_repeat_and_pad_placeholder_tokens(model):
     assert new_prompt == expected_prompt
     assert new_token_ids == expected_token_ids
     assert ranges == expected_ranges
+
+
+# Used for the next two tests related to `merge_and_sort_multimodal_metadata`.
+class TestCase(NamedTuple):
+    mm_positions: "MultiModalPlaceholderDict"
+    mm_hashes: Optional["MultiModalHashDict"]
+    expected_modalities: list[str]
+    expected_ranges: list[PlaceholderRange]
+    expected_hashes: Optional[list[str]]
+
+
+def test_merge_and_sort_multimodal_metadata():
+
+    test_cases = [
+        # Single modality should return result as is but flattened
+        TestCase(
+            mm_positions={
+                "image": [
+                    PlaceholderRange(offset=0, length=2),
+                    PlaceholderRange(offset=3, length=2),
+                ]
+            },
+            mm_hashes={"image": ["hash1", "hash2"]},
+            expected_modalities=["image"],
+            expected_ranges=[
+                PlaceholderRange(offset=0, length=2),
+                PlaceholderRange(offset=3, length=2),
+            ],
+            expected_hashes=["hash1", "hash2"],
+        ),
+
+        # Single modality without hashes should return None for mm hash.
+        TestCase(
+            mm_positions={
+                "image": [
+                    PlaceholderRange(offset=0, length=2),
+                    PlaceholderRange(offset=2, length=2),
+                ]
+            },
+            mm_hashes=None,
+            expected_modalities=["image"],
+            expected_ranges=[
+                PlaceholderRange(offset=0, length=2),
+                PlaceholderRange(offset=2, length=2),
+            ],
+            expected_hashes=None,
+        ),
+
+        # Multiple modalities with hashes should return sorted modalities
+        # and flattened ranges and hashes.
+        TestCase(
+            mm_positions={
+                "image": [
+                    PlaceholderRange(offset=7, length=4),
+                    PlaceholderRange(offset=11, length=5),
+                ],
+                "audio": [
+                    PlaceholderRange(offset=0, length=2),
+                    PlaceholderRange(offset=2, length=3),
+                ]
+            },
+            mm_hashes={
+                "image": ["image_hash1", "image_hash2"],
+                "audio": ["audio_hash1", "audio_hash2"],
+            },
+            expected_modalities=["audio", "image"],
+            expected_ranges=[
+                PlaceholderRange(offset=0, length=2),
+                PlaceholderRange(offset=2, length=3),
+                PlaceholderRange(offset=7, length=4),
+                PlaceholderRange(offset=11, length=5),
+            ],
+            expected_hashes=[
+                "audio_hash1", "audio_hash2", "image_hash1", "image_hash2"
+            ],
+        ),
+
+        # Multiple modalities without hashes should return sorted modalities
+        # and flattened ranges and None.
+        TestCase(
+            mm_positions={
+                "image": [
+                    PlaceholderRange(offset=7, length=4),
+                    PlaceholderRange(offset=11, length=5),
+                ],
+                "audio": [
+                    PlaceholderRange(offset=0, length=2),
+                    PlaceholderRange(offset=2, length=3),
+                ]
+            },
+            mm_hashes=None,
+            expected_modalities=["audio", "image"],
+            expected_ranges=[
+                PlaceholderRange(offset=0, length=2),
+                PlaceholderRange(offset=2, length=3),
+                PlaceholderRange(offset=7, length=4),
+                PlaceholderRange(offset=11, length=5),
+            ],
+            expected_hashes=None,
+        ),
+
+        # Three modalities
+        TestCase(
+            mm_positions={
+                "image": [
+                    PlaceholderRange(offset=15, length=7),
+                    PlaceholderRange(offset=22, length=8),
+                ],
+                "audio": [
+                    PlaceholderRange(offset=0, length=2),
+                ],
+                "video": [
+                    PlaceholderRange(offset=3, length=4),
+                    PlaceholderRange(offset=7, length=5),
+                    PlaceholderRange(offset=12, length=6),
+                ]
+            },
+            mm_hashes={
+                "image": ["image_hash1", "image_hash2"],
+                "audio": ["audio_hash1"],
+                "video": ["video_hash1", "video_hash2", "video_hash3"]
+            },
+            expected_modalities=["audio", "video", "image"],
+            expected_ranges=[
+                PlaceholderRange(offset=0, length=2),
+                PlaceholderRange(offset=3, length=4),
+                PlaceholderRange(offset=7, length=5),
+                PlaceholderRange(offset=12, length=6),
+                PlaceholderRange(offset=15, length=7),
+                PlaceholderRange(offset=22, length=8),
+            ],
+            expected_hashes=[
+                "audio_hash1", "video_hash1", "video_hash2", "video_hash3",
+                "image_hash1", "image_hash2"
+            ],
+        ),
+    ]
+
+    for (mm_positions, mm_hashes, expected_modalities, expected_ranges,
+         expected_hashes) in test_cases:
+        modalities, ranges, hashes = merge_and_sort_multimodal_metadata(
+            mm_positions, mm_hashes)
+
+        assert modalities == expected_modalities
+        assert ranges == expected_ranges
+        assert hashes == expected_hashes
+
+
+def test_merge_and_sort_multimodal_metadata_with_interleaving():
+
+    test_cases = [
+
+        #
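For orientation, the behavior these tests pin down can be captured in a small
self-contained sketch. This is an illustration only, assuming non-interleaved
placeholders; it is not the implementation this patch adds to
`vllm/multimodal/utils.py` (which also handles the interleaved case exercised
by the second, truncated test above). The `PlaceholderRange` class below is a
stand-in for `vllm.multimodal.inputs.PlaceholderRange`, and the `_sketch`
suffix marks the helper as hypothetical.

    # Illustrative sketch only -- not the implementation from this patch.
    # Assumes each modality's placeholder list is already sorted by offset
    # and that modalities do not interleave with one another.
    from typing import NamedTuple, Optional


    class PlaceholderRange(NamedTuple):
        # Stand-in for vllm.multimodal.inputs.PlaceholderRange.
        offset: int
        length: int


    def merge_and_sort_multimodal_metadata_sketch(
        mm_positions: dict[str, list[PlaceholderRange]],
        mm_hashes: Optional[dict[str, list[str]]],
    ) -> tuple[list[str], list[PlaceholderRange], Optional[list[str]]]:
        # Order modalities by where their first placeholder appears
        # in the prompt.
        modalities = sorted(mm_positions,
                            key=lambda m: mm_positions[m][0].offset)

        # Flatten per-modality ranges (and hashes, when provided) in
        # that modality order.
        ranges = [r for m in modalities for r in mm_positions[m]]
        hashes = (None if mm_hashes is None else
                  [h for m in modalities for h in mm_hashes[m]])
        return modalities, ranges, hashes


    if __name__ == "__main__":
        # Mirrors the shape of the two-modality test case above.
        modalities, ranges, hashes = merge_and_sort_multimodal_metadata_sketch(
            {
                "image": [PlaceholderRange(offset=7, length=4)],
                "audio": [PlaceholderRange(offset=0, length=2)],
            },
            {
                "image": ["image_hash1"],
                "audio": ["audio_hash1"],
            },
        )
        assert modalities == ["audio", "image"]
        assert hashes == ["audio_hash1", "image_hash1"]

Sorting modalities by the offset of their first placeholder is what makes the
flattened ranges come out in prompt order when modalities do not interleave;
the interleaved case needs a global sort, which is why the patch adds the
separate interleaving test.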