From f0002bfe3ef89232e197c9de45a850eb2a738476 Mon Sep 17 00:00:00 2001 From: xiangyuT Date: Mon, 24 Mar 2025 16:04:16 +0800 Subject: [PATCH 01/19] init --- .../llm/src/ipex_llm/vllm/xpu/engine/engine.py | 16 ++++++++++++---- .../llm/src/ipex_llm/vllm/xpu/model_convert.py | 6 +++--- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py b/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py index 178da383e0e..f17470ca9a3 100644 --- a/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py +++ b/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py @@ -94,20 +94,20 @@ def __init__( quantization: Optional[str] = None, revision: Optional[str] = None, tokenizer_revision: Optional[str] = None, - seed: int = 0, + seed: Optional[int] = None, gpu_memory_utilization: float = 0.9, swap_space: float = 4, cpu_offload_gb: float = 0, enforce_eager: Optional[bool] = None, max_seq_len_to_capture: int = 8192, disable_custom_all_reduce: bool = False, - disable_async_output_proc: bool = True, + disable_async_output_proc: bool = False, hf_overrides: Optional[HfOverrides] = None, - mm_processor_kwargs: Optional[Dict[str, Any]]=None, + mm_processor_kwargs: Optional[dict[str, Any]] = None, # After positional args are removed, move this right below `model` task: TaskOption = "auto", override_pooler_config: Optional[PoolerConfig] = None, - compilation_config: Optional[Union[int, Dict[str, Any]]]=None, + compilation_config: Optional[Union[int, dict[str, Any]]]=None, load_in_low_bit: str = "sym_int4", **kwargs, ) -> None: @@ -120,6 +120,13 @@ def __init__( if "disable_log_stats" not in kwargs: kwargs["disable_log_stats"] = True + if "worker_cls" in kwargs: + worker_cls = kwargs["worker_cls"] + # if the worker_cls is not qualified string name, + # we serialize it using cloudpickle to avoid pickling issues + if isinstance(worker_cls, type): + kwargs["worker_cls"] = cloudpickle.dumps(worker_cls) + if compilation_config is not None: if isinstance(compilation_config, (int, dict)): compilation_config_instance = CompilationConfig.from_cli( @@ -164,6 +171,7 @@ def __init__( load_in_low_bit=load_in_low_bit) self.request_counter = Counter() + self.default_sampling_params: Union[dict[str, Any], None] = None @staticmethod def get_engine_class() -> Type[LLMEngine]: diff --git a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py index 1237d2b271a..bc2cd79c84b 100644 --- a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py +++ b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py @@ -48,7 +48,7 @@ def _sample_get_logits( logits = lm_head(hidden_states) if embedding_bias is not None: logits += embedding_bias - if self.use_gather: + if self.use_all_gather: logits = tensor_model_parallel_gather(logits) else: logits = tensor_model_parallel_all_gather(logits) @@ -68,9 +68,9 @@ def _ipex_llm_convert(load_in_low_bit): from ipex_llm.vllm.xpu.ipex_llm_v1_wrapper import get_ipex_llm_v1_wrapper import vllm.executor.ray_utils as ray_utils_v0 import vllm.v1.executor.ray_utils as ray_utils_v1 - from vllm.v1.worker.gpu_model_runner import GPUModelRunner + # from vllm.v1.worker.gpu_model_runner import GPUModelRunner setattr(XPUModelRunner, "load_model", get_load_function(load_in_low_bit)) - setattr(GPUModelRunner, "load_model", get_load_function(load_in_low_bit)) + # setattr(GPUModelRunner, "load_model", get_load_function(load_in_low_bit)) setattr(ray_utils_v0, "RayWorkerWrapper", get_ipex_llm_wrapper(load_in_low_bit)) setattr(ray_utils_v1, 
"RayWorkerWrapper", get_ipex_llm_v1_wrapper(load_in_low_bit)) From d0fc9e9b0e6aec373d8c30736e396984784b2978 Mon Sep 17 00:00:00 2001 From: xiangyuT Date: Mon, 24 Mar 2025 16:47:26 +0800 Subject: [PATCH 02/19] update openai/api_server.py --- .../src/ipex_llm/vllm/xpu/engine/engine.py | 93 +++ .../vllm/xpu/entrypoints/openai/api_server.py | 542 ++++++++++++------ 2 files changed, 447 insertions(+), 188 deletions(-) diff --git a/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py b/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py index f17470ca9a3..0617c441ebc 100644 --- a/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py +++ b/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py @@ -58,6 +58,28 @@ def from_engine_args( start_engine_loop=start_engine_loop, usage_context=usage_context, stat_loggers=stat_loggers) + @classmethod + def from_vllm_config( + cls, + vllm_config: VllmConfig, + start_engine_loop: bool = True, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + stat_loggers: Optional[dict[str, StatLoggerBase]] = None, + disable_log_requests: bool = False, + disable_log_stats: bool = False, + load_in_low_bit: str = "sym_int4", + ) -> "AsyncLLMEngine": + _ipex_llm_convert(load_in_low_bit) + return super().from_vllm_config( + cls=cls, + vllm_config=vllm_config, + start_engine_loop=start_engine_loop, + usage_context=usage_context, + stat_loggers=stat_loggers, + disable_log_requests=disable_log_requests, + disable_log_stats=disable_log_stats, + ) + class IPEXLLMAsyncV1Engine(AsyncLLM): @@ -79,6 +101,27 @@ def from_engine_args( start_engine_loop=start_engine_loop, usage_context=usage_context, stat_loggers=stat_loggers) + @classmethod + def from_vllm_config( + cls, + vllm_config: VllmConfig, + start_engine_loop: bool = True, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + stat_loggers: Optional[dict[str, StatLoggerBase]] = None, + disable_log_requests: bool = False, + disable_log_stats: bool = False, + load_in_low_bit: str = "sym_int4", + ) -> "AsyncLLM": + _ipex_llm_convert(load_in_low_bit) + return super().from_vllm_config( + cls=cls, + vllm_config=vllm_config, + start_engine_loop=start_engine_loop, + usage_context=usage_context, + stat_loggers=stat_loggers, + disable_log_requests=disable_log_requests, + disable_log_stats=disable_log_stats, + ) class IPEXLLMClass(LLM): def __init__( @@ -202,6 +245,23 @@ def from_engine_args( stat_loggers, enable_multiprocessing) + @classmethod + def from_vllm_config( + cls, + vllm_config: VllmConfig, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, + disable_log_stats: bool = False, + load_in_low_bit: str = "sym_int4", + ) -> "LLMEngine": + _ipex_llm_convert(load_in_low_bit) + return super().from_vllm_config( + cls=cls, + vllm_config=vllm_config, + usage_context=usage_context, + stat_loggers=stat_loggers, + disable_log_stats=disable_log_stats + ) class IPEXLLMLLMEngine(LLMEngine): def __init__(self, *args, **kwargs): @@ -220,6 +280,23 @@ def from_engine_args( _ipex_llm_convert(load_in_low_bit) return super().from_engine_args(engine_args, usage_context, stat_loggers) + @classmethod + def from_vllm_config( + cls, + vllm_config: VllmConfig, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, + disable_log_stats: bool = False, + load_in_low_bit: str = "sym_int4", + ) -> "LLMEngine": + _ipex_llm_convert(load_in_low_bit) + return super().from_vllm_config( + cls=cls, + vllm_config=vllm_config, + 
usage_context=usage_context, + stat_loggers=stat_loggers, + disable_log_stats=disable_log_stats + ) class IPEXLLMMQLLMEngine(MQLLMEngine): @classmethod @@ -228,6 +305,22 @@ def from_engine_args(cls, engine_args: AsyncEngineArgs, _ipex_llm_convert(load_in_low_bit) return super().from_engine_args(engine_args, usage_context, ipc_path) + @classmethod + def from_vllm_config(cls, vllm_config: VllmConfig, + usage_context: UsageContext, + disable_log_requests: bool, disable_log_stats: bool, + ipc_path: str, load_in_low_bit: str) -> "MQLLMEngine": + + _ipex_llm_convert(load_in_low_bit) + return super().from_vllm_config( + cls=cls, + vllm_config=vllm_config, + ipc_path=ipc_path, + usage_context=usage_context, + disable_log_requests=disable_log_requests + disable_log_stats=disable_log_stats, + ) + def run_mp_engine(engine_args: AsyncEngineArgs, usage_context: UsageContext, ipc_path: str, load_in_low_bit: str, engine_alive): diff --git a/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py b/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py index 4c16c12f1c2..ae8909c30f9 100644 --- a/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py +++ b/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py @@ -1,5 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import atexit +import gc import importlib import inspect import multiprocessing @@ -10,13 +13,14 @@ import tempfile import uuid from argparse import Namespace +from collections.abc import AsyncIterator from contextlib import asynccontextmanager from functools import partial from http import HTTPStatus -from typing import AsyncIterator, Optional, Set, Tuple +from typing import Annotated, Optional, Union import uvloop -from fastapi import APIRouter, FastAPI, Request +from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, Response, StreamingResponse @@ -27,17 +31,15 @@ import vllm.envs as envs from vllm.config import ModelConfig from vllm.engine.arg_utils import AsyncEngineArgs -from ipex_llm.vllm.xpu.engine import IPEXLLMAsyncLLMEngine as AsyncLLMEngine +from vllm.engine.async_llm_engine import AsyncLLMEngine # type: ignore from vllm.engine.multiprocessing.client import MQLLMEngineClient -from ipex_llm.vllm.xpu.engine import run_mp_engine +from vllm.engine.multiprocessing.engine import run_mp_engine from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import load_chat_template from vllm.entrypoints.launcher import serve_http from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.cli_args import (make_arg_parser, validate_parsed_serve_args) - -# from ipex_llm.vllm.xpu.entrypoints.openai.cli_args import make_arg_parser # yapf conflicts with isort for this block # yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, @@ -46,31 +48,42 @@ CompletionResponse, DetokenizeRequest, DetokenizeResponse, + EmbeddingChatRequest, + EmbeddingCompletionRequest, EmbeddingRequest, EmbeddingResponse, EmbeddingResponseData, ErrorResponse, - LoadLoraAdapterRequest, + LoadLoRAAdapterRequest, + PoolingChatRequest, + PoolingCompletionRequest, PoolingRequest, PoolingResponse, + RerankRequest, RerankResponse, ScoreRequest, ScoreResponse, TokenizeRequest, TokenizeResponse, - UnloadLoraAdapterRequest) + TranscriptionRequest, + TranscriptionResponse, + 
UnloadLoRAAdapterRequest) +from vllm.entrypoints.openai.reasoning_parsers import ReasoningParserManager # yapf: enable from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding +from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import (BaseModelPath, OpenAIServingModels) - -from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling -from vllm.entrypoints.openai.serving_score import OpenAIServingScores +from vllm.entrypoints.openai.serving_score import ServingScores from vllm.entrypoints.openai.serving_tokenization import ( OpenAIServingTokenization) +from vllm.entrypoints.openai.serving_transcription import ( + OpenAIServingTranscription) from vllm.entrypoints.openai.tool_parsers import ToolParserManager -from vllm.entrypoints.utils import with_cancellation +from vllm.entrypoints.utils import load_aware_call, with_cancellation from vllm.logger import init_logger +from vllm.transformers_utils.config import ( + maybe_register_config_serialize_by_value) from vllm.usage.usage_lib import UsageContext from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path, is_valid_ipv6_address, set_ulimit) @@ -83,7 +96,7 @@ # Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765) logger = init_logger('vllm.entrypoints.openai.api_server') -_running_tasks: Set[asyncio.Task] = set() +_running_tasks: set[asyncio.Task] = set() @asynccontextmanager @@ -102,6 +115,11 @@ async def _force_log(): task.add_done_callback(_running_tasks.remove) else: task = None + + # Mark the startup heap as static so that it's ignored by GC. + # Reduces pause times of oldest generation collections. + gc.collect() + gc.freeze() try: yield finally: @@ -139,24 +157,47 @@ async def build_async_engine_client_from_engine_args( Returns the Client or None if the creation failed. """ - # Fall back - # TODO: fill out feature matrix. - if (MQLLMEngineClient.is_unsupported_config(engine_args) - or envs.VLLM_USE_V1 or disable_frontend_multiprocessing): + # Create the EngineConfig (determines if we can use V1). + usage_context = UsageContext.OPENAI_API_SERVER + vllm_config = engine_args.create_engine_config(usage_context=usage_context) + + # V1 AsyncLLM. + if envs.VLLM_USE_V1: + if disable_frontend_multiprocessing: + logger.warning( + "V1 is enabled, but got --disable-frontend-multiprocessing. " + "To disable frontend multiprocessing, set VLLM_USE_V1=0.") + + from vllm.v1.engine.async_llm import AsyncLLM + async_llm: Optional[AsyncLLM] = None + try: + async_llm = AsyncLLM.from_vllm_config( + vllm_config=vllm_config, + usage_context=usage_context, + disable_log_requests=engine_args.disable_log_requests, + disable_log_stats=engine_args.disable_log_stats) + yield async_llm + finally: + if async_llm: + async_llm.shutdown() + + # V0 AsyncLLM. 
+ elif (MQLLMEngineClient.is_unsupported_config(vllm_config) + or disable_frontend_multiprocessing): + engine_client: Optional[EngineClient] = None try: - # When starting this, we are actually starting with the V1Engine - # Here we are doing a classification, we will need to do this in IPEX-LLM - engine_client = AsyncLLMEngine.from_engine_args( - engine_args=engine_args, - usage_context=UsageContext.OPENAI_API_SERVER, - load_in_low_bit=load_in_low_bit) + engine_client = AsyncLLMEngine.from_vllm_config( + vllm_config=vllm_config, + usage_context=usage_context, + disable_log_requests=engine_args.disable_log_requests, + disable_log_stats=engine_args.disable_log_stats) yield engine_client finally: if engine_client and hasattr(engine_client, "shutdown"): engine_client.shutdown() - # Otherwise, use the multiprocessing AsyncLLMEngine. + # V0MQLLMEngine. else: if "PROMETHEUS_MULTIPROC_DIR" not in os.environ: # Make TemporaryDirectory for prometheus multiprocessing @@ -183,14 +224,18 @@ async def build_async_engine_client_from_engine_args( # so we need to spawn a new process context = multiprocessing.get_context("spawn") + # Ensure we can serialize transformer config before spawning + maybe_register_config_serialize_by_value() + # The Process can raise an exception during startup, which may # not actually result in an exitcode being reported. As a result # we use a shared variable to communicate the information. engine_alive = multiprocessing.Value('b', True, lock=False) - engine_process = context.Process(target=run_mp_engine, - args=(engine_args, - UsageContext.OPENAI_API_SERVER, - ipc_path, load_in_low_bit, engine_alive)) + engine_process = context.Process( + target=run_mp_engine, + args=(vllm_config, UsageContext.OPENAI_API_SERVER, ipc_path, + engine_args.disable_log_stats, + engine_args.disable_log_requests, engine_alive)) engine_process.start() engine_pid = engine_process.pid assert engine_pid is not None, "Engine process failed to start." @@ -205,8 +250,7 @@ def _cleanup_ipc_path(): atexit.register(_cleanup_ipc_path) # Build RPCClient, which conforms to EngineClient Protocol. 
- engine_config = engine_args.create_engine_config() - build_client = partial(MQLLMEngineClient, ipc_path, engine_config, + build_client = partial(MQLLMEngineClient, ipc_path, vllm_config, engine_pid) mq_engine_client = await asyncio.get_running_loop().run_in_executor( None, build_client) @@ -244,6 +288,16 @@ def _cleanup_ipc_path(): multiprocess.mark_process_dead(engine_process.pid) +async def validate_json_request(raw_request: Request): + content_type = raw_request.headers.get("content-type", "").lower() + media_type = content_type.split(";", maxsplit=1)[0] + if media_type != "application/json": + raise HTTPException( + status_code=HTTPStatus.UNSUPPORTED_MEDIA_TYPE, + detail="Unsupported Media Type: Only 'application/json' is allowed" + ) + + router = APIRouter() @@ -298,7 +352,11 @@ def embedding(request: Request) -> Optional[OpenAIServingEmbedding]: return request.app.state.openai_serving_embedding -def score(request: Request) -> Optional[OpenAIServingScores]: +def score(request: Request) -> Optional[ServingScores]: + return request.app.state.openai_serving_scores + + +def rerank(request: Request) -> Optional[ServingScores]: return request.app.state.openai_serving_scores @@ -306,6 +364,10 @@ def tokenization(request: Request) -> OpenAIServingTokenization: return request.app.state.openai_serving_tokenization +def transcription(request: Request) -> OpenAIServingTranscription: + return request.app.state.openai_serving_transcription + + def engine_client(request: Request) -> EngineClient: return request.app.state.engine_client @@ -317,7 +379,31 @@ async def health(raw_request: Request) -> Response: return Response(status_code=200) -@router.post("/tokenize") +@router.get("/load") +async def get_server_load_metrics(request: Request): + # This endpoint returns the current server load metrics. + # It tracks requests utilizing the GPU from the following routes: + # - /v1/chat/completions + # - /v1/completions + # - /v1/audio/transcriptions + # - /v1/embeddings + # - /pooling + # - /score + # - /v1/score + # - /rerank + # - /v1/rerank + # - /v2/rerank + return JSONResponse( + content={'server_load': request.app.state.server_load_metrics}) + + +@router.api_route("/ping", methods=["GET", "POST"]) +async def ping(raw_request: Request) -> Response: + """Ping check. 
Endpoint required for SageMaker""" + return await health(raw_request) + + +@router.post("/tokenize", dependencies=[Depends(validate_json_request)]) @with_cancellation async def tokenize(request: TokenizeRequest, raw_request: Request): handler = tokenization(raw_request) @@ -332,7 +418,7 @@ async def tokenize(request: TokenizeRequest, raw_request: Request): assert_never(generator) -@router.post("/detokenize") +@router.post("/detokenize", dependencies=[Depends(validate_json_request)]) @with_cancellation async def detokenize(request: DetokenizeRequest, raw_request: Request): handler = tokenization(raw_request) @@ -361,35 +447,10 @@ async def show_version(): return JSONResponse(content=ver) -save_dict = {} -import os -flag = os.getenv("VLLM_LOG_OUTPUT", None) -async def stream_generator(generator, request, request_id): - async for chunk in generator: - if request_id not in save_dict: - save_dict[request_id] = "" - import json - try: - data = chunk.strip() - if data.startswith('data: '): - data = data[len('data: '):] - else: - yield chunk - json_data = json.loads(data) - if 'choices' in json_data and len(json_data['choices']) > 0: - choice = json_data['choices'][0] - if 'delta' in choice: - save_dict[request_id] += choice["delta"]["content"] - elif 'text' in choice: - save_dict[request_id] += choice["text"] - except json.JSONDecodeError: - print(f"Received request_id: {request_id}, request: {request} content: {save_dict[request_id]}") - pass # Done - yield chunk - - -@router.post("/v1/chat/completions") +@router.post("/v1/chat/completions", + dependencies=[Depends(validate_json_request)]) @with_cancellation +@load_aware_call async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request): handler = chat(raw_request) @@ -397,11 +458,6 @@ async def create_chat_completion(request: ChatCompletionRequest, return base(raw_request).create_error_response( message="The model does not support Chat Completions API") - if flag is not None: - request_id = "chatcmpl-" \ - f"{handler._base_request_id(raw_request, request.request_id)}" - print(f"First received request_id: {request_id}, request: {request}") - generator = await handler.create_chat_completion(request, raw_request) if isinstance(generator, ErrorResponse): @@ -409,43 +465,39 @@ async def create_chat_completion(request: ChatCompletionRequest, status_code=generator.code) elif isinstance(generator, ChatCompletionResponse): - if flag is not None: - print(f"Received request-id:{request_id}, request:{request}, Output:{generator.model_dump()}") return JSONResponse(content=generator.model_dump()) - if flag is not None: - return StreamingResponse(content=stream_generator(generator, request, request_id), media_type="text/event-stream") return StreamingResponse(content=generator, media_type="text/event-stream") -@router.post("/v1/completions") +@router.post("/v1/completions", dependencies=[Depends(validate_json_request)]) @with_cancellation +@load_aware_call async def create_completion(request: CompletionRequest, raw_request: Request): handler = completion(raw_request) if handler is None: return base(raw_request).create_error_response( message="The model does not support Completions API") - if flag is not None: - request_id = f"cmpl-{handler._base_request_id(raw_request)}" - print(f"First received request_id: {request_id}, request: {request}") - generator = await handler.create_completion(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) elif 
isinstance(generator, CompletionResponse): - if flag is not None: - print(f"Received request-id:{request_id}, request:{request}, Output:{generator.model_dump()}") return JSONResponse(content=generator.model_dump()) + + if flag is not None: + return StreamingResponse(content=stream_generator(generator, request, request_id), media_type="text/event-stream") + if flag is not None: return StreamingResponse(content=stream_generator(generator, request, request_id), media_type="text/event-stream") return StreamingResponse(content=generator, media_type="text/event-stream") -@router.post("/v1/embeddings") +@router.post("/v1/embeddings", dependencies=[Depends(validate_json_request)]) @with_cancellation +@load_aware_call async def create_embedding(request: EmbeddingRequest, raw_request: Request): handler = embedding(raw_request) if handler is None: @@ -460,6 +512,8 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request): "use the Pooling API (`/pooling`) instead.") res = await fallback_handler.create_pooling(request, raw_request) + + generator: Union[ErrorResponse, EmbeddingResponse] if isinstance(res, PoolingResponse): generator = EmbeddingResponse( id=res.id, @@ -488,8 +542,9 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request): assert_never(generator) -@router.post("/pooling") +@router.post("/pooling", dependencies=[Depends(validate_json_request)]) @with_cancellation +@load_aware_call async def create_pooling(request: PoolingRequest, raw_request: Request): handler = pooling(raw_request) if handler is None: @@ -506,8 +561,9 @@ async def create_pooling(request: PoolingRequest, raw_request: Request): assert_never(generator) -@router.post("/score") +@router.post("/score", dependencies=[Depends(validate_json_request)]) @with_cancellation +@load_aware_call async def create_score(request: ScoreRequest, raw_request: Request): handler = score(raw_request) if handler is None: @@ -524,8 +580,9 @@ async def create_score(request: ScoreRequest, raw_request: Request): assert_never(generator) -@router.post("/v1/score") +@router.post("/v1/score", dependencies=[Depends(validate_json_request)]) @with_cancellation +@load_aware_call async def create_score_v1(request: ScoreRequest, raw_request: Request): logger.warning( "To indicate that Score API is not part of standard OpenAI API, we " @@ -534,6 +591,153 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request): return await create_score(request, raw_request) +@router.post("/v1/audio/transcriptions") +@with_cancellation +@load_aware_call +async def create_transcriptions(request: Annotated[TranscriptionRequest, + Form()], + raw_request: Request): + handler = transcription(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Transcriptions API") + + audio_data = await request.file.read() + generator = await handler.create_transcription(audio_data, request, + raw_request) + + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + + elif isinstance(generator, TranscriptionResponse): + return JSONResponse(content=generator.model_dump()) + + return StreamingResponse(content=generator, media_type="text/event-stream") + + +@router.post("/rerank", dependencies=[Depends(validate_json_request)]) +@with_cancellation +@load_aware_call +async def do_rerank(request: RerankRequest, raw_request: Request): + handler = rerank(raw_request) + if handler is None: + return 
base(raw_request).create_error_response( + message="The model does not support Rerank (Score) API") + generator = await handler.do_rerank(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, RerankResponse): + return JSONResponse(content=generator.model_dump()) + + assert_never(generator) + + +@router.post("/v1/rerank", dependencies=[Depends(validate_json_request)]) +@with_cancellation +async def do_rerank_v1(request: RerankRequest, raw_request: Request): + logger.warning_once( + "To indicate that the rerank API is not part of the standard OpenAI" + " API, we have located it at `/rerank`. Please update your client " + "accordingly. (Note: Conforms to JinaAI rerank API)") + + return await do_rerank(request, raw_request) + + +@router.post("/v2/rerank", dependencies=[Depends(validate_json_request)]) +@with_cancellation +async def do_rerank_v2(request: RerankRequest, raw_request: Request): + return await do_rerank(request, raw_request) + + +TASK_HANDLERS: dict[str, dict[str, tuple]] = { + "generate": { + "messages": (ChatCompletionRequest, create_chat_completion), + "default": (CompletionRequest, create_completion), + }, + "embed": { + "messages": (EmbeddingChatRequest, create_embedding), + "default": (EmbeddingCompletionRequest, create_embedding), + }, + "score": { + "default": (RerankRequest, do_rerank) + }, + "rerank": { + "default": (RerankRequest, do_rerank) + }, + "reward": { + "messages": (PoolingChatRequest, create_pooling), + "default": (PoolingCompletionRequest, create_pooling), + }, + "classify": { + "messages": (PoolingChatRequest, create_pooling), + "default": (PoolingCompletionRequest, create_pooling), + }, +} + +if envs.VLLM_SERVER_DEV_MODE: + + @router.post("/reset_prefix_cache") + async def reset_prefix_cache(raw_request: Request): + """ + Reset the prefix cache. Note that we currently do not check if the + prefix cache is successfully reset in the API server. + """ + logger.info("Resetting prefix cache...") + await engine_client(raw_request).reset_prefix_cache() + return Response(status_code=200) + + @router.post("/sleep") + async def sleep(raw_request: Request): + # get POST params + level = raw_request.query_params.get("level", "1") + logger.info("sleep the engine with level %s", level) + await engine_client(raw_request).sleep(int(level)) + # FIXME: in v0 with frontend multiprocessing, the sleep command + # is sent but does not finish yet when we return a response. + return Response(status_code=200) + + @router.post("/wake_up") + async def wake_up(raw_request: Request): + logger.info("wake up the engine") + await engine_client(raw_request).wake_up() + # FIXME: in v0 with frontend multiprocessing, the wake-up command + # is sent but does not finish yet when we return a response. + return Response(status_code=200) + + @router.get("/is_sleeping") + async def is_sleeping(raw_request: Request): + logger.info("check whether the engine is sleeping") + is_sleeping = await engine_client(raw_request).is_sleeping() + return JSONResponse(content={"is_sleeping": is_sleeping}) + + +@router.post("/invocations", dependencies=[Depends(validate_json_request)]) +async def invocations(raw_request: Request): + """ + For SageMaker, routes requests to other handlers based on model `task`. 
+ """ + body = await raw_request.json() + task = raw_request.app.state.task + + if task not in TASK_HANDLERS: + raise HTTPException( + status_code=400, + detail=f"Unsupported task: '{task}' for '/invocations'. " + f"Expected one of {set(TASK_HANDLERS.keys())}") + + handler_config = TASK_HANDLERS[task] + if "messages" in body: + request_model, handler = handler_config["messages"] + else: + request_model, handler = handler_config["default"] + + # this is required since we lose the FastAPI automatic casting + request = request_model.model_validate(body) + return await handler(request, raw_request) + + if envs.VLLM_TORCH_PROFILER_DIR: logger.warning( "Torch Profiler is enabled in the API server. This should ONLY be " @@ -556,32 +760,30 @@ async def stop_profile(raw_request: Request): if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING: logger.warning( - "Lora dynamic loading & unloading is enabled in the API server. " + "LoRA dynamic loading & unloading is enabled in the API server. " "This should ONLY be used for local development!") - @router.post("/v1/load_lora_adapter") - async def load_lora_adapter(request: LoadLoraAdapterRequest, + @router.post("/v1/load_lora_adapter", + dependencies=[Depends(validate_json_request)]) + async def load_lora_adapter(request: LoadLoRAAdapterRequest, raw_request: Request): - for route in [chat, completion, embedding]: - handler = route(raw_request) - if handler is not None: - response = await handler.load_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) + handler = models(raw_request) + response = await handler.load_lora_adapter(request) + if isinstance(response, ErrorResponse): + return JSONResponse(content=response.model_dump(), + status_code=response.code) return Response(status_code=200, content=response) - @router.post("/v1/unload_lora_adapter") - async def unload_lora_adapter(request: UnloadLoraAdapterRequest, + @router.post("/v1/unload_lora_adapter", + dependencies=[Depends(validate_json_request)]) + async def unload_lora_adapter(request: UnloadLoRAAdapterRequest, raw_request: Request): - for route in [chat, completion, embedding]: - handler = route(raw_request) - if handler is not None: - response = await handler.unload_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) + handler = models(raw_request) + response = await handler.unload_lora_adapter(request) + if isinstance(response, ErrorResponse): + return JSONResponse(content=response.model_dump(), + status_code=response.code) return Response(status_code=200, content=response) @@ -648,7 +850,7 @@ async def add_request_id(request: Request, call_next): module_path, object_name = middleware.rsplit(".", 1) imported = getattr(importlib.import_module(module_path), object_name) if inspect.isclass(imported): - app.add_middleware(imported) + app.add_middleware(imported) # type: ignore[arg-type] elif inspect.iscoroutinefunction(imported): app.middleware("http")(imported) else: @@ -658,7 +860,7 @@ async def add_request_id(request: Request, call_next): return app -def init_app_state( +async def init_app_state( engine_client: EngineClient, model_config: ModelConfig, state: State, @@ -683,15 +885,18 @@ def init_app_state( state.log_stats = not args.disable_log_stats resolved_chat_template = load_chat_template(args.chat_template) - logger.info("Using supplied chat template:\n%s", resolved_chat_template) + if resolved_chat_template is not 
None: + logger.info("Using supplied chat template:\n%s", + resolved_chat_template) state.openai_serving_models = OpenAIServingModels( + engine_client=engine_client, model_config=model_config, base_model_paths=base_model_paths, lora_modules=args.lora_modules, prompt_adapters=args.prompt_adapters, ) - # TODO: The chat template is now broken for lora adapters :( + await state.openai_serving_models.init_static_loras() state.openai_serving_chat = OpenAIServingChat( engine_client, model_config, @@ -703,6 +908,8 @@ def init_app_state( return_tokens_as_token_ids=args.return_tokens_as_token_ids, enable_auto_tools=args.enable_auto_tool_choice, tool_parser=args.tool_call_parser, + enable_reasoning=args.enable_reasoning, + reasoning_parser=args.reasoning_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, ) if model_config.runner_type == "generate" else None state.openai_serving_completion = OpenAIServingCompletion( @@ -728,7 +935,13 @@ def init_app_state( chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, ) if model_config.task == "embed" else None - state.openai_serving_scores = OpenAIServingScores( + state.openai_serving_scores = ServingScores( + engine_client, + model_config, + state.openai_serving_models, + request_logger=request_logger) if model_config.task in ( + "score", "embed", "pooling") else None + state.jinaai_serving_reranking = ServingScores( engine_client, model_config, state.openai_serving_models, @@ -742,92 +955,26 @@ def init_app_state( chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, ) + state.openai_serving_transcription = OpenAIServingTranscription( + engine_client, + model_config, + state.openai_serving_models, + request_logger=request_logger, + ) if model_config.runner_type == "transcription" else None state.task = model_config.task - # if args.served_model_name is not None: - # served_model_names = args.served_model_name - # else: - # served_model_names = [args.model] - - # if args.disable_log_requests: - # request_logger = None - # else: - # request_logger = RequestLogger(max_log_len=args.max_log_len) - - # base_model_paths = [ - # BaseModelPath(name=name, model_path=args.model) - # for name in served_model_names - # ] - - # state.engine_client = engine_client - # state.log_stats = not args.disable_log_stats - - # resolved_chat_template = load_chat_template(args.chat_template) - # logger.info("Using supplied chat template:\n%s", resolved_chat_template) - - # state.openai_serving_chat = OpenAIServingChat( - # engine_client, - # model_config, - # base_model_paths, - # args.response_role, - # lora_modules=args.lora_modules, - # prompt_adapters=args.prompt_adapters, - # request_logger=request_logger, - # chat_template=resolved_chat_template, - # chat_template_content_format=args.chat_template_content_format, - # return_tokens_as_token_ids=args.return_tokens_as_token_ids, - # enable_auto_tools=args.enable_auto_tool_choice, - # tool_parser=args.tool_call_parser, - # enable_prompt_tokens_details=args.enable_prompt_tokens_details, - # ) if model_config.runner_type == "generate" else None - # state.openai_serving_completion = OpenAIServingCompletion( - # engine_client, - # model_config, - # base_model_paths, - # lora_modules=args.lora_modules, - # prompt_adapters=args.prompt_adapters, - # request_logger=request_logger, - # return_tokens_as_token_ids=args.return_tokens_as_token_ids, - # ) if model_config.runner_type == "generate" else None - # 
state.openai_serving_pooling = OpenAIServingPooling( - # engine_client, - # model_config, - # base_model_paths, - # request_logger=request_logger, - # chat_template=resolved_chat_template, - # chat_template_content_format=args.chat_template_content_format, - # ) if model_config.runner_type == "pooling" else None - # state.openai_serving_embedding = OpenAIServingEmbedding( - # engine_client, - # model_config, - # base_model_paths, - # request_logger=request_logger, - # chat_template=resolved_chat_template, - # chat_template_content_format=args.chat_template_content_format, - # ) if model_config.task == "embed" else None - # state.openai_serving_scores = OpenAIServingScores( - # engine_client, - # model_config, - # base_model_paths, - # request_logger=request_logger - # ) if model_config.task == "score" else None - # state.openai_serving_tokenization = OpenAIServingTokenization( - # engine_client, - # model_config, - # base_model_paths, - # lora_modules=args.lora_modules, - # request_logger=request_logger, - # chat_template=resolved_chat_template, - # chat_template_content_format=args.chat_template_content_format, - # ) - - -def create_server_socket(addr: Tuple[str, int]) -> socket.socket: + + state.enable_server_load_tracking = args.enable_server_load_tracking + state.server_load_metrics = 0 + + +def create_server_socket(addr: tuple[str, int]) -> socket.socket: family = socket.AF_INET if is_valid_ipv6_address(addr[0]): family = socket.AF_INET6 sock = socket.socket(family=family, type=socket.SOCK_STREAM) sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1) sock.bind(addr) return sock @@ -840,11 +987,18 @@ async def run_server(args, **uvicorn_kwargs) -> None: if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3: ToolParserManager.import_tool_parser(args.tool_parser_plugin) - valide_tool_parses = ToolParserManager.tool_parsers.keys() + valid_tool_parses = ToolParserManager.tool_parsers.keys() if args.enable_auto_tool_choice \ - and args.tool_call_parser not in valide_tool_parses: + and args.tool_call_parser not in valid_tool_parses: raise KeyError(f"invalid tool call parser: {args.tool_call_parser} " - f"(chose from {{ {','.join(valide_tool_parses)} }})") + f"(chose from {{ {','.join(valid_tool_parses)} }})") + + valid_reasoning_parses = ReasoningParserManager.reasoning_parsers.keys() + if args.enable_reasoning \ + and args.reasoning_parser not in valid_reasoning_parses: + raise KeyError( + f"invalid reasoning parser: {args.reasoning_parser} " + f"(chose from {{ {','.join(valid_reasoning_parses)} }})") # workaround to make sure that we bind the port before the engine is set up. # This avoids race conditions with ray. 
@@ -866,10 +1020,22 @@ def signal_handler(*_) -> None: app = build_app(args) model_config = await engine_client.get_model_config() - init_app_state(engine_client, model_config, app.state, args) + await init_app_state(engine_client, model_config, app.state, args) + + def _listen_addr(a: str) -> str: + if is_valid_ipv6_address(a): + return '[' + a + ']' + return a or "0.0.0.0" + + is_ssl = args.ssl_keyfile and args.ssl_certfile + logger.info("Starting vLLM API server on http%s://%s:%d", + "s" if is_ssl else "", _listen_addr(sock_addr[0]), + sock_addr[1]) shutdown_task = await serve_http( app, + sock=sock, + enable_ssl_refresh=args.enable_ssl_refresh, host=args.host, port=args.port, log_level=args.uvicorn_log_level, From 763f2d1194ddfa875ca692813117b027500814df Mon Sep 17 00:00:00 2001 From: xiangyuT Date: Tue, 25 Mar 2025 09:36:26 +0800 Subject: [PATCH 03/19] update --- docker/llm/serving/xpu/docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/llm/serving/xpu/docker/Dockerfile b/docker/llm/serving/xpu/docker/Dockerfile index 2ad571fd929..3bc8b40d5ff 100644 --- a/docker/llm/serving/xpu/docker/Dockerfile +++ b/docker/llm/serving/xpu/docker/Dockerfile @@ -160,7 +160,7 @@ RUN set -eux && \ pip install --upgrade cmake && \ VLLM_TARGET_DEVICE=xpu pip install --no-build-isolation -v /llm/vllm && \ pip install mpi4py fastapi uvicorn openai && \ - pip install ray + pip install ray numba WORKDIR /llm/ ENTRYPOINT ["bash", "/llm/start-vllm-service.sh"] From a51045e50d5c4a0e8cf549e4f6f6f86d81c3d1c0 Mon Sep 17 00:00:00 2001 From: xiangyuT Date: Wed, 26 Mar 2025 11:18:58 +0800 Subject: [PATCH 04/19] fix --- python/llm/src/ipex_llm/vllm/xpu/engine/engine.py | 2 +- python/llm/src/ipex_llm/vllm/xpu/model_convert.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py b/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py index 0617c441ebc..e9ee2023408 100644 --- a/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py +++ b/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py @@ -317,7 +317,7 @@ def from_vllm_config(cls, vllm_config: VllmConfig, vllm_config=vllm_config, ipc_path=ipc_path, usage_context=usage_context, - disable_log_requests=disable_log_requests + disable_log_requests=disable_log_requests, disable_log_stats=disable_log_stats, ) diff --git a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py index bc2cd79c84b..053d1c6e5cf 100644 --- a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py +++ b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py @@ -68,9 +68,9 @@ def _ipex_llm_convert(load_in_low_bit): from ipex_llm.vllm.xpu.ipex_llm_v1_wrapper import get_ipex_llm_v1_wrapper import vllm.executor.ray_utils as ray_utils_v0 import vllm.v1.executor.ray_utils as ray_utils_v1 - # from vllm.v1.worker.gpu_model_runner import GPUModelRunner + from vllm.v1.worker.gpu_model_runner import GPUModelRunner setattr(XPUModelRunner, "load_model", get_load_function(load_in_low_bit)) - # setattr(GPUModelRunner, "load_model", get_load_function(load_in_low_bit)) + setattr(GPUModelRunner, "load_model", get_load_function(load_in_low_bit)) setattr(ray_utils_v0, "RayWorkerWrapper", get_ipex_llm_wrapper(load_in_low_bit)) setattr(ray_utils_v1, "RayWorkerWrapper", get_ipex_llm_v1_wrapper(load_in_low_bit)) From 82e88f6f6e5260c779a3e537fd4def9ca27be89f Mon Sep 17 00:00:00 2001 From: xiangyuT Date: Wed, 26 Mar 2025 12:01:45 +0800 Subject: [PATCH 05/19] fix --- 
.../src/ipex_llm/vllm/xpu/engine/engine.py | 55 +++++++++++++------ .../vllm/xpu/entrypoints/openai/api_server.py | 12 ++-- .../src/ipex_llm/vllm/xpu/model_convert.py | 2 + 3 files changed, 48 insertions(+), 21 deletions(-) diff --git a/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py b/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py index e9ee2023408..d4a2ae130e1 100644 --- a/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py +++ b/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py @@ -38,6 +38,7 @@ class IPEXLLMAsyncLLMEngine(AsyncLLMEngine): + _is_converted = False def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -53,7 +54,9 @@ def from_engine_args( ) -> "AsyncLLMEngine": """Creates an async LLM engine from the engine arguments.""" # Create the engine configs. - _ipex_llm_convert(load_in_low_bit) + if not cls._is_converted: + _ipex_llm_convert(load_in_low_bit) + cls._is_converted = True return super().from_engine_args(engine_args=engine_args, engine_config=engine_config, start_engine_loop=start_engine_loop, usage_context=usage_context, stat_loggers=stat_loggers) @@ -69,9 +72,10 @@ def from_vllm_config( disable_log_stats: bool = False, load_in_low_bit: str = "sym_int4", ) -> "AsyncLLMEngine": - _ipex_llm_convert(load_in_low_bit) + if not cls._is_converted: + _ipex_llm_convert(load_in_low_bit) + cls._is_converted = True return super().from_vllm_config( - cls=cls, vllm_config=vllm_config, start_engine_loop=start_engine_loop, usage_context=usage_context, @@ -82,7 +86,7 @@ def from_vllm_config( class IPEXLLMAsyncV1Engine(AsyncLLM): - + _is_converted = False def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -96,7 +100,9 @@ def from_engine_args( load_in_low_bit: str = "sym_int4", stat_loggers: Optional[Dict[str, StatLoggerBase]]=None, # noqa ) -> "AsyncLLM": - _ipex_llm_convert(load_in_low_bit) + if not cls._is_converted: + _ipex_llm_convert(load_in_low_bit) + cls._is_converted = True return super().from_engine_args(engine_args=engine_args, engine_config=engine_config, start_engine_loop=start_engine_loop, usage_context=usage_context, stat_loggers=stat_loggers) @@ -112,9 +118,10 @@ def from_vllm_config( disable_log_stats: bool = False, load_in_low_bit: str = "sym_int4", ) -> "AsyncLLM": - _ipex_llm_convert(load_in_low_bit) + if not cls._is_converted: + _ipex_llm_convert(load_in_low_bit) + cls._is_converted = True return super().from_vllm_config( - cls=cls, vllm_config=vllm_config, start_engine_loop=start_engine_loop, usage_context=usage_context, @@ -209,6 +216,7 @@ def __init__( # Logic to switch between engines is done at runtime instead of import # to avoid import order issues self.engine_class = self.get_engine_class() + # print("!!! ", load_in_low_bit) self.llm_engine = self.engine_class.from_engine_args( engine_args, usage_context=UsageContext.LLM_CLASS, load_in_low_bit=load_in_low_bit) @@ -224,6 +232,7 @@ def get_engine_class() -> Type[LLMEngine]: class IPEXLLMLLMV1Engine(V1LLMEngine): + _is_converted = False def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -239,7 +248,9 @@ def from_engine_args( """Creates an LLM engine from the engine arguments.""" # Create the engine configs. 
- _ipex_llm_convert(load_in_low_bit) + if not cls._is_converted: + _ipex_llm_convert(load_in_low_bit) + cls._is_converted = True return super().from_engine_args(engine_args, usage_context, stat_loggers, @@ -254,9 +265,10 @@ def from_vllm_config( disable_log_stats: bool = False, load_in_low_bit: str = "sym_int4", ) -> "LLMEngine": - _ipex_llm_convert(load_in_low_bit) + if not cls._is_converted: + _ipex_llm_convert(load_in_low_bit) + cls._is_converted = True return super().from_vllm_config( - cls=cls, vllm_config=vllm_config, usage_context=usage_context, stat_loggers=stat_loggers, @@ -264,6 +276,7 @@ def from_vllm_config( ) class IPEXLLMLLMEngine(LLMEngine): + _is_converted = False def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -277,7 +290,9 @@ def from_engine_args( ) -> "LLMEngine": """Creates an LLM engine from the engine arguments.""" # Create the engine configs. - _ipex_llm_convert(load_in_low_bit) + if not cls._is_converted: + _ipex_llm_convert(load_in_low_bit) + cls._is_converted = True return super().from_engine_args(engine_args, usage_context, stat_loggers) @classmethod @@ -289,9 +304,10 @@ def from_vllm_config( disable_log_stats: bool = False, load_in_low_bit: str = "sym_int4", ) -> "LLMEngine": - _ipex_llm_convert(load_in_low_bit) + if not cls._is_converted: + _ipex_llm_convert(load_in_low_bit) + cls._is_converted = True return super().from_vllm_config( - cls=cls, vllm_config=vllm_config, usage_context=usage_context, stat_loggers=stat_loggers, @@ -299,10 +315,16 @@ def from_vllm_config( ) class IPEXLLMMQLLMEngine(MQLLMEngine): + _is_converted = False + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + @classmethod def from_engine_args(cls, engine_args: AsyncEngineArgs, usage_context: UsageContext, ipc_path: str, load_in_low_bit: str): - _ipex_llm_convert(load_in_low_bit) + if not cls._is_converted: + _ipex_llm_convert(load_in_low_bit) + cls._is_converted = True return super().from_engine_args(engine_args, usage_context, ipc_path) @classmethod @@ -311,9 +333,10 @@ def from_vllm_config(cls, vllm_config: VllmConfig, disable_log_requests: bool, disable_log_stats: bool, ipc_path: str, load_in_low_bit: str) -> "MQLLMEngine": - _ipex_llm_convert(load_in_low_bit) + if not cls._is_converted: + _ipex_llm_convert(load_in_low_bit) + cls._is_converted = True return super().from_vllm_config( - cls=cls, vllm_config=vllm_config, ipc_path=ipc_path, usage_context=usage_context, diff --git a/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py b/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py index ae8909c30f9..881d1ddeea4 100644 --- a/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py +++ b/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py @@ -31,9 +31,9 @@ import vllm.envs as envs from vllm.config import ModelConfig from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine # type: ignore +from ipex_llm.vllm.xpu.engine import IPEXLLMAsyncLLMEngine as AsyncLLMEngine # type: ignore from vllm.engine.multiprocessing.client import MQLLMEngineClient -from vllm.engine.multiprocessing.engine import run_mp_engine +from ipex_llm.vllm.xpu.engine import run_mp_engine from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import load_chat_template from vllm.entrypoints.launcher import serve_http @@ -175,7 +175,8 @@ async def build_async_engine_client_from_engine_args( vllm_config=vllm_config, usage_context=usage_context, 
disable_log_requests=engine_args.disable_log_requests, - disable_log_stats=engine_args.disable_log_stats) + disable_log_stats=engine_args.disable_log_stats, + load_in_low_bit=load_in_low_bit) yield async_llm finally: if async_llm: @@ -191,7 +192,8 @@ async def build_async_engine_client_from_engine_args( vllm_config=vllm_config, usage_context=usage_context, disable_log_requests=engine_args.disable_log_requests, - disable_log_stats=engine_args.disable_log_stats) + disable_log_stats=engine_args.disable_log_stats, + load_in_low_bit=load_in_low_bit) yield engine_client finally: if engine_client and hasattr(engine_client, "shutdown"): @@ -235,7 +237,7 @@ async def build_async_engine_client_from_engine_args( target=run_mp_engine, args=(vllm_config, UsageContext.OPENAI_API_SERVER, ipc_path, engine_args.disable_log_stats, - engine_args.disable_log_requests, engine_alive)) + engine_args.disable_log_requests, load_in_low_bit, engine_alive)) engine_process.start() engine_pid = engine_process.pid assert engine_pid is not None, "Engine process failed to start." diff --git a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py index 053d1c6e5cf..1f3bd87b802 100644 --- a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py +++ b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py @@ -63,6 +63,8 @@ def _model_sample_convert(): def _ipex_llm_convert(load_in_low_bit): + # import pdb + # pdb.set_trace() from vllm.worker.xpu_model_runner import XPUModelRunner from ipex_llm.vllm.xpu.ipex_llm_wrapper import get_ipex_llm_wrapper from ipex_llm.vllm.xpu.ipex_llm_v1_wrapper import get_ipex_llm_v1_wrapper From 8faaa62c45051a1e2c0a97a1ee2325adb49a6de9 Mon Sep 17 00:00:00 2001 From: xiangyuT Date: Wed, 26 Mar 2025 14:11:36 +0800 Subject: [PATCH 06/19] fix --- .../src/ipex_llm/vllm/xpu/engine/engine.py | 32 ++++++++++++------- .../vllm/xpu/entrypoints/openai/api_server.py | 6 ---- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py b/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py index d4a2ae130e1..7e1b5cfc480 100644 --- a/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py +++ b/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py @@ -344,26 +344,34 @@ def from_vllm_config(cls, vllm_config: VllmConfig, disable_log_stats=disable_log_stats, ) +from vllm.transformers_utils.config import ( + maybe_register_config_serialize_by_value) +def signal_handler(*_) -> None: + raise KeyboardInterrupt("MQLLMEngine terminated") + +def run_mp_engine(vllm_config: VllmConfig, usage_context: UsageContext, + ipc_path: str, disable_log_stats: bool, + disable_log_requests: bool, load_in_low_bit, engine_alive): + try: + # Ensure we can serialize transformer config before spawning + maybe_register_config_serialize_by_value() -def run_mp_engine(engine_args: AsyncEngineArgs, usage_context: UsageContext, - ipc_path: str, load_in_low_bit: str, engine_alive): - - def signal_handler(*_) -> None: - # Interrupt server on sigterm - raise KeyboardInterrupt("MQLLMEngine terminated") # noqa + engine = IPEXLLMMQLLMEngine.from_vllm_config( + vllm_config=vllm_config, + usage_context=usage_context, + disable_log_stats=disable_log_stats, + disable_log_requests=disable_log_requests, + load_in_low_bit=load_in_low_bit, + ipc_path=ipc_path) - try: signal.signal(signal.SIGTERM, signal_handler) - engine = IPEXLLMMQLLMEngine.from_engine_args(engine_args=engine_args, - usage_context=usage_context, - ipc_path=ipc_path, - load_in_low_bit=load_in_low_bit) 
engine.start() + except BaseException as e: logger.exception(e) engine_alive.value = False - raise e # noqa + raise e if os.getenv("VLLM_USE_V1"): IPEXLLMAsyncLLMEngine = IPEXLLMAsyncV1Engine diff --git a/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py b/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py index 881d1ddeea4..6666ba896c5 100644 --- a/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py +++ b/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py @@ -487,13 +487,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request): status_code=generator.code) elif isinstance(generator, CompletionResponse): return JSONResponse(content=generator.model_dump()) - - - if flag is not None: - return StreamingResponse(content=stream_generator(generator, request, request_id), media_type="text/event-stream") - if flag is not None: - return StreamingResponse(content=stream_generator(generator, request, request_id), media_type="text/event-stream") return StreamingResponse(content=generator, media_type="text/event-stream") From b5f1e1fb81667edfa7c79f4f607865914d1d2c81 Mon Sep 17 00:00:00 2001 From: xiangyuT Date: Wed, 26 Mar 2025 14:28:37 +0800 Subject: [PATCH 07/19] fix --- python/llm/src/ipex_llm/vllm/xpu/engine/__init__.py | 4 +++- .../src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/llm/src/ipex_llm/vllm/xpu/engine/__init__.py b/python/llm/src/ipex_llm/vllm/xpu/engine/__init__.py index a3cec88f2b0..b3d11b0eb1b 100644 --- a/python/llm/src/ipex_llm/vllm/xpu/engine/__init__.py +++ b/python/llm/src/ipex_llm/vllm/xpu/engine/__init__.py @@ -13,10 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from .engine import IPEXLLMAsyncLLMEngine, IPEXLLMLLMEngine, IPEXLLMClass, run_mp_engine +from .engine import IPEXLLMAsyncLLMEngine, IPEXLLMLLMEngine, IPEXLLMClass, run_mp_engine, IPEXLLMAsyncV1Engine, IPEXLLMLLMV1Engine __all__ = [ "IPEXLLMAsyncLLMEngine", "IPEXLLMLLMEngine", "IPEXLLMClass", + "IPEXLLMAsyncV1Engine", + "IPEXLLMLLMV1Engine", "run_mp_engine", ] diff --git a/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py b/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py index 6666ba896c5..d26ecfbd0d4 100644 --- a/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py +++ b/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py @@ -168,7 +168,7 @@ async def build_async_engine_client_from_engine_args( "V1 is enabled, but got --disable-frontend-multiprocessing. 
" "To disable frontend multiprocessing, set VLLM_USE_V1=0.") - from vllm.v1.engine.async_llm import AsyncLLM + from ipex_llm.vllm.xpu.engine import IPEXLLMAsyncV1Engine as AsyncLLM async_llm: Optional[AsyncLLM] = None try: async_llm = AsyncLLM.from_vllm_config( From dbe7ab9c1cecd919922853cf2990e5c629621e91 Mon Sep 17 00:00:00 2001 From: xiangyuT Date: Thu, 27 Mar 2025 09:55:56 +0800 Subject: [PATCH 08/19] add log pr --- .../vllm/xpu/entrypoints/openai/api_server.py | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py b/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py index d26ecfbd0d4..60d029a8f1f 100644 --- a/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py +++ b/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py @@ -300,6 +300,33 @@ async def validate_json_request(raw_request: Request): ) +save_dict = {} +import os +flag = os.getenv("VLLM_LOG_OUTPUT", None) +async def stream_generator(generator, request, request_id): + async for chunk in generator: + if request_id not in save_dict: + save_dict[request_id] = "" + import json + try: + data = chunk.strip() + if data.startswith('data: '): + data = data[len('data: '):] + else: + yield chunk + json_data = json.loads(data) + if 'choices' in json_data and len(json_data['choices']) > 0: + choice = json_data['choices'][0] + if 'delta' in choice: + save_dict[request_id] += choice["delta"]["content"] + elif 'text' in choice: + save_dict[request_id] += choice["text"] + except json.JSONDecodeError: + print(f"Received request_id: {request_id}, request: {request} content: {save_dict[request_id]}") + pass # Done + yield chunk + + router = APIRouter() @@ -460,6 +487,10 @@ async def create_chat_completion(request: ChatCompletionRequest, return base(raw_request).create_error_response( message="The model does not support Chat Completions API") + if flag is not None: + request_id = "chatcmpl-" \ + f"{handler._base_request_id(raw_request, request.request_id)}" + print(f"First received request_id: {request_id}, request: {request}") generator = await handler.create_chat_completion(request, raw_request) if isinstance(generator, ErrorResponse): @@ -467,8 +498,12 @@ async def create_chat_completion(request: ChatCompletionRequest, status_code=generator.code) elif isinstance(generator, ChatCompletionResponse): + if flag is not None: + print(f"Received request-id:{request_id}, request:{request}, Output:{generator.model_dump()}") return JSONResponse(content=generator.model_dump()) + if flag is not None: + return StreamingResponse(content=stream_generator(generator, request, request_id), media_type="text/event-stream") return StreamingResponse(content=generator, media_type="text/event-stream") @@ -481,13 +516,21 @@ async def create_completion(request: CompletionRequest, raw_request: Request): return base(raw_request).create_error_response( message="The model does not support Completions API") + if flag is not None: + request_id = f"cmpl-{handler._base_request_id(raw_request)}" + print(f"First received request_id: {request_id}, request: {request}") + generator = await handler.create_completion(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) elif isinstance(generator, CompletionResponse): + if flag is not None: + print(f"Received request-id:{request_id}, request:{request}, Output:{generator.model_dump()}") return 
JSONResponse(content=generator.model_dump()) + if flag is not None: + return StreamingResponse(content=stream_generator(generator, request, request_id), media_type="text/event-stream") return StreamingResponse(content=generator, media_type="text/event-stream") From c1e614f5a8ed53af63ff22a6f18322560fe5de16 Mon Sep 17 00:00:00 2001 From: xiangyuT Date: Mon, 7 Apr 2025 14:36:01 +0800 Subject: [PATCH 09/19] update --- .../vllm/xpu/entrypoints/openai/api_server.py | 95 +++++++++++++++---- 1 file changed, 79 insertions(+), 16 deletions(-) diff --git a/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py b/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py index 60d029a8f1f..f07cb245888 100644 --- a/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py +++ b/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py @@ -24,6 +24,7 @@ from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, Response, StreamingResponse +from starlette.concurrency import iterate_in_threadpool from starlette.datastructures import State from starlette.routing import Mount from typing_extensions import assert_never @@ -35,7 +36,9 @@ from vllm.engine.multiprocessing.client import MQLLMEngineClient from ipex_llm.vllm.xpu.engine import run_mp_engine from vllm.engine.protocol import EngineClient -from vllm.entrypoints.chat_utils import load_chat_template +from vllm.entrypoints.chat_utils import (load_chat_template, + resolve_hf_chat_template, + resolve_mistral_chat_template) from vllm.entrypoints.launcher import serve_http from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.cli_args import (make_arg_parser, @@ -65,7 +68,6 @@ TranscriptionRequest, TranscriptionResponse, UnloadLoRAAdapterRequest) -from vllm.entrypoints.openai.reasoning_parsers import ReasoningParserManager # yapf: enable from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion @@ -80,12 +82,15 @@ from vllm.entrypoints.openai.serving_transcription import ( OpenAIServingTranscription) from vllm.entrypoints.openai.tool_parsers import ToolParserManager -from vllm.entrypoints.utils import load_aware_call, with_cancellation +from vllm.entrypoints.utils import (cli_env_setup, load_aware_call, + with_cancellation) from vllm.logger import init_logger +from vllm.reasoning import ReasoningParserManager from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) +from vllm.transformers_utils.tokenizer import MistralTokenizer from vllm.usage.usage_lib import UsageContext -from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path, +from vllm.utils import (Device, FlexibleArgumentParser, get_open_zmq_ipc_path, is_valid_ipv6_address, set_ulimit) from vllm.version import __version__ as VLLM_VERSION @@ -337,6 +342,7 @@ def mount_metrics(app: FastAPI): # See https://prometheus.github.io/client_python/multiprocess/ from prometheus_client import (CollectorRegistry, make_asgi_app, multiprocess) + from prometheus_fastapi_instrumentator import Instrumentator prometheus_multiproc_dir_path = os.getenv("PROMETHEUS_MULTIPROC_DIR", None) if prometheus_multiproc_dir_path is not None: @@ -344,6 +350,16 @@ def mount_metrics(app: FastAPI): prometheus_multiproc_dir_path) registry = CollectorRegistry() multiprocess.MultiProcessCollector(registry) + Instrumentator( + excluded_handlers=[ + "/metrics", + 
"/health", + "/load", + "/ping", + "/version", + ], + registry=registry, + ).add().instrument(app).expose(app) # Add prometheus asgi middleware to route /metrics requests metrics_route = Mount("/metrics", make_asgi_app(registry=registry)) @@ -723,15 +739,18 @@ async def reset_prefix_cache(raw_request: Request): Reset the prefix cache. Note that we currently do not check if the prefix cache is successfully reset in the API server. """ - logger.info("Resetting prefix cache...") - await engine_client(raw_request).reset_prefix_cache() + device = None + device_str = raw_request.query_params.get("device") + if device_str is not None: + device = Device[device_str.upper()] + logger.info("Resetting prefix cache with specific %s...", str(device)) + await engine_client(raw_request).reset_prefix_cache(device) return Response(status_code=200) @router.post("/sleep") async def sleep(raw_request: Request): # get POST params level = raw_request.query_params.get("level", "1") - logger.info("sleep the engine with level %s", level) await engine_client(raw_request).sleep(int(level)) # FIXME: in v0 with frontend multiprocessing, the sleep command # is sent but does not finish yet when we return a response. @@ -739,8 +758,12 @@ async def sleep(raw_request: Request): @router.post("/wake_up") async def wake_up(raw_request: Request): - logger.info("wake up the engine") - await engine_client(raw_request).wake_up() + tags = raw_request.query_params.getlist("tags") + if tags == []: + # set to None to wake up all tags if no tags are provided + tags = None + logger.info("wake up the engine with tags: %s", tags) + await engine_client(raw_request).wake_up(tags) # FIXME: in v0 with frontend multiprocessing, the wake-up command # is sent but does not finish yet when we return a response. return Response(status_code=200) @@ -856,7 +879,8 @@ async def validation_exception_handler(_, exc): return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST) - if token := envs.VLLM_API_KEY or args.api_key: + # Ensure --api-key option from CLI takes precedence over VLLM_API_KEY + if token := args.api_key or envs.VLLM_API_KEY: @app.middleware("http") async def authentication(request: Request, call_next): @@ -885,6 +909,21 @@ async def add_request_id(request: Request, call_next): response.headers["X-Request-Id"] = request_id return response + if envs.VLLM_DEBUG_LOG_API_SERVER_RESPONSE: + logger.warning("CAUTION: Enabling log response in the API Server. " + "This can include sensitive information and should be " + "avoided in production.") + + @app.middleware("http") + async def log_response(request: Request, call_next): + response = await call_next(request) + response_body = [ + section async for section in response.body_iterator + ] + response.body_iterator = iterate_in_threadpool(iter(response_body)) + logger.info("response_body={%s}", response_body[0].decode()) + return response + for middleware in args.middleware: module_path, object_name = middleware.rsplit(".", 1) imported = getattr(importlib.import_module(module_path), object_name) @@ -925,8 +964,26 @@ async def init_app_state( resolved_chat_template = load_chat_template(args.chat_template) if resolved_chat_template is not None: - logger.info("Using supplied chat template:\n%s", - resolved_chat_template) + # Get the tokenizer to check official template + tokenizer = await engine_client.get_tokenizer() + + if isinstance(tokenizer, MistralTokenizer): + # The warning is logged in resolve_mistral_chat_template. 
+ resolved_chat_template = resolve_mistral_chat_template( + chat_template=resolved_chat_template) + else: + hf_chat_template = resolve_hf_chat_template( + tokenizer, + chat_template=None, + tools=None, + trust_remote_code=model_config.trust_remote_code) + + if hf_chat_template != resolved_chat_template: + logger.warning( + "Using supplied chat template: %s\n" + "It is different from official chat template '%s'. " + "This discrepancy may lead to performance degradation.", + resolved_chat_template, args.model) state.openai_serving_models = OpenAIServingModels( engine_client=engine_client, @@ -1078,6 +1135,9 @@ def _listen_addr(a: str) -> str: host=args.host, port=args.port, log_level=args.uvicorn_log_level, + # NOTE: When the 'disable_uvicorn_access_log' value is True, + # no access log will be output. + access_log=not args.disable_uvicorn_access_log, timeout_keep_alive=TIMEOUT_KEEP_ALIVE, ssl_keyfile=args.ssl_keyfile, ssl_certfile=args.ssl_certfile, @@ -1087,16 +1147,19 @@ def _listen_addr(a: str) -> str: ) # NB: Await server shutdown only after the backend context is exited - await shutdown_task - - sock.close() + try: + await shutdown_task + finally: + sock.close() if __name__ == "__main__": # NOTE(simon): - # This section should be in sync with vllm/scripts.py for CLI entrypoints. + # This section should be in sync with vllm/entrypoints/cli/main.py for CLI + # entrypoints. logger.warning("Warning: Please use `ipex_llm.vllm.xpu.entrypoints.openai.api_server` " "instead of `vllm.entrypoints.openai.api_server` to start the API server") + cli_env_setup() parser = FlexibleArgumentParser( description="vLLM OpenAI-Compatible RESTful API server.") parser = make_arg_parser(parser) From 7b8c72ed3463fa058183f380ab985f70ef518667 Mon Sep 17 00:00:00 2001 From: xiangyuT Date: Tue, 8 Apr 2025 15:35:07 +0800 Subject: [PATCH 10/19] update for qwen3 --- python/llm/src/ipex_llm/vllm/xpu/model_convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py index 1f3bd87b802..25a0131bcde 100644 --- a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py +++ b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py @@ -100,7 +100,7 @@ def _ipex_llm_load_model(self) -> None: "codegeex4-all" in self.vllm_config.model_config.model.lower() or "chatglm" in self.vllm_config.model_config.model.lower()) and \ "gptq" not in self.model_config.model.lower() and \ - "awq" not in self.model_config.model.lower(): + "awq" not in self.model_config.model.lower() and "qwen3" not in self.model_config.model.lower(): self.model.apply(padding_mlp) from ipex_llm import optimize_model not_convert_last_mlp = os.getenv("IPEX_LLM_NOT_CONVERT_LAST_MLP", None) From 913ff88858836181e0a1c5d8e1a8a495c969d022 Mon Sep 17 00:00:00 2001 From: xiangyuT Date: Fri, 11 Apr 2025 10:21:16 +0800 Subject: [PATCH 11/19] add ReplicatedLinear --- python/llm/src/ipex_llm/transformers/convert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py index d61d804d09c..8edf1e32c1c 100644 --- a/python/llm/src/ipex_llm/transformers/convert.py +++ b/python/llm/src/ipex_llm/transformers/convert.py @@ -150,12 +150,12 @@ def is_linear_module(module): if _VLLM_VERSION is None: _VLLM_VERSION = get_package_version('vllm') from vllm.model_executor.layers.linear import ( - ColumnParallelLinear, RowParallelLinear, QKVParallelLinear, 
MergedColumnParallelLinear + ColumnParallelLinear, RowParallelLinear, QKVParallelLinear, MergedColumnParallelLinear, ReplicatedLinear ) from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead VLLM_LINEAR_LIST = [ ColumnParallelLinear, RowParallelLinear, QKVParallelLinear, - MergedColumnParallelLinear, + MergedColumnParallelLinear, ReplicatedLinear, ] if 'xpu' in _VLLM_VERSION: VLLM_LINEAR_LIST.append(ParallelLMHead) From 1f2cdaa42abfbfadf3e4d30ccc8640f5591961b8 Mon Sep 17 00:00:00 2001 From: xiangyuT Date: Mon, 28 Apr 2025 14:03:11 +0800 Subject: [PATCH 12/19] format --- .../llm/src/ipex_llm/transformers/convert.py | 3 +- .../src/ipex_llm/vllm/xpu/engine/engine.py | 28 +++++++++++++------ .../src/ipex_llm/vllm/xpu/model_convert.py | 4 ++- 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py index 8edf1e32c1c..efd8839eb06 100644 --- a/python/llm/src/ipex_llm/transformers/convert.py +++ b/python/llm/src/ipex_llm/transformers/convert.py @@ -150,7 +150,8 @@ def is_linear_module(module): if _VLLM_VERSION is None: _VLLM_VERSION = get_package_version('vllm') from vllm.model_executor.layers.linear import ( - ColumnParallelLinear, RowParallelLinear, QKVParallelLinear, MergedColumnParallelLinear, ReplicatedLinear + ColumnParallelLinear, RowParallelLinear, QKVParallelLinear, + MergedColumnParallelLinear, ReplicatedLinear ) from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead VLLM_LINEAR_LIST = [ diff --git a/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py b/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py index 7e1b5cfc480..acce03c430d 100644 --- a/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py +++ b/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py @@ -39,6 +39,7 @@ class IPEXLLMAsyncLLMEngine(AsyncLLMEngine): _is_converted = False + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -67,7 +68,7 @@ def from_vllm_config( vllm_config: VllmConfig, start_engine_loop: bool = True, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[dict[str, StatLoggerBase]] = None, + stat_loggers: Optional[dict[str, StatLoggerBase]]=None, disable_log_requests: bool = False, disable_log_stats: bool = False, load_in_low_bit: str = "sym_int4", @@ -87,6 +88,7 @@ def from_vllm_config( class IPEXLLMAsyncV1Engine(AsyncLLM): _is_converted = False + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -113,7 +115,7 @@ def from_vllm_config( vllm_config: VllmConfig, start_engine_loop: bool = True, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[dict[str, StatLoggerBase]] = None, + stat_loggers: Optional[dict[str, StatLoggerBase]]=None, disable_log_requests: bool = False, disable_log_stats: bool = False, load_in_low_bit: str = "sym_int4", @@ -130,7 +132,9 @@ def from_vllm_config( disable_log_stats=disable_log_stats, ) + class IPEXLLMClass(LLM): + def __init__( self, model: str, @@ -152,8 +156,8 @@ def __init__( max_seq_len_to_capture: int = 8192, disable_custom_all_reduce: bool = False, disable_async_output_proc: bool = False, - hf_overrides: Optional[HfOverrides] = None, - mm_processor_kwargs: Optional[dict[str, Any]] = None, + hf_overrides: Optional[HfOverrides]=None, + mm_processor_kwargs: Optional[dict[str, Any]]=None, # After positional args are removed, move this right below `model` task: TaskOption = "auto", override_pooler_config: Optional[PoolerConfig] 
= None, @@ -233,6 +237,7 @@ def get_engine_class() -> Type[LLMEngine]: class IPEXLLMLLMV1Engine(V1LLMEngine): _is_converted = False + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -261,7 +266,7 @@ def from_vllm_config( cls, vllm_config: VllmConfig, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, + stat_loggers: Optional[Dict[str, StatLoggerBase]]=None, disable_log_stats: bool = False, load_in_low_bit: str = "sym_int4", ) -> "LLMEngine": @@ -275,8 +280,10 @@ def from_vllm_config( disable_log_stats=disable_log_stats ) + class IPEXLLMLLMEngine(LLMEngine): _is_converted = False + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -300,7 +307,7 @@ def from_vllm_config( cls, vllm_config: VllmConfig, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, + stat_loggers: Optional[Dict[str, StatLoggerBase]]=None, disable_log_stats: bool = False, load_in_low_bit: str = "sym_int4", ) -> "LLMEngine": @@ -314,8 +321,10 @@ def from_vllm_config( disable_log_stats=disable_log_stats ) + class IPEXLLMMQLLMEngine(MQLLMEngine): _is_converted = False + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -346,8 +355,11 @@ def from_vllm_config(cls, vllm_config: VllmConfig, from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) + + def signal_handler(*_) -> None: - raise KeyboardInterrupt("MQLLMEngine terminated") + raise KeyboardInterrupt("MQLLMEngine terminated") # noqa + def run_mp_engine(vllm_config: VllmConfig, usage_context: UsageContext, ipc_path: str, disable_log_stats: bool, @@ -371,7 +383,7 @@ def run_mp_engine(vllm_config: VllmConfig, usage_context: UsageContext, except BaseException as e: logger.exception(e) engine_alive.value = False - raise e + raise e # noqa if os.getenv("VLLM_USE_V1"): IPEXLLMAsyncLLMEngine = IPEXLLMAsyncV1Engine diff --git a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py index 25a0131bcde..e0e2e23b5b0 100644 --- a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py +++ b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py @@ -100,7 +100,8 @@ def _ipex_llm_load_model(self) -> None: "codegeex4-all" in self.vllm_config.model_config.model.lower() or "chatglm" in self.vllm_config.model_config.model.lower()) and \ "gptq" not in self.model_config.model.lower() and \ - "awq" not in self.model_config.model.lower() and "qwen3" not in self.model_config.model.lower(): + "awq" not in self.model_config.model.lower() and \ + "qwen3moe" not in self.model_config.model.lower(): self.model.apply(padding_mlp) from ipex_llm import optimize_model not_convert_last_mlp = os.getenv("IPEX_LLM_NOT_CONVERT_LAST_MLP", None) @@ -140,6 +141,7 @@ def _ipex_llm_load_model(self) -> None: self.model_memory_usage = m.consumed_memory logger = init_logger(__name__) + logger.info(self.model) logger.info("Loading model weights took %.4f GB", self.model_memory_usage / float(2**30)) From 8f23290376d9f4789e8ef24eb64a9765d30dabc9 Mon Sep 17 00:00:00 2001 From: xiangyuT Date: Mon, 28 Apr 2025 14:11:32 +0800 Subject: [PATCH 13/19] update dockerfile --- docker/llm/serving/xpu/docker/Dockerfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/llm/serving/xpu/docker/Dockerfile b/docker/llm/serving/xpu/docker/Dockerfile index 3bc8b40d5ff..5ea0cb9a6fd 100644 --- a/docker/llm/serving/xpu/docker/Dockerfile +++ 
b/docker/llm/serving/xpu/docker/Dockerfile @@ -154,11 +154,13 @@ RUN set -eux && \ rm -rf /tmp/neo && \ # # Install vllm - git clone -b 0.6.6 https://github.com/analytics-zoo/vllm.git /llm/vllm && \ + git clone -b 0.8.3 https://github.com/analytics-zoo/vllm.git /llm/vllm && \ cd /llm/vllm && \ pip install setuptools-scm && \ pip install --upgrade cmake && \ VLLM_TARGET_DEVICE=xpu pip install --no-build-isolation -v /llm/vllm && \ + pip install intel-extension-for-pytorch==2.6.10+xpu --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ && \ + pip uninstall -y oneccl oneccl-devel && \ pip install mpi4py fastapi uvicorn openai && \ pip install ray numba From 993256758c8e9e95f27370179e9606bb4638576b Mon Sep 17 00:00:00 2001 From: xiangyuT Date: Tue, 29 Apr 2025 10:43:29 +0800 Subject: [PATCH 14/19] Fix and update --- .../xpu/docker/1ccl_for_multi_arc.patch | 181 +- docker/llm/serving/xpu/docker/Dockerfile | 19 +- .../serving/xpu/docker/start-vllm-service.sh | 2 + .../xpu/docker/vllm_for_multi_arc.patch | 37077 +++------------- 4 files changed, 6624 insertions(+), 30655 deletions(-) diff --git a/docker/llm/serving/xpu/docker/1ccl_for_multi_arc.patch b/docker/llm/serving/xpu/docker/1ccl_for_multi_arc.patch index 5fde7daa747..7c0c8fcd795 100644 --- a/docker/llm/serving/xpu/docker/1ccl_for_multi_arc.patch +++ b/docker/llm/serving/xpu/docker/1ccl_for_multi_arc.patch @@ -1,7 +1,9 @@ -From d345631f78a2f33ff1ddd7d9908b288eb0afaf46 Mon Sep 17 00:00:00 2001 -From: Huajun Li -Date: Fri, 24 May 2024 09:47:26 +0800 -Subject: [PATCH 1/3] allreduce optimization with LL256 for Arc770 dGPU +From dfe1851b59df6859829b447353307b7c916ccee0 Mon Sep 17 00:00:00 2001 +From: junhansh +Date: Mon, 28 Apr 2025 23:33:11 +0800 +Subject: [PATCH] oneccl for Arc770 V2025.0.0.6.7 + +allreduce optimization with LL256 for Arc770 dGPU To enable this feature, please set env var: export CCL_DG2_ALLREDUCE=1 @@ -12,6 +14,15 @@ Build: 3. cmake .. -GNinja -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DCMAKE_CXX_FLAGS="-fsycl" -DCOMPUTE_BACKEND=dpcpp -DCMAKE_BUILD_TYPE=MinSizeRel 4. ninja 5. ls -al src/libccl* + +Changes: +optimize req_workgroup calculate + +Revert "optimize req_workgroup calculate" for hang issue + +This reverts commit 20bfd0e0a37f93dfb8bb9c092cd5a0b35e868bfa. 
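As the squashed commit message above states, the LL256 allreduce path for the Arc770 stays disabled unless CCL_DG2_ALLREDUCE=1 is present in the serving environment. If the flag is set from Python rather than the shell, it has to be exported before oneCCL comes up through torch-ccl; a small sketch, with the timing assumption called out:

    import os

    # Assumption: oneCCL reads CCL_DG2_ALLREDUCE when its collectives are set
    # up, so the variable must be in place before the engine / distributed
    # backend starts.
    os.environ.setdefault("CCL_DG2_ALLREDUCE", "1")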
+ +fix_fdset_buffer_overflow_issue --- src/CMakeLists.txt | 2 + src/coll/coll.cpp | 30 +- @@ -20,9 +31,9 @@ Build: src/common/env/env.cpp | 1 + src/common/env/env.hpp | 1 + src/common/env/vars.hpp | 1 + - src/dg2/dg2_allreduce.cpp | 642 +++++++++++++++++++++++++++++++ + src/dg2/dg2_allreduce.cpp | 640 +++++++++++++++++++++++++++++++ src/dg2/dg2_allreduce.hpp | 13 + - 9 files changed, 693 insertions(+), 3 deletions(-) + 9 files changed, 691 insertions(+), 3 deletions(-) create mode 100644 src/dg2/dg2_allreduce.cpp create mode 100644 src/dg2/dg2_allreduce.hpp @@ -163,10 +174,10 @@ index 73dcf77..84ab518 100644 constexpr const char* CCL_MIN_CHUNK_SIZE = "CCL_MIN_CHUNK_SIZE"; diff --git a/src/dg2/dg2_allreduce.cpp b/src/dg2/dg2_allreduce.cpp new file mode 100644 -index 0000000..15ace74 +index 0000000..73e114b --- /dev/null +++ b/src/dg2/dg2_allreduce.cpp -@@ -0,0 +1,642 @@ +@@ -0,0 +1,640 @@ +#include +#include +#include @@ -178,7 +189,7 @@ index 0000000..15ace74 +#include + +#include -+ ++#include +#include +#include +#include @@ -315,7 +326,6 @@ index 0000000..15ace74 + +static void *thread_func(void *arg) +{ -+ fd_set fds; + int count = 0; + char sock_path[64]; + int peer_buf_fd = 0; @@ -323,6 +333,10 @@ index 0000000..15ace74 + + snprintf(sock_path, sizeof(sock_path), "%s-%d_%d", SOCK_PATH, rank, 0xa770); + int srv_fd = srv_sock(sock_path); ++ if (srv_fd < 0) { ++ perror("srv_sock failed"); ++ return nullptr; ++ } + + //std::cout << "-----> srv_fd of " << sock_path << " : " << srv_fd << "\n"; + @@ -331,35 +345,30 @@ index 0000000..15ace74 + ze_context_handle_t ze_context = sycl::get_native(sycl_context); + ze_device_handle_t ze_device = sycl::get_native(sycl_device); + -+ FD_ZERO(&fds); -+ FD_SET(srv_fd, &fds); ++ struct pollfd pfd = { ++ .fd = srv_fd, ++ .events = POLL_IN, ++ .revents = 0 ++ }; + while (++count < world_size) { -+ int ret = select(srv_fd + 1, &fds, NULL, NULL, NULL); -+ switch (ret) { -+ case 1: -+ { -+ int peer_rank; -+ void *peer_buf; -+ -+ int conn_fd = accept(srv_fd, NULL, 0); -+ ccl::utils::recvmsg_fd(conn_fd, &peer_buf_fd, &peer_rank, sizeof(peer_rank)); -+ -+ ze_ipc_mem_handle_t ipc_handle_peer_buf = get_handle_from_fd(peer_buf_fd); -+ zeMemOpenIpcHandle(ze_context, ze_device, ipc_handle_peer_buf, ZE_IPC_MEMORY_FLAG_BIAS_CACHED /* cached allocation */, &peer_buf); ++ int ret = poll(&pfd, 1, -1); ++ if (ret <= 0) { ++ std::cerr << "poll failed: " << strerror(errno) << "\n"; ++ break; ++ } + -+ peer_bufs[peer_rank] = peer_buf; -+ //printf("<------------- rank: %d, peer_bufs[%d]: %p\n", world_rank, peer_rank, peer_bufs[peer_rank]); ++ if (pfd.revents & POLL_IN) { ++ int peer_rank; ++ void *peer_buf = nullptr; + -+ if (conn_fd > 0) close(conn_fd); ++ int conn_fd = accept(srv_fd, NULL, 0); ++ ccl::utils::recvmsg_fd(conn_fd, &peer_buf_fd, &peer_rank, sizeof(peer_rank)); ++ ze_ipc_mem_handle_t ipc_handle_peer_buf = get_handle_from_fd(peer_buf_fd); ++ zeMemOpenIpcHandle(ze_context, ze_device, ipc_handle_peer_buf, ZE_IPC_MEMORY_FLAG_BIAS_CACHED, &peer_buf); + -+ break; -+ } -+ case 0: -+ case -1: -+ std::cout << "srv_fd select() failed" << "\n"; -+ break; -+ default: -+ break; ++ peer_bufs[peer_rank] = peer_buf; ++ //printf("<------------- rank: %d, peer_bufs[%d]: %p\n", world_rank, peer_rank, peer_bufs[peer_rank]); ++ if (conn_fd > 0) close(conn_fd); + } + } + @@ -831,105 +840,3 @@ index 0000000..0506445 -- 2.34.1 - -From 20bfd0e0a37f93dfb8bb9c092cd5a0b35e868bfa Mon Sep 17 00:00:00 2001 -From: Huajun Li -Date: Fri, 7 Mar 2025 15:15:35 +0800 -Subject: [PATCH 2/3] optimize 
req_workgroup calculate - ---- - src/dg2/dg2_allreduce.cpp | 25 ++----------------------- - 1 file changed, 2 insertions(+), 23 deletions(-) - -diff --git a/src/dg2/dg2_allreduce.cpp b/src/dg2/dg2_allreduce.cpp -index 15ace74..83270ae 100644 ---- a/src/dg2/dg2_allreduce.cpp -+++ b/src/dg2/dg2_allreduce.cpp -@@ -527,30 +527,9 @@ ccl::event dg2_ll256_allreduce(const void *src, void *dst, size_t count, - auto chunk_sz = req_workitems * LS_SZ; /* LS_SZ bytes per work-item */ - auto chunk_with_pattern = sg_sz * LS_SZ; /* aligned to 256B */ - -- /* items will be assigned to each rank */ -- auto per_rank_items = (unreduced + (local_world_size * LS_SZ - 1)) / (local_world_size * LS_SZ); -- auto req_workgroups = (per_rank_items + (workgroup_available_items - 1)) / workgroup_available_items; -- auto req_subgroups = 0; -- -- if (req_workgroups >= g_sz/l_sz) { -- req_workgroups = g_sz/l_sz; -- } else { -- if (group_id == (req_workgroups - 1)) { -- req_subgroups = (per_rank_items + (sg_sz - 1)) / (sg_sz - 1); -- -- /* (req_subgroups % (l_sz/sg_sz) - 1) equals to the final subgroup id in a workgroup */ -- /* Note: req_subgroups % (l_sz/sg_sz) might be 0 */ -- if (((req_subgroups % (l_sz/sg_sz)) == 0) || (sg_id == (req_subgroups % (l_sz/sg_sz) - 1))) { -- if ((per_rank_items % (sg_sz - 1)) != 0) { -- /* FIXME: */ -- req_workitems = per_rank_items % (sg_sz - 1); -- chunk_sz = req_workitems * LS_SZ; /* LS_SZ bytes per work-item */ -- } -- } -- } -- } -+ auto work_left = unreduced - sg_id * local_world_size * chunk_sz; - -- if (group_id < req_workgroups) { -+ if (work_left > 0) { - // step 1: push data to next GPU - { - offset = base + local_world_rank * chunk_sz; --- -2.34.1 - - -From 1c58cc9ede5ca38138a270f9e5ff59bca74f51d4 Mon Sep 17 00:00:00 2001 -From: Huajun Li -Date: Wed, 12 Mar 2025 13:29:27 +0800 -Subject: [PATCH 3/3] Revert "optimize req_workgroup calculate" for hang issue - -This reverts commit 20bfd0e0a37f93dfb8bb9c092cd5a0b35e868bfa. 
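The sub-patches being dropped here record that a simpler work_left check once replaced the ceiling-division sizing and caused a hang, so the squashed patch keeps the original arithmetic shown in the hunk. Restated in Python purely for readability (names mirror the kernel variables; the function itself is not part of the patch):

    def ceil_div(a: int, b: int) -> int:
        return (a + b - 1) // b

    def required_workgroups(unreduced: int, local_world_size: int, ls_sz: int,
                            workgroup_available_items: int,
                            max_workgroups: int) -> int:
        # bytes still to reduce -> work-items assigned to each rank
        per_rank_items = ceil_div(unreduced, local_world_size * ls_sz)
        # work-items per rank -> work-groups needed, capped at the launch size
        return min(ceil_div(per_rank_items, workgroup_available_items),
                   max_workgroups)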
---- - src/dg2/dg2_allreduce.cpp | 25 +++++++++++++++++++++++-- - 1 file changed, 23 insertions(+), 2 deletions(-) - -diff --git a/src/dg2/dg2_allreduce.cpp b/src/dg2/dg2_allreduce.cpp -index 83270ae..15ace74 100644 ---- a/src/dg2/dg2_allreduce.cpp -+++ b/src/dg2/dg2_allreduce.cpp -@@ -527,9 +527,30 @@ ccl::event dg2_ll256_allreduce(const void *src, void *dst, size_t count, - auto chunk_sz = req_workitems * LS_SZ; /* LS_SZ bytes per work-item */ - auto chunk_with_pattern = sg_sz * LS_SZ; /* aligned to 256B */ - -- auto work_left = unreduced - sg_id * local_world_size * chunk_sz; -+ /* items will be assigned to each rank */ -+ auto per_rank_items = (unreduced + (local_world_size * LS_SZ - 1)) / (local_world_size * LS_SZ); -+ auto req_workgroups = (per_rank_items + (workgroup_available_items - 1)) / workgroup_available_items; -+ auto req_subgroups = 0; -+ -+ if (req_workgroups >= g_sz/l_sz) { -+ req_workgroups = g_sz/l_sz; -+ } else { -+ if (group_id == (req_workgroups - 1)) { -+ req_subgroups = (per_rank_items + (sg_sz - 1)) / (sg_sz - 1); -+ -+ /* (req_subgroups % (l_sz/sg_sz) - 1) equals to the final subgroup id in a workgroup */ -+ /* Note: req_subgroups % (l_sz/sg_sz) might be 0 */ -+ if (((req_subgroups % (l_sz/sg_sz)) == 0) || (sg_id == (req_subgroups % (l_sz/sg_sz) - 1))) { -+ if ((per_rank_items % (sg_sz - 1)) != 0) { -+ /* FIXME: */ -+ req_workitems = per_rank_items % (sg_sz - 1); -+ chunk_sz = req_workitems * LS_SZ; /* LS_SZ bytes per work-item */ -+ } -+ } -+ } -+ } - -- if (work_left > 0) { -+ if (group_id < req_workgroups) { - // step 1: push data to next GPU - { - offset = base + local_world_rank * chunk_sz; --- -2.34.1 - - diff --git a/docker/llm/serving/xpu/docker/Dockerfile b/docker/llm/serving/xpu/docker/Dockerfile index dae899ced1a..7b5eaeaad5e 100644 --- a/docker/llm/serving/xpu/docker/Dockerfile +++ b/docker/llm/serving/xpu/docker/Dockerfile @@ -54,6 +54,7 @@ RUN set -eux && \ # # Install Intel PyTorch extension for LLM inference pip install --pre --upgrade ipex-llm[xpu_2.6] --extra-index-url https://download.pytorch.org/whl/xpu && \ + pip install intel-extension-for-pytorch==2.6.10+xpu --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ && \ # # Build torch-ccl mkdir -p /build && \ @@ -64,7 +65,7 @@ RUN set -eux && \ git submodule sync && \ git submodule update --init --recursive && \ # This patch will enable build torch-ccl with pytorch 2.6 environment - git apply /tmp/ccl_torch.patch && \ + # git apply /tmp/ccl_torch.patch && \ USE_SYSTEM_ONECCL=ON COMPUTE_BACKEND=dpcpp python setup.py bdist_wheel && \ # File path: /build/torch-ccl/dist/oneccl_bind_pt-2.6.0+xpu-cp311-cp311-linux_x86_64.whl # Build oneCCL @@ -85,7 +86,7 @@ RUN set -eux && \ FROM intel/oneapi-basekit:2025.0.1-0-devel-ubuntu22.04 # Copy the built torch-ccl package from the build stage -COPY --from=build /build/torch-ccl/dist/oneccl_bind_pt-2.5.0+xpu-cp311-cp311-linux_x86_64.whl /opt/ +COPY --from=build /build/torch-ccl/dist/oneccl_bind_pt-2.6.0+xpu-cp311-cp311-linux_x86_64.whl /opt/ COPY --from=build /llm/ /llm/ COPY --from=build /build/oneCCL/build/src/libccl.so.1.0 /opt/intel/1ccl-wks/lib/ COPY --from=build /build/oneCCL/build/src/libccl.so.1 /opt/intel/1ccl-wks/lib/ @@ -144,9 +145,10 @@ RUN set -eux && \ # Install vllm dependencies pip install --upgrade fastapi && \ pip install --upgrade "uvicorn[standard]" && \ + pip install intel-extension-for-pytorch==2.6.10+xpu --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ && \ # # Install torch-ccl 
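In the runtime image above, the wheel copied out of the build stage moves from oneccl_bind_pt 2.5.0+xpu to 2.6.0+xpu so that it lines up with intel-extension-for-pytorch 2.6.10+xpu and the PyTorch 2.6 stack. A quick interpreter check along these lines (a sketch, not part of the patch) can confirm inside the container that the three pieces import together:

    import torch
    import intel_extension_for_pytorch as ipex
    import oneccl_bindings_for_pytorch  # noqa: F401  (registers the "ccl" backend)

    print("torch:", torch.__version__, "ipex:", ipex.__version__)
    # Assumes an XPU-enabled torch build, as installed by this Dockerfile.
    print("XPU available:", torch.xpu.is_available())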
- pip install /opt/oneccl_bind_pt-2.5.0+xpu-cp311-cp311-linux_x86_64.whl && \ + pip install /opt/oneccl_bind_pt-2.6.0+xpu-cp311-cp311-linux_x86_64.whl && \ # apt-get update && \ apt-get install -y --no-install-recommends libfabric-dev wrk libaio-dev numactl && \ @@ -169,18 +171,23 @@ RUN set -eux && \ cd /llm && \ rm -rf /tmp/neo && \ # Install vllm - git clone -b 0.8.3 https://github.com/analytics-zoo/vllm.git /llm/vllm && \ + git clone -b v0.8.3 https://github.com/vllm-project/vllm /llm/vllm && \ cd /llm/vllm && \ git apply /llm/vllm_for_multi_arc.patch && \ - pip install setuptools-scm && \ + pip install setuptools-scm==8.2.0 setuptools==78.1.0 && \ pip install --upgrade cmake && \ - VLLM_TARGET_DEVICE=xpu pip install --no-build-isolation -v /llm/vllm && \ + pip install -v -r requirements/xpu.txt && \ + VLLM_TARGET_DEVICE=xpu python setup.py install && \ pip install intel-extension-for-pytorch==2.6.10+xpu --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ && \ pip uninstall -y oneccl oneccl-devel && \ rm -rf /llm/vllm_for_multi_arc.patch && \ pip install mpi4py fastapi uvicorn openai && \ pip install ray numba +# Install ipex-llm, should remove before merge. +RUN git clone -b vllm_083_0407 https://github.com/xiangyuT/ipex-llm.git /llm/ipex-llm && \ + cp -r /llm/ipex-llm/python/llm/src/ipex_llm/vllm/xpu /usr/local/lib/python3.11/dist-packages/ipex_llm/vllm && \ + cp /llm/ipex-llm/python/llm/src/ipex_llm/transformers/convert.py /usr/local/lib/python3.11/dist-packages/ipex_llm/transformers/convert.py WORKDIR /llm/ ENTRYPOINT ["bash", "/llm/start-vllm-service.sh"] diff --git a/docker/llm/serving/xpu/docker/start-vllm-service.sh b/docker/llm/serving/xpu/docker/start-vllm-service.sh index 43a9dc7e333..924ddb53ce7 100644 --- a/docker/llm/serving/xpu/docker/start-vllm-service.sh +++ b/docker/llm/serving/xpu/docker/start-vllm-service.sh @@ -32,6 +32,8 @@ export TORCH_LLM_ALLREDUCE=0 export CCL_SAME_STREAM=1 export CCL_BLOCKING_WAIT=0 +export IPEX_LLM_LOWBIT=$LOAD_IN_LOW_BIT + source /opt/intel/1ccl-wks/setvars.sh python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \ diff --git a/docker/llm/serving/xpu/docker/vllm_for_multi_arc.patch b/docker/llm/serving/xpu/docker/vllm_for_multi_arc.patch index e2121a65cea..eb80d89c9c9 100644 --- a/docker/llm/serving/xpu/docker/vllm_for_multi_arc.patch +++ b/docker/llm/serving/xpu/docker/vllm_for_multi_arc.patch @@ -1,157 +1,8 @@ -diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml -index 708e54872..679abf181 100644 ---- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml -+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml -@@ -1,5 +1,6 @@ - steps: - - label: "Wait for container to be ready" -+ key: wait-for-container-image - agents: - queue: A100 - plugins: -@@ -10,12 +11,11 @@ steps: - command: - - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh - -- - wait -- - - label: "A100" - # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" - agents: - queue: A100 -+ depends_on: wait-for-container-image - plugins: - - kubernetes: - podSpec: -@@ -49,6 +49,7 @@ steps: - # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" - agents: - queue: H200 -+ depends_on: wait-for-container-image - plugins: - - docker#v5.12.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT -@@ -73,7 +74,7 @@ steps: - # skip: "use this flag to conditionally skip the benchmark 
step, useful for PR testing" - agents: - queue: H100 -- depends_on: block-h100 -+ depends_on: wait-for-container-image - plugins: - - docker#v5.12.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT -diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml -index b563c9634..529daf54f 100644 ---- a/.buildkite/test-pipeline.yaml -+++ b/.buildkite/test-pipeline.yaml -@@ -106,14 +106,12 @@ steps: - source_file_dependencies: - - vllm/ - commands: -- - pip install -e ./plugins/vllm_add_dummy_model - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py - - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py -- - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process - - pytest -v -s entrypoints/test_chat_utils.py - - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - -@@ -333,8 +331,6 @@ steps: - - vllm/ - - tests/models - commands: -- - pip install -e ./plugins/vllm_add_dummy_model -- - pytest -v -s models/test_oot_registration.py # it needs a clean process - - pytest -v -s models/test_registry.py - - pytest -v -s models/test_initialization.py - -@@ -360,23 +356,25 @@ steps: - - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' - - pytest -v -s models/embedding/language -m 'not core_model' - --- label: Multi-Modal Models Test (Standard) # 28min -+- label: Multi-Modal Models Test (Standard) # 40min - #mirror_hardwares: [amd] - source_file_dependencies: - - vllm/ - - tests/models/decoder_only/audio_language - - tests/models/decoder_only/vision_language - - tests/models/embedding/vision_language -+ - tests/models/encoder_decoder/audio_language - - tests/models/encoder_decoder/vision_language - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model' - - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model' - - pytest -v -s models/embedding/vision_language -m core_model -+ - pytest -v -s models/encoder_decoder/audio_language -m core_model - - pytest -v -s models/encoder_decoder/language -m core_model - - pytest -v -s models/encoder_decoder/vision_language -m core_model - --- label: Multi-Modal Models Test (Extended) 1 # 1h16m -+- label: Multi-Modal Models Test (Extended) 1 # 48m - optional: true - source_file_dependencies: - - vllm/ -@@ -469,11 +467,28 @@ steps: - - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)' - - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)' - - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py -- - pip install -e ./plugins/vllm_add_dummy_model -- - pytest -v -s distributed/test_distributed_oot.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s 
kv_transfer/disagg_test.py - -+- label: Plugin Tests (2 GPUs) # 40min -+ working_dir: "/vllm-workspace/tests" -+ num_gpus: 2 -+ fast_check: true -+ source_file_dependencies: -+ - vllm/plugins/ -+ - tests/plugins/ -+ commands: -+ # begin platform plugin tests, all the code in-between runs on dummy platform -+ - pip install -e ./plugins/vllm_add_dummy_platform -+ - pytest -v -s plugins_tests/test_platform_plugins.py -+ - pip uninstall vllm_add_dummy_platform -y -+ # end platform plugin tests -+ # other tests continue here: -+ - pip install -e ./plugins/vllm_add_dummy_model -+ - pytest -v -s distributed/test_distributed_oot.py -+ - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process -+ - pytest -v -s models/test_oot_registration.py # it needs a clean process -+ - - label: Multi-step Tests (4 GPUs) # 36min - working_dir: "/vllm-workspace/tests" - num_gpus: 4 -diff --git a/.github/ISSUE_TEMPLATE/400-bug report.yml b/.github/ISSUE_TEMPLATE/400-bug-report.yml -similarity index 100% -rename from .github/ISSUE_TEMPLATE/400-bug report.yml -rename to .github/ISSUE_TEMPLATE/400-bug-report.yml -diff --git a/.github/ISSUE_TEMPLATE/500-feature request.yml b/.github/ISSUE_TEMPLATE/500-feature-request.yml -similarity index 100% -rename from .github/ISSUE_TEMPLATE/500-feature request.yml -rename to .github/ISSUE_TEMPLATE/500-feature-request.yml -diff --git a/.github/ISSUE_TEMPLATE/600-new model.yml b/.github/ISSUE_TEMPLATE/600-new-model.yml -similarity index 100% -rename from .github/ISSUE_TEMPLATE/600-new model.yml -rename to .github/ISSUE_TEMPLATE/600-new-model.yml -diff --git a/.github/ISSUE_TEMPLATE/700-performance discussion.yml b/.github/ISSUE_TEMPLATE/700-performance-discussion.yml -similarity index 100% -rename from .github/ISSUE_TEMPLATE/700-performance discussion.yml -rename to .github/ISSUE_TEMPLATE/700-performance-discussion.yml -diff --git a/.github/ISSUE_TEMPLATE/800-misc discussion.yml b/.github/ISSUE_TEMPLATE/800-misc-discussion.yml -similarity index 100% -rename from .github/ISSUE_TEMPLATE/800-misc discussion.yml -rename to .github/ISSUE_TEMPLATE/800-misc-discussion.yml diff --git a/CMakeLists.txt b/CMakeLists.txt -index 83c803343..f3e2d0953 100644 +index 15db4a4f4..1870e976e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt -@@ -83,6 +83,24 @@ endif() +@@ -80,6 +80,24 @@ endif() # find_package(Torch REQUIRED) @@ -176,7 +27,7 @@ index 83c803343..f3e2d0953 100644 # # Forward the non-CUDA device extensions to external CMake scripts. # -@@ -90,6 +108,10 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND +@@ -87,6 +105,10 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND NOT VLLM_TARGET_DEVICE STREQUAL "rocm") if (VLLM_TARGET_DEVICE STREQUAL "cpu") include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake) @@ -187,459 +38,24 @@ index 83c803343..f3e2d0953 100644 else() return() endif() -@@ -223,13 +245,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") - FetchContent_Declare( - cutlass - GIT_REPOSITORY https://github.com/nvidia/cutlass.git -- GIT_TAG 8aa95dbb888be6d81c6fbf7169718c5244b53227 -+ GIT_TAG v3.6.0 - GIT_PROGRESS TRUE - - # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history. - # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags. 
- # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE -- GIT_SHALLOW FALSE -+ GIT_SHALLOW TRUE - ) - endif() - FetchContent_MakeAvailable(cutlass) -@@ -242,112 +264,19 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") - "csrc/custom_all_reduce.cu" - "csrc/permute_cols.cu" - "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" -- "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu" -- "csrc/sparse/cutlass/sparse_compressor_entry.cu" -- "csrc/cutlass_extensions/common.cpp") -- -- set_gencode_flags_for_srcs( -- SRCS "${VLLM_EXT_SRC}" -- CUDA_ARCHS "${CUDA_ARCHS}") -- -- # Only build Marlin kernels if we are building for at least some compatible archs. -- # Keep building Marlin for 9.0 as there are some group sizes and shapes that -- # are not supported by Machete yet. -- cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS}) -- if (MARLIN_ARCHS) -- set(MARLIN_SRCS -- "csrc/quantization/fp8/fp8_marlin.cu" -- "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" -- "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" -- "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu" -- "csrc/quantization/gptq_marlin/gptq_marlin.cu" -- "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" -- "csrc/quantization/gptq_marlin/awq_marlin_repack.cu") -- set_gencode_flags_for_srcs( -- SRCS "${MARLIN_SRCS}" -- CUDA_ARCHS "${MARLIN_ARCHS}") -- list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}") -- message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}") -- else() -- message(STATUS "Not building Marlin kernels as no compatible archs found" -- " in CUDA target architectures") -- endif() -- -- # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require -- # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now). -- cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}") -- if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) -- set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu") -- set_gencode_flags_for_srcs( -- SRCS "${SRCS}" -- CUDA_ARCHS "${SCALED_MM_3X_ARCHS}") -- list(APPEND VLLM_EXT_SRC "${SRCS}") -- list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1") -- message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}") -- else() -- if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) -- message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is " -- "not >= 12.0, we recommend upgrading to CUDA 12.0 or " -- "later if you intend on running FP8 quantized models on " -- "Hopper.") -- else() -- message(STATUS "Not building scaled_mm_c3x as no compatible archs found " -- "in CUDA target architectures") -- endif() -- -- # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't -- # build any 3x kernels -- set(SCALED_MM_3X_ARCHS) -- endif() -+ "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu" -+ "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu") - - # -- # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) -- # kernels for the remaining archs that are not already built for 3x. 
-- cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS -- "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") -- # subtract out the archs that are already built for 3x -- list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) -- if (SCALED_MM_2X_ARCHS) -- set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu") -- set_gencode_flags_for_srcs( -- SRCS "${SRCS}" -- CUDA_ARCHS "${SCALED_MM_2X_ARCHS}") -- list(APPEND VLLM_EXT_SRC "${SRCS}") -- list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C2X=1") -- message(STATUS "Building scaled_mm_c2x for archs: ${SCALED_MM_2X_ARCHS}") -- else() -- if (SCALED_MM_3X_ARCHS) -- message(STATUS "Not building scaled_mm_c2x as all archs are already built" -- " for and covered by scaled_mm_c3x") -- else() -- message(STATUS "Not building scaled_mm_c2x as no compatible archs found " -- "in CUDA target architectures") -- endif() -- endif() -- -- # -- # 2:4 Sparse Kernels -- -- # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor -- # require CUDA 12.2 or later (and only work on Hopper, 9.0/9.0a for now). -- if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS) -- set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu" -- "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu") -- set_gencode_flags_for_srcs( -- SRCS "${SRCS}" -- CUDA_ARCHS "${SCALED_MM_3X_ARCHS}") -- list(APPEND VLLM_EXT_SRC "${SRCS}") -- list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1") -- message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}") -- else() -- if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS) -- message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is " -- "not >= 12.2, we recommend upgrading to CUDA 12.2 or later " -- "if you intend on running FP8 sparse quantized models on Hopper.") -- else() -- message(STATUS "Not building sparse_scaled_mm_c3x as no compatible archs found " -- "in CUDA target architectures") -- endif() -+ # The CUTLASS kernels for Hopper require sm90a to be enabled. -+ # This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a. -+ # That adds an extra 17MB to compiled binary, so instead we selectively enable it. 
-+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0) -+ set_source_files_properties( -+ "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu" -+ PROPERTIES -+ COMPILE_FLAGS -+ "-gencode arch=compute_90a,code=sm_90a") - endif() - - -@@ -550,7 +479,7 @@ else() - FetchContent_Declare( - vllm-flash-attn - GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git -- GIT_TAG 04325b6798bcc326c86fb35af62d05a9c8c8eceb -+ GIT_TAG 96266b1111111f3d11aabefaf3bacbab6a89d03c - GIT_PROGRESS TRUE - # Don't share the vllm-flash-attn build between build types - BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn -diff --git a/Dockerfile b/Dockerfile -index 153bff9cf..088314eb3 100644 ---- a/Dockerfile -+++ b/Dockerfile -@@ -234,8 +234,8 @@ RUN mv vllm test_docs/ - #################### TEST IMAGE #################### - - #################### OPENAI API SERVER #################### --# openai api server alternative --FROM vllm-base AS vllm-openai -+# base openai image with additional requirements, for any subsequent openai-style images -+FROM vllm-base AS vllm-openai-base - - # install additional dependencies for openai api server - RUN --mount=type=cache,target=/root/.cache/pip \ -@@ -247,5 +247,14 @@ RUN --mount=type=cache,target=/root/.cache/pip \ - - ENV VLLM_USAGE_SOURCE production-docker-image - -+# define sagemaker first, so it is not default from `docker build` -+FROM vllm-openai-base AS vllm-sagemaker -+ -+COPY examples/sagemaker-entrypoint.sh . -+RUN chmod +x sagemaker-entrypoint.sh -+ENTRYPOINT ["./sagemaker-entrypoint.sh"] -+ -+FROM vllm-openai-base AS vllm-openai -+ - ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] - #################### OPENAI API SERVER #################### -diff --git a/Dockerfile.neuron b/Dockerfile.neuron -index 77162bc82..269139fe9 100644 ---- a/Dockerfile.neuron -+++ b/Dockerfile.neuron -@@ -1,6 +1,6 @@ - # default base image - # https://gallery.ecr.aws/neuron/pytorch-inference-neuronx --ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.2-ubuntu20.04" -+ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04" - - FROM $BASE_IMAGE - -@@ -22,9 +22,9 @@ WORKDIR ${APP_MOUNT}/vllm - - RUN python3 -m pip install --upgrade pip - RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas --RUN python3 -m pip install sentencepiece transformers==4.36.2 -U -+RUN python3 -m pip install sentencepiece transformers==4.45.2 -U - RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U -+RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U - - COPY . . 
- ARG GIT_REPO_CHECK=0 -diff --git a/Dockerfile.xpu b/Dockerfile.xpu -index a374f20d7..07f98a62f 100644 ---- a/Dockerfile.xpu -+++ b/Dockerfile.xpu -@@ -1,4 +1,4 @@ --FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base -+FROM intel/oneapi-basekit:2025.0.1-0-devel-ubuntu22.04 AS vllm-base - - RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \ - echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \ -@@ -21,7 +21,8 @@ RUN apt-get update -y && \ - python3 \ - python3-dev \ - python3-pip \ -- # vim \ -+ libze-intel-gpu-dev \ -+ libze-intel-gpu1 \ - wget - - WORKDIR /workspace/vllm -@@ -32,21 +33,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \ - pip install --no-cache-dir \ - -r requirements-xpu.txt - --RUN git clone https://github.com/intel/pti-gpu && \ -- cd pti-gpu/sdk && \ -- git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \ -- mkdir build && \ -- cd build && \ -- cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \ -- make -j && \ -- cmake --install . --config Release --prefix "/usr/local" -- - ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/" - - COPY . . --ARG GIT_REPO_CHECK --RUN --mount=type=bind,source=.git,target=.git \ -- if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi - - ENV VLLM_TARGET_DEVICE=xpu - diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py -index b67849038..4408ca545 100644 +index ea70a1f48..d311ec9d9 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py -@@ -240,7 +240,7 @@ async def async_request_openai_completions( +@@ -259,7 +259,7 @@ async def async_request_openai_completions( + "prompt": request_func_input.prompt, "temperature": 0.0, - "best_of": request_func_input.best_of, "max_tokens": request_func_input.output_len, - "logprobs": request_func_input.logprobs, + "min_tokens": request_func_input.output_len, "stream": True, - "ignore_eos": request_func_input.ignore_eos, - } -diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py -new file mode 100644 -index 000000000..13477ef53 ---- /dev/null -+++ b/benchmarks/benchmark_long_document_qa_throughput.py -@@ -0,0 +1,184 @@ -+""" -+Offline benchmark to test the long document QA throughput. -+ -+Example usage: -+ # This command run the vllm with 50GB CPU memory for offloading -+ # The workload samples 8 different prompts with a default input -+ # length of 20000 tokens, then replicates each prompt 2 times -+ # in random order. -+ python benchmark_long_document_qa_throughput.py \ -+ --model meta-llama/Llama-2-7b-chat-hf \ -+ --enable-prefix-caching \ -+ --num-documents 8 \ -+ --repeat-count 2 -+ -+Commandline arguments: -+ --num-documents: The number of documents to sample prompts from. -+ -+ --document-length: The length of each document in tokens. -+ (Optional, default: 20000) -+ -+ --output-len: The number of tokens to generate for each prompt. -+ (Optional, default: 10) -+ -+ --repeat-count: The number of times to repeat each prompt. -+ (Optional, default: 2) -+ -+ --repeat-mode: The mode to repeat prompts. The supported modes are: -+ - 'random': shuffle the prompts randomly. 
(Default) -+ - 'tile': the entire prompt list is repeated in sequence. (Potentially -+ lowest cache hit) -+ - 'interleave': each prompt is repeated consecutively before -+ moving to the next element. (Highest cache hit) -+ -+ --shuffle-seed: Random seed when the repeat mode is "random". -+ (Optional, default: 0) -+ -+In the meantime, it also supports all the vLLM engine args to initialize the -+LLM engine. You can refer to the `vllm.engine.arg_utils.EngineArgs` for more -+details. -+""" -+ -+import dataclasses -+import random -+import time -+ -+from vllm import LLM, SamplingParams -+from vllm.engine.arg_utils import EngineArgs -+from vllm.utils import FlexibleArgumentParser -+ -+ -+def test_long_document_qa(llm=None, sampling_params=None, prompts=None): -+ """ -+ Test long document QA with the given prompts and sampling parameters. -+ Print the time spent in processing all the prompts. -+ -+ Args: -+ llm: The language model used for generating responses. -+ sampling_params: Sampling parameter used to generate the response. -+ prompts: A list of prompt strings to be processed by the LLM. -+ """ -+ start_time = time.time() -+ llm.generate(prompts, sampling_params=sampling_params) -+ end_time = time.time() -+ print(f"Time to execute all requests: {end_time - start_time:.4f} secs") -+ -+ -+def repeat_prompts(prompts, repeat_count, mode: str): -+ """ -+ Repeat each prompt in the list for a specified number of times. -+ The order of prompts in the output list depends on the mode. -+ -+ Args: -+ prompts: A list of prompts to be repeated. -+ repeat_count: The number of times each prompt is repeated. -+ mode: The mode of repetition. Supported modes are: -+ - 'random': Shuffle the prompts randomly after repetition. -+ - 'tile': Repeat the entire prompt list in sequence. -+ Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3]. -+ - 'interleave': Repeat each prompt consecutively before moving to -+ the next. Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3]. -+ -+ Returns: -+ A list of repeated prompts in the specified order. -+ -+ Raises: -+ ValueError: If an invalid mode is provided. 
-+ """ -+ print("Repeat mode: ", mode) -+ if mode == 'random': -+ repeated_prompts = prompts * repeat_count -+ random.shuffle(repeated_prompts) -+ return repeated_prompts -+ elif mode == 'tile': -+ return prompts * repeat_count -+ elif mode == 'interleave': -+ repeated_prompts = [] -+ for prompt in prompts: -+ repeated_prompts.extend([prompt] * repeat_count) -+ return repeated_prompts -+ else: -+ raise ValueError(f"Invalid mode: {mode}, only support " -+ "'random', 'tile', 'interleave'") -+ -+ -+def main(args): -+ random.seed(args.shuffle_seed) -+ -+ # Prepare the prompts: -+ # we append the document id at the beginning to avoid any of the document -+ # being the prefix of other documents -+ prompts = [ -+ str(i) + ' '.join(['hi'] * args.document_length) -+ for i in range(args.num_documents) -+ ] -+ -+ prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode) -+ -+ warmup_prompts = [ -+ "This is warm up request " + str(i) + \ -+ ' '.join(['hi'] * args.document_length) -+ for i in range(args.num_documents)] -+ -+ # Create the LLM engine -+ engine_args = EngineArgs.from_cli_args(args) -+ llm = LLM(**dataclasses.asdict(engine_args)) -+ sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) -+ -+ print("------warm up------") -+ test_long_document_qa( -+ llm=llm, -+ prompts=warmup_prompts, -+ sampling_params=sampling_params, -+ ) -+ -+ print("------start generating------") -+ test_long_document_qa( -+ llm=llm, -+ prompts=prompts, -+ sampling_params=sampling_params, -+ ) -+ -+ -+if __name__ == "__main__": -+ parser = FlexibleArgumentParser( -+ description= -+ 'Benchmark the performance with or without automatic prefix caching.') -+ -+ parser.add_argument( -+ '--document-length', -+ type=int, -+ # Roughly the number of tokens for a system paper, -+ # excluding images -+ default=20000, -+ help='Range of input lengths for sampling prompts,' -+ 'specified as "min:max" (e.g., "128:256").') -+ -+ parser.add_argument('--num-documents', -+ type=int, -+ default=8, -+ help='Range of input lengths for sampling prompts,' -+ 'specified as "min:max" (e.g., "128:256").') -+ -+ parser.add_argument('--output-len', type=int, default=10) -+ -+ parser.add_argument('--repeat-count', -+ type=int, -+ default=2, -+ help='Number of times to repeat each prompt') -+ -+ parser.add_argument("--repeat-mode", -+ type=str, -+ default='random', -+ help='The mode to repeat prompts. The supported ' -+ 'modes are "random", "tile", and "interleave". 
' -+ 'See repeat_prompts() in the source code for details.') -+ -+ parser.add_argument("--shuffle-seed", -+ type=int, -+ default=0, -+ help='Random seed when the repeat mode is "random"') -+ -+ parser = EngineArgs.add_cli_args(parser) -+ args = parser.parse_args() -+ main(args) + "stream_options": { + "include_usage": True, diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py -index 5e9381f71..a0930c2fc 100644 +index 4fff7a8fc..531770259 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py -@@ -33,7 +33,8 @@ from typing import List, Optional, Tuple +@@ -35,7 +35,8 @@ from typing import Optional from transformers import PreTrainedTokenizerBase @@ -649,17 +65,17 @@ index 5e9381f71..a0930c2fc 100644 from vllm.engine.arg_utils import EngineArgs from vllm.utils import FlexibleArgumentParser -@@ -190,7 +191,7 @@ def main(args): +@@ -192,7 +193,7 @@ def main(args): engine_args = EngineArgs.from_cli_args(args) - llm = LLM(**dataclasses.asdict(engine_args)) + llm = LLM(**dataclasses.asdict(engine_args), load_in_low_bit=args.load_in_low_bit) - sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) - -@@ -242,6 +243,13 @@ if __name__ == "__main__": - "when dataset-path is not provided.", + sampling_params = SamplingParams(temperature=0, + max_tokens=args.output_len, +@@ -252,6 +253,13 @@ if __name__ == "__main__": + "detokenization time in the latency measurement)"), ) + parser.add_argument( @@ -674,1274 +90,11 @@ index 5e9381f71..a0930c2fc 100644 - main(args) + main(args) \ No newline at end of file -diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py -index 4eb0e1f8a..7d1653e95 100644 ---- a/benchmarks/benchmark_serving.py -+++ b/benchmarks/benchmark_serving.py -@@ -779,7 +779,8 @@ def main(args: argparse.Namespace): - np.random.seed(args.seed) - - backend = args.backend -- model_id = args.model -+ # model_id = args.model -+ model_id = args.model.split('/')[-1] - tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model - tokenizer_mode = args.tokenizer_mode - -diff --git a/benchmarks/benchmark_serving_input.py b/benchmarks/benchmark_serving_input.py -new file mode 100644 -index 000000000..6c0b10ef0 ---- /dev/null -+++ b/benchmarks/benchmark_serving_input.py -@@ -0,0 +1,1243 @@ -+r"""Benchmark online serving throughput. -+ -+On the server side, run one of the following commands: -+ vLLM OpenAI API server -+ vllm serve \ -+ --swap-space 16 \ -+ --disable-log-requests -+ -+ (TGI backend) -+ ./launch_tgi_server.sh -+ -+On the client side, run: -+ python benchmarks/benchmark_serving.py \ -+ --backend \ -+ --model \ -+ --dataset-name sharegpt \ -+ --dataset-path \ -+ --request-rate \ # By default is inf -+ --num-prompts # By default is 1000 -+ -+ when using tgi backend, add -+ --endpoint /generate_stream -+ to the end of the command above. 
-+""" -+import argparse -+import asyncio -+import base64 -+import io -+import json -+import os -+import random -+import time -+import warnings -+from dataclasses import dataclass -+from datetime import datetime -+from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple -+ -+import numpy as np -+from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, -+ RequestFuncOutput) -+from datasets import load_dataset -+from PIL.Image import Image -+from tqdm.asyncio import tqdm -+from transformers import PreTrainedTokenizerBase -+ -+try: -+ from vllm.transformers_utils.tokenizer import get_tokenizer -+except ImportError: -+ from backend_request_func import get_tokenizer -+ -+try: -+ from vllm.utils import FlexibleArgumentParser -+except ImportError: -+ from argparse import ArgumentParser as FlexibleArgumentParser -+ -+MILLISECONDS_TO_SECONDS_CONVERSION = 1000 -+ -+ -+@dataclass -+class BenchmarkMetrics: -+ completed: int -+ total_input: int -+ total_output: int -+ request_throughput: float -+ request_goodput: float -+ output_throughput: float -+ total_token_throughput: float -+ mean_ttft_ms: float -+ median_ttft_ms: float -+ std_ttft_ms: float -+ percentiles_ttft_ms: List[Tuple[float, float]] -+ mean_tpot_ms: float -+ median_tpot_ms: float -+ std_tpot_ms: float -+ percentiles_tpot_ms: List[Tuple[float, float]] -+ mean_itl_ms: float -+ median_itl_ms: float -+ std_itl_ms: float -+ percentiles_itl_ms: List[Tuple[float, float]] -+ # E2EL stands for end-to-end latency per request. -+ # It is the time taken on the client side from sending -+ # a request to receiving a complete response. -+ mean_e2el_ms: float -+ median_e2el_ms: float -+ std_e2el_ms: float -+ percentiles_e2el_ms: List[Tuple[float, float]] -+ -+ -+def sample_sharegpt_requests( -+ dataset_path: str, -+ num_requests: int, -+ tokenizer: PreTrainedTokenizerBase, -+ fixed_output_len: Optional[int] = None, -+) -> List[Tuple[str, int, int, None]]: -+ # Load the dataset. -+ with open(dataset_path, encoding='utf-8') as f: -+ dataset = json.load(f) -+ # Filter out the conversations with less than 2 turns. -+ dataset = [data for data in dataset if len(data["conversations"]) >= 2] -+ # Only keep the first two turns of each conversation. -+ dataset = [(data["conversations"][0]["value"], -+ data["conversations"][1]["value"]) for data in dataset] -+ -+ # Shuffle the dataset. -+ random.shuffle(dataset) -+ -+ # Filter out sequences that are too long or too short -+ filtered_dataset: List[Tuple[str, int, int]] = [] -+ for i in range(len(dataset)): -+ if len(filtered_dataset) == num_requests: -+ break -+ -+ # Tokenize the prompts and completions. -+ prompt = dataset[i][0] -+ prompt_token_ids = tokenizer(prompt).input_ids -+ completion = dataset[i][1] -+ completion_token_ids = tokenizer(completion).input_ids -+ prompt_len = len(prompt_token_ids) -+ output_len = len(completion_token_ids -+ ) if fixed_output_len is None else fixed_output_len -+ if prompt_len < 4 or (fixed_output_len is None and output_len < 4): -+ # Prune too short sequences. -+ continue -+ if prompt_len > 1024 or prompt_len + output_len > 2048: -+ # Prune too long sequences. 
-+ continue -+ filtered_dataset.append((prompt, prompt_len, output_len, None)) -+ -+ return filtered_dataset -+ -+ -+def sample_sonnet_requests( -+ dataset_path: str, -+ num_requests: int, -+ input_len: int, -+ output_len: int, -+ prefix_len: int, -+ tokenizer: PreTrainedTokenizerBase, -+) -> List[Tuple[str, str, int, int, None]]: -+ assert ( -+ input_len > prefix_len -+ ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'." -+ -+ # Load the dataset. -+ with open(dataset_path, encoding='utf-8') as f: -+ poem_lines = f.readlines() -+ -+ # Tokenize the poem lines. -+ poem_token_ids = tokenizer(poem_lines).input_ids -+ average_poem_len = sum( -+ len(token_ids) for token_ids in poem_token_ids) / len(poem_token_ids) -+ -+ # Base prefix for all requests. -+ base_prompt = "Pick as many lines as you can from these poem lines:\n" -+ base_message = [{ -+ "role": "user", -+ "content": base_prompt, -+ }] -+ base_prompt_formatted = tokenizer.apply_chat_template( -+ base_message, add_generation_prompt=True, tokenize=False) -+ base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids) -+ -+ assert ( -+ input_len > base_prompt_offset -+ ), f"Please set 'args.sonnet-input-len' higher than {base_prompt_offset}." -+ num_input_lines = round( -+ (input_len - base_prompt_offset) / average_poem_len) -+ -+ # First approximately `prefix_len` number of tokens in the -+ # prompt are fixed poem lines. -+ assert ( -+ prefix_len > base_prompt_offset -+ ), f"Please set 'args.sonnet-prefix-len' higher than {base_prompt_offset}." -+ -+ num_prefix_lines = round( -+ (prefix_len - base_prompt_offset) / average_poem_len) -+ prefix_lines = poem_lines[:num_prefix_lines] -+ -+ # Sample the rest of lines per request. -+ sampled_requests: List[Tuple[str, int, int]] = [] -+ for _ in range(num_requests): -+ num_lines_needed = num_input_lines - num_prefix_lines -+ sampled_lines = "".join(prefix_lines + -+ random.choices(poem_lines, k=num_lines_needed)) -+ -+ prompt = f"{base_prompt}{sampled_lines}" -+ message = [ -+ { -+ "role": "user", -+ "content": prompt, -+ }, -+ ] -+ prompt_formatted = tokenizer.apply_chat_template( -+ message, add_generation_prompt=True, tokenize=False) -+ prompt_len = len(tokenizer(prompt_formatted).input_ids) -+ sampled_requests.append( -+ (prompt, prompt_formatted, prompt_len, output_len, None)) -+ -+ return sampled_requests -+ -+ -+def sample_mmmu_pro_vision_requests( -+ dataset, -+ num_requests: int, -+ tokenizer: PreTrainedTokenizerBase, -+ fixed_output_len: Optional[int] = None, -+) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]: -+ sampled_requests: List[Tuple[str, int, int, Dict[str, -+ Collection[str]]]] = [] -+ for data in dataset: -+ if len(sampled_requests) == num_requests: -+ break -+ -+ # MMMU-Pro vision direct prompt -+ # Ref: https://github.com/MMMU-Benchmark/MMMU/blob/6ce42f4d8f70c1841c67867152648974415b5cac/mmmu-pro/prompts.yaml#L5 -+ prompt = ( -+ "Answer with the option letter from the given choices directly. " -+ "The last line of your response should be of the following " -+ "format: 'Answer: $LETTER' (without quotes) where LETTER is one of " -+ "options.") -+ -+ prompt_token_ids = tokenizer(prompt).input_ids -+ if fixed_output_len is None: -+ # Default max output len is set to 128 -+ print("--hf-output-len is not provided. 
Using default value 128.") -+ fixed_output_len = 128 -+ -+ prompt_len = len(prompt_token_ids) -+ output_len = fixed_output_len -+ -+ assert isinstance( -+ data["image"], -+ Image), ("Input image format must be `PIL.Image.Image`, " -+ f"given {type(data['image'])}.") -+ image: Image = data["image"] -+ image = image.convert("RGB") -+ image_data = io.BytesIO() -+ image.save(image_data, format='JPEG') -+ image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") -+ mm_content = { -+ "type": "image_url", -+ "image_url": { -+ "url": f"data:image/jpeg;base64,{image_base64}" -+ }, -+ } -+ -+ sampled_requests.append((prompt, prompt_len, output_len, mm_content)) -+ -+ return sampled_requests -+ -+ -+def sample_hf_requests( -+ dataset_path: str, -+ dataset_subset: str, -+ dataset_split: str, -+ num_requests: int, -+ tokenizer: PreTrainedTokenizerBase, -+ random_seed: int, -+ fixed_output_len: Optional[int] = None, -+) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]: -+ -+ # Special case for MMMU-Pro vision dataset -+ if dataset_path == 'MMMU/MMMU_Pro' and dataset_subset == 'vision': -+ assert dataset_split == "test" -+ dataset = load_dataset(dataset_path, -+ name=dataset_subset, -+ split=dataset_split, -+ streaming=True) -+ assert "image" in dataset.features, ( -+ "MMMU/MMMU_Pro vision dataset must have 'image' column.") -+ filter_func = lambda x: isinstance(x["image"], Image) -+ dataset = dataset.shuffle(seed=random_seed).filter(filter_func) -+ return sample_mmmu_pro_vision_requests(dataset, num_requests, -+ tokenizer, fixed_output_len) -+ -+ dataset = load_dataset(dataset_path, -+ name=dataset_subset, -+ split=dataset_split, -+ streaming=True) -+ assert "conversations" in dataset.features, ( -+ "HF Dataset must have 'conversations' column.") -+ filter_func = lambda x: len(x["conversations"]) >= 2 -+ filtered_dataset = dataset.shuffle(seed=random_seed).filter(filter_func) -+ sampled_requests: List[Tuple[str, int, int, Dict[str, -+ Collection[str]]]] = [] -+ for data in filtered_dataset: -+ if len(sampled_requests) == num_requests: -+ break -+ -+ # Tokenize the prompts and completions. -+ prompt = data["conversations"][0]["value"] -+ prompt_token_ids = tokenizer(prompt).input_ids -+ completion = data["conversations"][1]["value"] -+ completion_token_ids = tokenizer(completion).input_ids -+ prompt_len = len(prompt_token_ids) -+ output_len = len(completion_token_ids -+ ) if fixed_output_len is None else fixed_output_len -+ if fixed_output_len is None and (prompt_len < 4 or output_len < 4): -+ # Prune too short sequences. -+ continue -+ if fixed_output_len is None and \ -+ (prompt_len > 1024 or prompt_len + output_len > 2048): -+ # Prune too long sequences. 
-+ continue -+ -+ if "image" in data and isinstance(data["image"], Image): -+ image: Image = data["image"] -+ image = image.convert("RGB") -+ image_data = io.BytesIO() -+ image.save(image_data, format='JPEG') -+ image_base64 = base64.b64encode( -+ image_data.getvalue()).decode("utf-8") -+ mm_content = { -+ "type": "image_url", -+ "image_url": { -+ "url": f"data:image/jpeg;base64,{image_base64}" -+ }, -+ } -+ elif "image" in data and isinstance(data["image"], str): -+ if (data["image"].startswith("http://") or \ -+ data["image"].startswith("file://")): -+ image_url = data["image"] -+ else: -+ image_url = f"file://{data['image']}" -+ -+ mm_content = { -+ "type": "image_url", -+ "image_url": { -+ "url": image_url -+ }, -+ } -+ else: -+ mm_content = None -+ -+ sampled_requests.append((prompt, prompt_len, output_len, mm_content)) -+ -+ return sampled_requests -+ -+ -+def sample_random_requests( -+ prefix_len: int, -+ input_len: int, -+ output_len: int, -+ num_prompts: int, -+ range_ratio: float, -+ tokenizer: PreTrainedTokenizerBase, -+) -> List[Tuple[str, int, int]]: -+ prefix_token_ids = np.random.randint(0, -+ tokenizer.vocab_size, -+ size=prefix_len).tolist() -+ -+ input_lens = np.random.randint( -+ int(input_len * range_ratio), -+ input_len + 1, -+ size=num_prompts, -+ ) -+ output_lens = np.random.randint( -+ int(output_len * range_ratio), -+ output_len + 1, -+ size=num_prompts, -+ ) -+ offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts) -+ input_requests = [] -+ -+ prompt=""" -+MANY YEARS LATER as he faced the firing squad, Colonel Aureliano Buendía was to remember that distant afternoon when his father took him to discover ice. At that time Macondo was a village of twenty adobe houses, built on the bank of a river of clear water that ran along a bed of polished stones, which were white and enormous, like prehistoric eggs. The world was so recent that many things lacked names, and in order to indicate them it was necessary to point. Every year during the month of March a family of ragged gypsies would set up their tents near the village, and with a great uproar of pipes and kettledrums they would display new inventions. First they brought the magnet. A heavy gypsy with an untamed beard and sparrow hands, who introduced himself as Melquíades, put on a bold public demonstration of what he himself called the eighth wonder of the learned alchemists of Macedonia. He went from house to house dragging two metal ingots and everybody was amazed to see pots, pans, tongs, and braziers tumble down from their places and beams creak from the desperation of nails and screws trying to emerge, and even objects that had been lost for a long time appeared from where they had been searched for most and went dragging along in turbulent confusion behind Melquíades’ magical irons. “Things have a life of their own,” the gypsy proclaimed with a harsh accent. “It’s simply a matter of waking up their souls.” José Arcadio Buendía, whose unbridled imagination always went beyond the genius of nature and even beyond miracles and magic, thought that it would be possible to make use of that useless invention to extract gold from the bowels of the earth. Melquíades, who was an honest man, warned him: “It won’t work for that.” But José Arcadio Buendía at that time did not believe in the honesty of gypsies, so he traded his mule and a pair of goats for the two magnetized ingots. Úrsula Iguarán, his wife, who relied on those animals to increase their poor domestic holdings, was unable to dissuade him. 
“Very soon well have gold enough and more to pave the floors of the house,” her husband replied. For several months he worked hard to demonstrate the truth of his idea. He explored every inch of the region, even the riverbed, dragging the two iron ingots along and reciting Melquíades’ incantation aloud. The only thing he succeeded in doing was to unearth a suit of fifteenth-century armor which had all of its pieces soldered together with rust and inside of which there was the hollow resonance of an enormous stone-filled gourd. When José Arcadio Buendía and the four men of his expedition managed to take the armor apart, they found inside a calcified skeleton with a copper locket containing a woman’s hair around its neck. In March the gypsies returned. This time they brought a telescope and a magnifying glass the size of a drum, which they exhibited as the latest discovery of the Jews of Amsterdam. They placed a gypsy woman at one end of the village and set up the telescope at the entrance to the tent. For the price of five reales, people could look into the telescope and see the gypsy woman an arm’s length away. “Science has eliminated distance,” Melquíades proclaimed. “In a short time, man will be able to see what is happening in any place in the world without leaving his own house.” A burning noonday sun brought out a startling demonstration with the gigantic magnifying glass: they put a pile of dry hay in the middle of the street and set it on fire by concentrating the sun’s rays. José Arcadio Buendía, who had still not been consoled for the failure of big magnets, conceived the idea of using that invention as a weapon of war. Again Melquíades tried to dissuade him, but he finally accepted the two magnetized ingots and three colonial coins in exchange for the magnifying glass. Úrsula wept in consternation. That money was from a chest of gold coins that her father had put together ova an entire life of privation and that she had buried underneath her bed in hopes of a proper occasion to make use of it. José Arcadio Buendía made no at. tempt to console her, completely absorbed in his tactical experiments with the abnegation of a scientist and even at the risk of his own life. In an attempt to show the effects of the glass on enemy troops, he exposed himself to the concentration of the sun’s rays and suffered burns which turned into sores that took a long time to heal. Over the protests of his wife, who was alarmed at such a dangerous invention, at one point he was ready to set the house on fire. He would spend hours on end in his room, calculating the strategic possibilities of his novel weapon until he succeeded in putting together a manual of startling instructional clarity and an irresistible power of conviction. He sent it to the government, accompanied by numerous descriptions of his experiments and several pages of explanatory sketches; by a messenger who crossed the mountains, got lost in measureless swamps, forded stormy rivers, and was on the point of perishing under the lash of despair, plague, and wild beasts until he found a route that joined the one used by the mules that carried the mail. In spite of the fact that a trip to the capital was little less than impossible at that time, José Arcadio Buendía promised to undertake it as soon as the government ordered him to so that he could put on some practical demonstrations of his invention for the military authorities and could train them himself in the complicated art of solar war. For several years he waited for an answer. 
Finally, tired of waiting, he bemoaned to Melquíades the failure of his project and the gypsy then gave him a convincing proof of his honesty: he gave him back the doubloons in exchange for the magnifying glass, and he left him in addition some Portuguese maps and several instruments of navigation. In his own handwriting he set down a concise synthesis of the studies by Monk Hermann. which he left José Arcadio so that he would be able to make use of the astrolabe, the compass, and the sextant. José Arcadio Buendía spent the long months of the rainy season shut up in a small room that he had built in the rear of the house so that no one would disturb his experiments. Having completely abandoned his domestic obligations, he spent entire nights in the courtyard watching the course of the stars and he almost contracted sunstroke from trying to establish an exact method to ascertain noon. When he became an expert in the use and manipulation of his instruments, he conceived a notion of space that allowed him to navigate across unknown seas, to visit uninhabited territories, and to establish relations with splendid beings without having to leave his study. That was the period in which he acquired the habit of talking to himself, of walking through the house without paying attention to anyone, as Úrsula and the children broke their backs in the garden, growing banana and caladium, cassava and yams, ahuyama roots and eggplants. Suddenly, without warning, his feverish activity was interrupted and was replaced by a kind of fascination. He spent several days as if he were bewitched, softly repeating to himself a string of fearful conjectures without giving credit to his own understanding. Finally, one Tuesday in December, at lunchtime, all at once he released the whole weight of his torment. The children would remember for the rest of their lives the august solemnity with which their father, devastated by his prolonged vigil and by the wrath of his imagination, revealed his discovery to them: “The earth is round, like an orange.” Úrsula lost her patience. “If you have to go crazy, please go crazy all by yourself!” she shouted. “But don’t try to put your gypsy ideas into the heads of the children.” José Arcadio Buendía, impassive, did not let himself be frightened by the desperation of his wife, who, in a seizure of rage, mashed the astrolabe against the floor. He built another one, he gathered the men of the village in his little room, and he demonstrated to them, with theories that none of them could understand, the possibility of returning to where one had set out by consistently sailing east. The whole village was convinced that José Arcadio Buendía had lost his reason, when Melquíades returned to set things straight. He gave public praise to the intelligence of a man who from pure astronomical speculation had evolved a theory that had already been proved in practice, although unknown in Macondo until then, and as a proof of his admiration he made him a gift that was to have a profound influence on the future of the village: the laboratory of an alchemist. By then Melquíades had aged with surprising rapidity. On his first trips he seemed to be the same age as José Arcadio Buendía. But while the latter had preserved his extraordinary strength, which permitted him to pull down a horse by grabbing its ears, the gypsy seemed to have been worn dowse by some tenacious illness. It was, in reality, the result of multiple and rare diseases contracted on his innumerable trips around the world. 
According to what he himself said as he spoke to José Arcadio Buendía while helping him set up the laboratory, death followed him everywhere, sniffing at the cuffs of his pants, but never deciding to give him the final clutch of its claws. He was a fugitive from all the plagues and catastrophes that had ever lashed mankind. He had survived pellagra in Persia, scurvy in the Malayan archipelago, leprosy in Alexandria, beriberi in Japan, bubonic plague in Madagascar, an earthquake in Sicily, and a disastrous shipwreck in the Strait of Magellan. That prodigious creature, said to possess the keys of Nostradamus, was a gloomy man, enveloped in a sad aura, with an Asiatic look that seemed to know what there was on the other side of things. He wore a large black hat that looked like a raven with widespread wings, and a velvet vest across which the patina of the centuries had skated. But in spite of his immense wisdom and his mysterious breadth, he had a human burden, an earthly condition that kept him involved in the small problems of daily life. He would complain of the ailments of old age, he suffered from the most insignificant economic difficulties, and he had stopped laughing a long time back because scurvy had made his teeth drop out. On that suffocating noontime when the gypsy revealed his secrets, José Arcadio Buendía had the certainty that it was the beginning of a great friendship. The children were startled by his fantastic stories. Aureliano, who could not have been more than five at the time, would remember him for the rest of his life as he saw him that afternoon, sitting against the metallic and quivering light from the window, lighting up with his deep organ voice the darkest reaches of the imagination, while down over his temples there flowed the grease that was being melted by the heat. José Arcadio, his older brother, would pass on that wonderful image as a hereditary memory to all of his descendants. Úrsula on the other hand, held a bad memory of that visit, for she had entered the room just as Melquíades had carelessly broken a flask of bichloride of mercury. “It’s the smell of the devil,” she said. “Not at all,” Melquíades corrected her. “It has been proven that the devil has sulphuric properties and this is just a little corrosive sublimate.” Always didactic, he went into a learned exposition of the diabolical properties of cinnabar, but Úrsula paid no attention to him, although she took the children off to pray. That biting odor would stay forever in her mind linked to the memory of Melquíades. The rudimentary laboratory—in addition to a profusion of pots, funnels, retorts, filters, and sieves—was made up of a primitive water pipe, a glass beaker with a long, thin neck, a reproduction of the philosopher’s egg, and a still the gypsies themselves had built in accordance with modern descriptions of the three-armed alembic of Mary the Jew. Along with those items, Melquíades left samples of the seven metals that corresponded to the seven planets, the formulas of Moses and Zosimus for doubling the quantity of gold, and a set of notes and sketches concerning the processes of the Great Teaching that would permit those who could interpret them to undertake the manufacture of the philosopher’s stone. Seduced by the simplicity of the formulas to double the quantity of gold, José Arcadio Buendía paid court to Úrsula for several weeks so that she would let him dig up her colonial coins and increase them by as many times as it was possible to subdivide mercury. 
Úrsula gave in, as always, to her husband’s unyielding obstinacy. Then José Arcadio Buendía threw three doubloons into a pan and fused them with copper filings, orpiment, brimstone, and lead. He put it all to boil in a pot of castor oil until he got a thick and pestilential syrup which was more like common caramel than valuable gold. In risky and desperate processes of distillation, melted with the seven planetary metals, mixed with hermetic mercury and vitriol of Cyprus, and put back to cook in hog fat for lack of any radish oil, Úrsula’s precious inheritance was reduced to a large piece of burnt hog cracklings that was firmly stuck to the bottom of the pot. When the gypsies came back, Úrsula had turned the whole population of the village against them. But curiosity was greater than fear, for that time the gypsies went about the town making a deafening noise with all manner of musical instruments while a hawker announced the exhibition of the most fabulous discovery of the Naciancenes. So that everyone went to the tent and by paying one cent they saw a youthful Melquíades, recovered, unwrinkled, with a new and flashing set of teeth. Those who remembered his gums that had been destroyed by scurvy, his flaccid cheeks, and his withered lips trembled with fear at the final proof of the gypsy’s supernatural power. The fear turned into panic when Melquíades took out his teeth, intact, encased in their gums, and showed them to the audience for an instant—a fleeting instant in which he went back to being the same decrepit man of years past—and put them back again and smiled once more with the full control of his restored youth. Even José Arcadio Buendía himself considered that Melquíades’ knowledge had reached unbearable extremes, but he felt a healthy excitement when the gypsy explained to him atone the workings of his false teeth. It seemed so simple and so prodigious at the same time that overnight he lost all interest in his experiments in alchemy. He underwent a new crisis of bad humor. He did not go back to eating regularly, and he would spend the day walking through the house. “Incredible things are happening in the world,” he said to Úrsula. “Right there across the river there are all kinds of magical instruments while we keep on living like donkeys.” Those who had known him since the foundation of Macondo were startled at how much he had changed under Melquíades’ influence. At first José Arcadio Buendía had been a kind of youthful patriarch who would give instructions for planting and advice for the raising of children and animals, and who collaborated with everyone, even in the physical work, for the welfare of the community. Since his house from the very first had been the best in the village, the others had been built in its image and likeness. It had a small, welllighted living roost, a dining room in the shape of a terrace with gaily colored flowers, two bedrooms, a courtyard with a gigantic chestnut tree, a well kept garden, and a corral where goats, pigs, and hens lived in peaceful communion. The only animals that were prohibited, not just in his house but in the entire settlement, were fighting cocks. Úrsula’s capacity for work was the same as that of her husband. Active, small, severe, that woman of unbreakable nerves who at no moment in her life had been heard to sing seemed to be everywhere, from dawn until quite late at night, always pursued by the soft whispering of her stiff, starched petticoats. 
Thanks to her the floors of tamped earth, the unwhitewashed mud walls, the rustic, wooden furniture they had built themselves were always dean, and the old chests where they kept their clothes exhaled the warm smell of basil. José Arcadio Buendía, who was the most enterprising man ever to be seen in the village, had set up the placement of the houses in such a way that from all of them one could reach the river and draw water with the same effort, and he had lined up the streets with such good sense that no house got more sun than another during the hot time of day. Within a few years Macondo was a village that was more orderly and hard working than any known until then by its three hundred inhabitants. It was a truly happy village where no one was over thirty years of age and where no one had died. Since the time of its founding, José Arcadio Buendía had built traps and cages. In a short time he filled not only his own house but all of those in the village with troupials, canaries, bee eaters, and redbreasts. The concert of so many different birds became so disturbing that Úrsula would plug her ears with beeswax so as not to lose her sense of reality. The first time that Melquíades’ tribe arrived, selling glass balls for headaches, everyone was surprised that they had been able to find that village lost in the drowsiness of the swamp, and the gypsies confessed that they had found their way by the song of the birds. That spirit of social initiative disappeared in a short time, pulled away by the fever of the magnets, the astronomical calculations, the dreams of transmutation, and the urge to discover the wonders of the world. From a clean and active man, José Arcadio Buendía changed into a man lazy in appearance, careless in his dress, with a wild beard that Úrsula managed to trim with great effort and a kitchen knife. There were many who considered him the victim of some strange spell. But even those most convinced of his madness left work and family to follow him when he brought out his tools to clear the land and asked the assembled group to open a way that would put Macondo in contact with the great inventions. José Arcadio Buendía was completely ignorant of the geography of the region. He knew that to the east there lay an impenetrable mountain chain and that on the other side of the mountains there was the ardent city of Riohacha, where in times past—according to what he had been told by the first Aureliano Buendía, his grandfather—Sir Francis Drake had gone crocodile hunting with cannons and that he repaired hem and stuffed them with straw to bring to Queen Elizabeth. In his youth, José Arcadio Buendía and his men, with wives and children, animals and all kinds of domestic implements, had crossed the mountains in search of an outlet to the sea, and after twenty-six months they gave up the expedition and founded Macondo, so they would not have to go back. It was, therefore, a route that did not interest him, for it could lead only to the past. To the south lay the swamps, covered with an eternal vegetable scum and the whole vast universe of the great swamp, which, according to what the gypsies said, had no limits. The great swamp in the west mingled with a boundless extension of water where there were soft-skinned cetaceans that had the head and torso of a woman, causing the ruination of sailors with the charm of their extraordinary breasts. The gypsies sailed along that route for six months before they reached the strip of land over which the mules that carried the mail passed. 
According to José Arcadio Buendía’s calculations, the only possibility of contact with civilization lay along the northern route. So he handed out clearing tools and hunting weapons to the same men who had been with him during the founding of Macondo. He threw his directional instruments and his maps into a knapsack, and he undertook the reckless adventure. During the first days they did not come across any appreciable obstacle. They went down along the stony bank of the river to the place where years before they had found the soldier’s armor, and from there they went into the woods along a path between wild orange trees. At the end of the first week they killed and roasted a deer, but they agreed to eat only half of it and salt the rest for the days that lay ahead. With that precaution they tried to postpone the necessity of having to eat macaws, whose blue flesh had a harsh and musky taste. Then, for more than ten days, they did not see the sun again. The ground became soft and damp, like volcanic ash, and the vegetation was thicker and thicker, and the cries of the birds and the uproar of the monkeys became more and more remote, and the world became eternally sad. The men on the expedition felt overwhelmed by their most ancient memories in that paradise of dampness and silence, going back to before original sin, as their boots sank into pools of steaming oil and their machetes destroyed bloody lilies and golden salamanders. For a week, almost without speaking, they went ahead like sleepwalkers through a universe of grief, lighted only by the tenuous reflection of luminous insects, and their lungs were overwhelmed by a suffocating smell of blood. They could not return because the strip that they were opening as they went along would soon close up with a new vegetation that. almost seemed to grow before their eyes. “It’s all right,” José Arcadio Buendía would say. “The main thing is not to lose our bearings.” Always following his compass, he kept on guiding his men toward the invisible north so that they would be able to get out of that enchanted region. It was a thick night, starless, but the darkness was becoming impregnated with a fresh and clear air. Exhausted by the long crossing, they hung up their hammocks and slept deeply for the first time in two weeks. When they woke up, with the sun already high in the sky, they were speechless with fascination. Before them, surrounded by ferns and palm trees, white and powdery in the silent morning light, was an enormous Spanish galleon. Tilted slightly to the starboard, it had hanging from its intact masts the dirty rags of its sails in the midst of its rigging, which was adorned with orchids. The hull, covered with an armor of petrified barnacles and soft moss, was firmly fastened into a surface of stones. The whole structure seemed to occupy its own space, one of solitude and oblivion, protected from the vices of time and the habits of the birds. Inside, where the expeditionaries explored with careful intent, there was nothing but a thick forest of flowers. The discovery of the galleon, an indication of the proximity of the sea, broke José Arcadio Buendía’s drive. He considered it a trick of his whimsical fate to have searched for the sea without finding it, at the cost of countless sacrifices and suffering, and to have found it all of a sudden without looking for it, as if it lay across his path like an insurmountable object. 
Many years later Colonel Aureliano Buendía crossed the region again, when it was already a regular mail route, and the only part of the ship he found was its burned-out frame in the midst of a field of poppies. Only then, convinced that the story had not been some product of his father’s imagination, did he wonder how the galleon had been able to get inland to that spot. But José Arcadio Buendía did not concern himself with that when he found the sea after another four days’ journey from the galleon. His dreams ended as he faced that ashen, foamy, dirty sea, which had not merited the risks and sacrifices of the adventure. “God damn it!” he shouted. “Macondo is surrounded by water on all sides.” The idea of a peninsular Macondo prevailed for a long time, inspired by the arbitrary map that José Arcadio Buendía sketched on his return from the expedition. He drew it in rage, evilly, exaggerating the difficulties of communication, as if to punish himself for the absolute lack of sense with which he had chosen the place. “We’ll never get anywhere,” he lamented to Úrsula. “We’re going to rot our lives away here without receiving the benefits of science.” That certainty, mulled over for several months in the small room he used as his laboratory, brought him to the conception of the plan to move Maeondo to a better place. But that time Úrsula had anticipated his feverish designs. With the secret and implacable labor of a small ant she predisposed the women of the village against the flightiness of their husbands, who were already preparing for the move. José Arcadio Buendía did not know at what moment or because of what adverse forces his plan had become enveloped in a web of pretexts, disappointments, and evasions until it turned into nothing but an illusion. Úrsula watched him with innocent attention and even felt some pity for him on the morning when she found him in the back room muttering about his plans for moving as he placed his laboratory pieces in their original boxes. She let him finish. She let him nail up the boxes and put his initials on them with an inked brush, without reproaching him, but knowing now that he knew (because she had heard him say so in his soft monologues) that the men of the village would not back him up in his undertaking. Only when he began to take down the door of the room did Úrsula dare ask him what he was doing, and he answered with a certain bitterness. “Since no one wants to leave, we’ll leave all by ourselves.” Úrsula did not become upset. “We will not leave,” she said. “We will stay here, because we have had a son here.” “We have still not had a death,” he said. “A person does not belong to a place until there is someone dead under the ground.” Úrsula replied with a soft firmness: “If I have to die for the rest of you to stay here, I will die.” José Arcadio Buendía had not thought that his wife’s will was so firm. He tried to seduce her with the charm of his fantasy, with the promise of a prodigious world where all one had to do was sprinkle some magic liquid on the ground and the plants would bear fruit whenever a man wished, and where all manner of instruments against pain were sold at bargain prices. But Úrsula was insensible to his clairvoyance. “Instead of going around thinking about your crazy inventions, you should be worrying about your sons,” she replied. “Look at the state they’re in, running wild just like donkeys.” José Arcadio Buendía took his wife’s words literally. 
He looked out the window and saw the barefoot children in the sunny garden and he had the impression that only at that instant had they begun to exist, conceived by Úrsula’s spell, Something occurred inside of him then, something mysterious and definitive that uprooted him from his own time and carried him adrift through an unexplored region of his memory. While Úrsula continued sweeping the house, which was safe now from being abandoned for the rest of her life, he stood there with an absorbed look, contemplating the children until his eyes became moist and he dried them with the back of his hand, exhaling a deep sigh of resignation. “All right,” he said. “Tell them to come help me take the things out of the boxes.” José Arcadio, the older of the children, was fourteen. He had a square head, thick hair, and his father’s character. Although he had the same impulse for growth and physical strength, it was early evident that he lacked imagination. He had been conceived and born during the difficult crossing of the mountains, before the founding of Macondo, and his parents gave thanks to heaven when they saw he had no animal features. Aureliano, the first human being to be born in Macondo, would be six years old in March. He was silent and withdrawn. He had wept in his mother’s womb and had been born with his eyes open. As they were cutting the umbilical cord, he moved his head from side to side, taking in the things in the room and examining the faces of the people with a fearless curiosity. Then, indifferent to those who came close to look at him, he kept his attention concentrated on the palm roof, which looked as if it were about to collapse under the tremendous pressure of the rain. Úrsula did not remember the intensity of that look again until one day when little Aureliano, at the age of three, went into the kitchen at the moment she was taking a pot of boiling soup from the stove and putting it on the table. The child, Perplexed, said from the doorway, “It’s going to spill.” The pot was firmly placed in the center of the table, but just as soon as the child made his announcement, it began an unmistakable movement toward the edge, as if impelled by some inner dynamism, and it fell and broke on the floor. Úrsula, alarmed, told her husband about the episode, but he interpreted it as a natural phenomenon. That was the way he always was alien to the existence of his sons, partly because he considered childhood as a period of mental insufficiency, and partly because he was always too absorbed in his fantastic speculations. But since the afternoon when he called the children in to help him unpack the things in the laboratory, he gave them his best hours. In the small separate room, where the walls were gradually being covered by strange maps and fabulous drawings, he taught them to read and write and do sums, and he spoke to them about the wonders of the world, not only where his learning had extended, but forcing the limits of his imagination to extremes. It was in that way that the boys ended up learning that in the southern extremes of Africa there were men so intelligent and peaceful that their only pastime was to sit and think, and that it was possible to cross the Aegean Sea on foot by jumping from island to island all the way to the port of Salonika. 
Those hallucinating sessions remained printed on the memories of the boys in such a way that many years later, a second before the regular army officer gave the firing squad the command to fire, Colonel Aureliano Buendía saw once more that warm March afternoon on which his father had interrupted the lesson in physics and stood fascinated, with his hand in the air and his eyes motionless, listening to the distant pipes, drums, and jingles of the gypsies, who were coming to the village once more, announcing the latest and most startling discovery of the sages of Memphis. They were new gypsies, young men and women who knew only their own language, handsome specimens with oily skins and intelligent hands, whose dances and music sowed a panic of uproarious joy through the streets, with parrots painted all colors reciting Italian arias, and a hen who laid a hundred golden eggs to the sound of a tambourine, and a trained monkey who read minds, and the multi-use machine that could be used at the same time to sew on buttons and reduce fevers, and the apparatus to make a person forget his bad memories, and a poultice to lose time, and a thousand more inventions so ingenious and unusual that José Arcadio Buendía must have wanted to invent a memory machine so that he could remember them all. In an instant they transformed the village. The inhabitants of Macondo found themselves lost is their own streets, confused by the crowded fair. Holding a child by each hand so as not to lose them in the tumult, bumping into acrobats with gold-capped teeth and jugglers with six arms, suffocated by the mingled breath of manure and sandals that the crowd exhaled, José Arcadio Buendía went about everywhere like a madman, looking for Melquíades so that he could reveal to him the infinite secrets of that fabulous nightmare. He asked several gypsies, who did not understand his language. Finally he reached the place where Melquíades used to set up his tent and he found a taciturn Armenian who in Spanish was hawking a syrup to make oneself invisible. He had drunk down a glass of the amber substance in one gulp as José Arcadio Buendía elbowed his way through the absorbed group that was witnessing the spectacle, and was able to ask his question. The gypsy wrapped him in the frightful climate of his look before he turned into a puddle of pestilential and smoking pitch over which the echo of his reply still floated: “Melquíades is dead.” Upset by the news, José Arcadio Buendía stood motionless, trying to rise above his affliction, until the group dispersed, called away by other artifices, and the puddle of the taciturn Armenian evaporated completely. Other gypsies confirmed later on that Melquíades had in fact succumbed to the fever on the beach at Singapore and that his body had been thrown into the deepest part of the Java Sea. The children had no interest in the news. They insisted that their father take them to see the overwhelming novelty of the sages of Memphis that was being advertised at the entrance of a tent that, according to what was said, had belonged to King Solomon. They insisted so much that José Arcadio Buendía paid the thirty reales and led them into the center of the tent, where there was a giant with a hairy torso and a shaved head, with a copper ring in his nose and a heavy iron chain on his ankle, watching over a pirate chest. When it was opened by the giant, the chest gave off a glacial exhalation. 
Inside there was only an enormous, transparent block with infinite internal needles in which the light of the sunset was broken up into colored stars. Disconcerted, knowing that the children were waiting for an immediate explanation, José Arcadio Buendía ventured a murmur: “It’s the largest diamond in the world.” “No,” the gypsy countered. “It’s ice.” José Arcadio Buendía, without understanding, stretched out his hand toward the cake, but the giant moved it away. “Five reales more to touch it,” he said. José Arcadio Buendía paid them and put his hand on the ice and held it there for several minutes as his heart filled with fear and jubilation at the contact with mystery. Without knowing what to say, he paid ten reales more so that his sons could have that prodigious experience. Little José Arcadio refused to touch it. Aureliano, on the other hand, took a step forward and put his hand on it, withdrawing it immediately. “It’s boiling,” he exclaimed, startled. But his father paid no attention to him. Intoxicated by the evidence of the miracle, he forgot at that moment about the frustration of his delirious undertakings and Melquíades’ body, abandoned to the appetite of the squids. He paid another five reales and with his hand on the cake, as if giving testimony on the holy scriptures, he exclaimed: “This is the great invention of our time.” WHEN THE PIRATE Sir Francis Drake attacked Riohacha in the sixteenth century, Úrsula Iguarán’s great-great-grandmother became so frightened with the ringing of alarm bells and the firing of cannons that she lost control of her nerves and sat down on a lighted stove. The burns changed her into a useless wife for the rest of her days. She could only sit on one side, cushioned by pillows, and something strange must have happened to her way of walking, for she never walked again in public. She gave up all kinds of social activity, obsessed with the notion that her body gave off a singed odor. Dawn would find her in the courtyard, for she did not dare fall asleep lest she dream of the English and their ferocious attack dogs as they came through the windows of her bedroom to submit her to shameful tortures with their red-hot irons. Her husband, an Aragonese merchant by whom she had two children, spent half the value of his store on medicines and pastimes in an attempt to alleviate her terror. Finally he sold the business and took the family to live far from the sea in a settlement of peaceful Indians located in the foothills, where he built his wife a bedroom without windows so that the pirates of her dream would have no way to get in. In that hidden village there was a native-born tobacco planter who had lived there for some time, Don José Arcadio Buendía, with whom Úrsula’s great-great-grandfather established a partnership that was so lucrative that within a few years they made a fortune. Several centuries later the greatgreat-grandson of the native-born planter married the great-great-granddaughter of the Aragonese. Therefore, every time that Úrsula became exercised over her husband’s mad ideas, she would leap back over three hundred years of fate and curse the day that Sir Francis Drake had attacked Riohacha. It was simply a way. of giving herself some relief, because actually they were joined till death by a bond that was more solid that love: a common prick of conscience. They were cousins. 
They had grown up together in the old village that both of their ancestors, with their work and their good habits, had transformed into one of the finest towns in the province. Although their marriage was predicted from the time they had come into the world, when they expressed their desire to be married their own relatives tried to stop it. They were afraid that those two healthy products of two races that had interbred over the centuries would suffer the shame of breeding iguanas. There had already been a horrible precedent. An aunt of Úrsula’s, married to an uncle of José Arcadio Buendía, had a son who went through life wearing loose, baggy trousers and who bled to death after having lived forty-two years in the purest state of virginity, for he had been born and had grown up with a cartilaginous tail in the shape of a corkscrew and with a small tuft of hair on the tip. A pig’s tail that was never allowed to be seen by any woman and that cost him his life when a butcher friend did him the favor of chopping it off with his cleaver. José Arcadio Buendía, with the whimsy of his nineteen years, resolved the problem with a single phrase: “I don’t care if I have piglets as long as they can talk.” So they were married amidst a festival of fireworks and a brass band that went on for three days. They would have been happy from then on if Úrsula’s mother had not terrified her with all manner of sinister predictions about their offspring, even to the extreme of advising her to refuse to consummate the marriage. Fearing that her stout and willful husband would rape her while she slept, Úrsula, before going to bed, would put on a rudimentary kind of drawers that her mother had made out of sailcloth and had reinforced with a system of crisscrossed leather straps and that was closed in the front by a thick iron buckle. That was how they lived for several months. During the day he would take care of his fighting cocks and she would do frame embroidery with her mother. At night they would wrestle for several hours in an anguished violence that seemed to be a substitute for the act of love, until popular intuition got a whiff of something irregular and the rumor spread that Úrsula was still a virgin a year after her marriage because her husband was impotent. José Arcadio Buendía was the last one to hear the rumor. “Look at what people are going around saying, Úrsula,” he told his wife very calmly. “Let them talk,” she said. “We know that it’s not true.” So the situation went on the same way for another six months until that tragic Sunday when José Arcadio Buendía won a cockfight from Prudencio Aguilar. Furious, aroused by the blood of his bird, the loser backed away from José Arcadio Buendía so that everyone in the cockpit could hear what he was going to tell him. “Congratulations!” he shouted. “Maybe that rooster of yours can do your wife a favor.” José Arcadio Buendía serenely picked up his rooster. “I’ll be right back,” he told everyone. And then to Prudencio Aguilar: “You go home and get a weapon, because I’m going to kill you.” Ten minutes later he returned with the notched spear that had belonged to his grandfather. At the door to the cockpit, where half the town had gathered, Prudencio Aguilar was waiting for him. There was no time to defend himself. José Arcadio Buendía’s spear, thrown with the strength of a bull and with the same good aim with which the first Aureliano Buendía had exterminated the jaguars in the region, pierced his throat. 
That night, as they held a wake over the corpse in the cockpit, José Arcadio Buendía went into the bedroom as his wife was putting on her chastity pants. Pointing the spear at her he ordered: “Take them off.” Úrsula had no doubt about her husband’s decision. “You’ll be responsible for what happens,” she murmured. José Arcadio Buendía stuck the spear into the dirt floor. “If you bear iguanas, we’ll raise iguanas,” he said. “But there’ll be no more killings in this town because of you.” It was a fine June night, cool and with a moon, and they were awake and frolicking in bed until dawn, indifferent to the breeze that passed through the bedroom, loaded with the weeping of Prudencio Aguilar’s kin. The matter was put down as a duel of honor, but both of them were left with a twinge in their conscience. One night, when she could not sleep, Úrsula went out into the courtyard to get some water and she saw Prudencio Aguilar by the water jar. He was livid, a sad expression on his face, trying to cover the hole in his throat with a plug made of esparto grass. It did not bring on fear in her, but pity. She went back to the room and told her husband what she had seen, but he did not think much of it. “This just means that we can’t stand the weight of our conscience.” Two nights later Úrsula saw Prudencio Aguilar again, in the bathroom, using the esparto plug to wash the clotted blood from his throat. On another night she saw him strolling in the rain. José Arcadio Buendía, annoyed by his wife’s hallucinations, went out into the courtyard armed with the spear. There was the dead man with his sad expression. “You go to hell,” José Arcadio Buendía shouted at him. “Just as many times as you come back, I’ll kill you again.” Prudencio Aguilar did not go away, nor did José Arcadio Buendía dare throw the spear. He never slept well after that. He was tormented by the immense desolation with which the dead man had looked at him through the rain, his deep nostalgia as he yearned for living people, the anxiety with which he searched through the house looking for some water with which to soak his esparto plug. “He must be suffering a great deal,” he said to Úrsula. “You can see that he’s so very lonely.” She was so moved that the next time she saw the dead man uncovering the pots on the stove she understood what he was looking for, and from then on she placed water jugs all about the house. One night when he found him washing his wound in his own room, José Anedio Buendía could no longer resist. “It’s all right, Prudencio,” he told him. “We’re going to leave this town, just as far away as we can go, and we’ll never come back. Go in peace now.” That was how they undertook the crossing of the mountains. Several friends of José Arcadio Buendía, young men like him, excited, by the adventure, dismantled their houses and packed up, along with their wives and children, to head toward the land that no one had promised them. Before he left, José Arcadio Buendía buried the spear in the courtyard and, one after the other, he cut the throats of his magnificent fighting cocks, trusting that in that way he could give some measure of peace to Prudencio Aguilar. All that Úrsula took along were a trunk with her bridal clothes, a few household utensils, and the small chest with the gold pieces that she had inherited from her father. They did not lay out any definite itinerary. They simply tried to go in a direction opposite to the road to Riohacha so that they would not leave any trace or meet any people they knew. 
It was an absurd journey. After fourteen months, her stomach corrupted by monkey meat and snake stew, Úrsula gave birth to a son who had all of his features human. She had traveled half of the trip in a hammock that two men carried on their shoulders, because swelling had disfigured her legs and her varicose veins had puffed up like bubbles. Although it was pitiful to see them with their sunken stomachs and languid eyes, the children survived the journey better than their parents, and most of the time it was fun for them. One morning, after almost two years of crossing, they became the first mortals to see the western slopes of the mountain range. From the cloudy summit they saw the immense aquatic expanse of the great swamp as it spread out toward the other side of the world. But they never found the sea. One night, after several months of lost wandering through the swamps, far away now from the last Indians they had met on their way, they camped on the banks of a stony river whose waters were like a torrent of frozen glass. Years later, during the second civil war, Colonel Aureliano Buendía tried to follow that same route in order to take Riohacha by surprise and after six days of traveling he understood that it was madness. Nevertheless, the night on which they camped beside the river, his father’s host had the look of shipwrecked people with no escape, but their number had grown during the crossing and they were all prepared (and they succeeded) to die of old age. José Arcadio Buendía dreamed that night that right there a noisy city with houses having mirror wails rose up. He asked what city it was and they answered him with a name that he had never heard, that had no meaning at all, but that had a supernatural echo in his dream: Macondo. On the following day he convinced his men that they would never find the sea. He ordered them to cut down the trees to make a clearing beside the river, at the coolest spot on the bank, and there they founded the village. José Arcadio Buendía did not succeed in deciphering the dream of houses with mirror walls until the day he discovered ice. Then he thought he understood its deep meaning. He thought that in the near future they would be able to manufacture blocks of ice on a large scale from such a common material as water and with them build the new houses of the village. Macondo would no longer be a burning place, where the hinges and door knockers twisted with the heat, but would be changed into a wintry city. If he did not persevere in his attempts to build an ice factory, it was because at that time he was absolutely enthusiastic over the education of his sons, especially that of Aureliano, who from the first had revealed a strange intuition for alchemy. The laboratory had been dusted off. Reviewing Melquíades’ notes, serene now, without the exaltation of novelty, in prolonged and patient sessions they tried to separate Úrsula’s gold from the debris that was stuck to the bottom of the pot. Young José Arcadio scarcely took part in the process. While his father was involved body and soul with his water pipe, the willful first-born, who had always been too big for his age, had become a monumental adolescent. His voice had changed. An incipient fuzz appeared on his upper lip. One night, as Úrsula went into the room where he was undressing to go to bed, she felt a mingled sense of shame and pity: he was the first man that she had seen naked after her husband, and he was so well-equipped for life that he seemed abnormal. 
Úrsula, pregnant for the third time, relived her newlywed terror. Around that time a merry, foul-mouthed, provocative woman came to the house to help with the chores, and she knew how to read the future in cards. Úrsula spoke to her about her son. She thought that his disproportionate size was something as unnatural as her cousin’s tail of a pig. The woman let out an expansive laugh that resounded through the house like a spray of broken glass. “Just the opposite,” she said. “He’ll be very lucky.” In order to confirm her prediction she brought her cards to the house a few days later and locked herself up with José Arcadio in a granary off the kitchen. She calmly placed her cards on an old carpenter’s bench, saying anything that came into her head, while the boy waited beside her, more bored than intrigued. Suddenly she reached out her hand and touched him. “Lordy!” she said, sincerely startled, and that was all she could say. José Arcadio felt his bones filling up with foam, a languid fear, and a terrible desire to weep. The woman made no insinuations. But José Arcadio kept looking for her all night long, for the smell of smoke that she had under her armpits and that had got caught under his skin. He wanted to be with her all the time, he wanted her to be his mother, for them never to leave the granary, and for her to say “Lordy!” to him. One day he could not stand it any more and he went looking for her at her house. He made a formal visit, sitting uncomprehendingly in the living room without saying a word. At that moment he had no desire for her. He found her different, entirely foreign to the image that her smell brought on, as if she were someone else. He drank his coffee and left the house in depression. That night, during the frightful time of lying awake, he desired her again with a brutal anxiety, but he did not want her that time as she had been in the granary but as she had been that afternoon. Days later the woman suddenly called him to her house, where she was alone with her mother, and she had him come into the bedroom with the pretext of showing him a deck of cards. Then she touched him with such freedom that he suffered a delusion after the initial shudder, and he felt more fear than pleasure. She asked him to come and see her that night. He agreed, in order to get away, knowing that he was incapable of going. But that night, in his burning bed, he understood that he had to go to see her, even if he were not capable. He got dressed by feel, listening in the dark to his brother’s calm breathing, the dry cough of his father in the next room, the asthma of the hens in the courtyard, the buzz of the mosquitoes, the beating of his heart, and the inordinate bustle of a world that he had not noticed until then, and he went out into the sleeping street. With all his heart he wanted the door to be barred and not just closed as she had promised him. But it was open. He pushed it with the tips of his fingers and the hinges yielded with a mournful and articulate moan that left a frozen echo inside of him. From the moment he entered, sideways and trying not to make a noise, he caught the smell. He was still in the hallway, where the woman’s three brothers had their hammocks in positions that he could not see and that he could not determine in the darkness as he felt his way along the hall to push open the bedroom door and get his bearings there so as not to mistake the bed. He found it. 
He bumped against the ropes of the hammocks, which were lower than he had suspected, and a man who had been snoring until then turned in his sleep and said in a kind of delusion, “It was Wednesday.” When he pushed open the bedroom door, he could not prevent it from scraping against the uneven floor. Suddenly, in the absolute darkness, he understood with a hopeless nostalgia that he was completely disoriented. Sleeping in the narrow room were the mother, another daughter with her husband and two children, and the woman, who may not have been there. He could have guided himself by the smell if the smell had not been all over the house, so devious and at the same time so definite, as it had always been on his skin. He did not move for a long time, wondering in fright how he had ever got to that abyss of abandonment, when a hand with all its fingers extended and feeling about in the darkness touched his face. He was not surprised, for without knowing, he had been expecting it. Then he gave himself over to that hand, and in a terrible state of exhaustion he let himself be led to a shapeless place where his clothes were taken off and he was heaved about like a sack of potatoes and thrown from one side to the other in a bottomless darkness in which his arms were useless, where it no longer smelled of woman but of ammonia, and where he tried to remember her face and found before him the face of Úrsula, confusedly aware that he was doing something that for a very long time he had wanted to do but that he had imagined could really never be done, not knowing what he was doing because he did not know where his feet were or where his head was, or whose feet or whose head, and feeling that he could no longer resist the glacial rumbling of his kidneys and the air of his intestines, and fear, and the bewildered anxiety to flee and at the same time stay forever in that exasperated silence and that fearful solitude. Her name was Pilar Ternera. She had been part of the exodus that ended with the founding of Macondo, dragged along by her family in order to separate her from the man who had raped her at fourteen and had continued to love her until she was twenty-two, but who never made up his mind to make the situation public because he was a man apart. He promised to follow her to the ends of the earth, but only later on, when he put his affairs in order, and she had become tired of waiting for him, always identifying him with the tall and short, blond and brunet men that her cards promised from land and sea within three days, three months, or three years. With her waiting she had lost the strength of her thighs, the firmness of her breasts, her habit of tenderness, but she kept the madness of her heart intact. Maddened by that prodigious plaything, José Arcadio followed her path every night through the labyrinth of the room. On a certain occasion he found the door barred, and he knocked several times, knowing that if he had the boldness to knock the first time he would have had to knock until the last, and after an interminable wait she opened the door for him. During the day, lying down to dream, he would secretly enjoy the memories of the night before. 
But when she came into the house, merry, indifferent, chatty, he did not have to make any effort to hide his tension, because that woman, whose explosive laugh frightened off the doves, had nothing to do with the invisible power that taught him how to breathe from within and control his heartbeats, and that had permitted him to understand why men are afraid of death. He was so wrapped up in himself that he did not even understand the joy of everyone when his father and his brother aroused the household with the news that they had succeeded in penetrating the metallic debris and had separated Úrsula’s gold. They had succeeded, as a matter of fact, after putting in complicated and persevering days at it. Úrsula was happy, and she even gave thanks to God for the invention of alchemy, while the people of the village crushed into the laboratory, and they served them guava jelly on crackers to celebrate the wonder, and José Arcadio Buendía let them see the crucible with the recovered gold, as if he had just invented it. Showing it all around, he ended up in front of his older son, who during the past few days had barely put in an appearance in the laboratory. He put the dry and yellowish mass in front of his eyes and asked him: “What does it look like to you?” José Arcadio answered sincerely: “Dog shit.” His father gave him a blow with the back of his hand that brought out blood and tears. That night Pilar Ternera put arnica compresses on the swelling, feeling about for the bottle and cotton in the dark, and she did everything she wanted with him as long as it did not bother him, making an effort to love him without hurting him. They reached such a state of intimacy that later, without realizing it, they were whispering to each other. “I want to be alone with you,” he said. “One of these days I’m going to tell everybody and we can stop all of this sneaking around.” She did not try to calm him down. “That would be fine,” she said. “If we’re alone, we’ll leave the lamp lighted so that we can see each other, and I can holler as much as I want without anybody’s having to butt in, and you can whisper in my ear any crap you can think of.” That conversation, the biting rancor that he felt against his father, and the imminent possibility of wild love inspired a serene courage in him. In a spontaneous way, without any preparation, he told everything to his brother. At first young Aureliano understood only the risk, the immense possibility of danger that his brother’s adventures implied, and he could not understand the fascination of the subject. Little by little he became contaminated with the anxiety. He wondered about the details of the dangers, he identified himself with the suffering and enjoyment of his brother, he felt frightened and happy. He would stay awake waiting for him until dawn in the solitary bed that seemed to have a bottom of live coals, and they would keep on talking until it was time to get up, so that both of them soon suffered from the same drowsiness, felt the same lack of interest in alchemy and the wisdom of their father, and they took refuge in solitude. “Those kids are out of their heads,” Úrsula said. 
“They must have worms.” She prepared a repugnant potion for them made out of mashed wormseed, which they both drank with unforeseen stoicism, and they sat down at the same time on their pots eleven times in a single day, expelling some rose-colored parasites that they showed to everybody with great jubilation, for it allowed them to deceive Úrsula as to the origin of their distractions and drowsiness. Aureliano not only understood by then, he also lived his brother’s experiences as something of his own, for on one occasion when the latter was explaining in great detail the mechanism of love, he interrupted him to ask: “What does it feel like?” José Arcadio gave an immediate reply: “It’s like an earthquake.” One January Thursday at two o’clock in the morning, Amaranta was born. Before anyone came into the room, Úrsula examined her carefully. She was light and watery, like a newt, but all of her parts were human: Aureliano did not notice the new thing except when the house became full of people. Protected by the confusion, he went off in search of his brother, who had not been in bed since eleven o’clock, and it was such an impulsive decision that he did not even have time to ask himself how he could get him out of Pilar Ternera’s bedroom. He circled the house for several hours, whistling private calls, until the proximity of dawn forced him to go home. In his mother’s room, playing with the newborn little sister and with a face that drooped with innocence, he found José Arcadio. Úrsula was barely over her forty days’ rest when the gypsies returned. They were the same acrobats and jugglers that had brought the ice. Unlike Melquíades’ tribe, they had shown very quickly that they were not heralds of progress but purveyors of amusement. Even when they brought the ice they did not advertise it for its usefulness in the life of man but as a simple circus curiosity. This time, along with many other artifices, they brought a flying carpet. But they did not offer it as a fundamental contribution to the development of transport, rather as an object of recreation. The people at once dug up their last gold pieces to take advantage of a quick flight over the houses of the village. Protected by the delightful cover of collective disorder, José Arcadio and Pilar passed many relaxing hours. They were two happy lovers among the crowd, and they even came to suspect that love could be a feeling that was more relaxing and deep than the happiness, wild but momentary, of their secret nights. Pilar, however, broke the spell. Stimulated by the enthusiasm that José Arcadio showed in her companionship, she confused the form and the occasion, and all of a sudden she threw the whole world on top of him. “Now you really are a man,” she told him. And since he did not understand what she meant, she spelled it out to him. “You’re going to be a father.” José Arcadio did not dare leave the house for several days. It was enough for him to hear the rocking laughter of Pilar in the kitchen to run and take refuge in the laboratory, where the artifacts of alchemy had come alive again with Úrsula’s blessing. José Arcadio Buendía received his errant son with joy and initiated him in the search for the philosopher’s stone, which he had finally undertaken. One afternoon the boys grew enthusiastic over the flying carpet that went swiftly by the laboratory at window level carrying the gypsy who was driving it and several children from the village who were merrily waving their hands, but José Arcadio Buendía did not even look at it. 
“Let them dream,” he said. “We’ll do better flying than they are doing, and with more scientific resources than a miserable bedspread.” In spite of his feigned interest, José Arcadio never understood the powers of the philosopher’s egg, which to him looked like a poorly blown bottle. He did not succeed in escaping from his worries. He lost his appetite and he could not sleep. He fell into an ill humor, the same as his father’s over the failure of his undertakings, and such was his upset that José Arcadio Buendía himself relieved him of his duties in the laboratory, thinking that he had taken alchemy too much to heart. Aureliano, of course, understood that his brother’s affliction did not have its source in the search for the philosopher’s stone but he could not get into his confidence. He had lost his former spontaneity. From an accomplice and a communicative person he had become withdrawn and hostile. Anxious for solitude, bitten by a virulent rancor against the world, one night he left his bed as usual, but he did not go to Pilar Ternera’s house, but to mingle in the tumult of the fair. After wandering about among all kinds of contraptions without becoming interested in any of them, he spotted something that was not a part of it all: a very young gypsy girl, almost a child, who was weighted down by beads and was the most beautiful woman that José Arcadio had ever seen in his life. She was in the crowd that was witnessing the sad spectacle of the man who had been turned into a snake for having disobeyed his parents. José Arcadio paid no attention. While the sad interrogation of the snake-man was taking place, he made his way through the crowd up to the front row, where the gypsy girl was, and he stopped behind her. He pressed against her back. The girl tried to separate herself, but José Arcadio pressed more strongly against her back. Then she felt him. She remained motionless against him, trembling with surprise and fear, unable to believe the evidence, and finally she turned her head and looked at him with a tremulous smile. At that instant two gypsies put the snake-man into his cage and carried him into the tent. The gypsy who was conducting the show announced: “And now, ladies and gentlemen, we are going to show the terrible test of the woman who must have her head chopped off every night at this time for one hundred and fifty years as punishment for having seen what she should not have.” José Arcadio and the gypsy girl did not witness the decapitation. They went to her tent, where they kissed each other with a desperate anxiety while they took off their clothes. The gypsy girl removed the starched lace corsets she had on and there she was, changed into practically nothing. She was a languid little frog, with incipient breasts and legs so thin that they did not even match the size of José Arcadio’s arms, but she had a decision and a warmth that compensated for her fragility. Nevertheless, José Arcadio could not respond to her because they were in a kind of public tent where the gypsies passed through with their circus things and did their business, and would even tarry by the bed for a game of dice. The lamp hanging from the center pole lighted the whole place up. During a pause in the caresses, José Arcadio stretched out naked on the bed without knowing what to do, while the girl tried to inspire him. 
A gypsy woman with splendid flesh came in a short time after, accompanied by a man who was not of the caravan but who was not from the village either, and they both began to undress in front of the bed. Without meaning to, the woman looked at José Arcadio and examined his magnificent animal in repose with a kind of pathetic fervor. “My boy,” she exclaimed, “may God preserve you just as you are.” José Arcadio’s companion asked them to leave them alone, and the couple lay down on the ground, close to the bed. The passion of the others woke up José Arcadio’s fervor. On the first contact the bones of the girl seemed to become disjointed with a disorderly crunch like the sound of a box of dominoes, and her skin broke out into a pale sweat and her eyes filled with tears as her whole body exhaled a lugubrious lament and a vague smell of mud. But she bore the impact with a firmness of character and a bravery that were admirable. José Arcadio felt himself lifted up into the air toward a state of seraphic inspiration, where his heart burst forth with an outpouring of tender obscenities that entered the girl through her ears and came out of her mouth translated into her language. It was Thursday. On Saturday night, José Arcadio wrapped a red cloth around his head and left with the gypsies. When Úrsula discovered his absence she searched for him all through the village. In the remains of the gypsy camp there was nothing but a garbage pit among the still smoking ashes of the extinguished campfires. Someone who was there looking for beads among the trash told Úrsula that the night before he had seen her son in the tumult of the caravan pushing the snake-man’s cage on a cart. “He’s become a gypsy,” she shouted to her husband, who had not shown the slightest sign of alarm over the disappearance. “I hope it’s true,” José Arcadio Buendía said, grinding in his mortar the material that had been ground a thousand times and reheated and ground again. “That way he’ll learn to be a man.” Úrsula asked where the gypsies had gone. She went along asking and following the road she had been shown, thinking that she still had time to catch up to them. She kept getting farther away from the village until she felt so far away that she did not think about returning. José Arcadio Buendía did not discover that his wife was missing until eight o’clock at night, when he left the material warming in a bed of manure and went to see what was wrong with little Amaranta, who was getting hoarse from crying. In a few hours he gathered a group of well-equipped men, put Amaranta in the hands of a woman who offered to nurse her, and was lost on invisible paths in pursuit of Úrsula. Aureliano went with them. Some Indian fishermen, whose language they could not understand, told them with signs that they had not seen anyone pass. After three days of useless searching they returned to the village. For several weeks José Arcadio Buendía let himself be overcome by consternation. He took care of little Amaranta like a mother. He bathed and dressed her, took her to be nursed four times a day, and even sang to her at night the songs that Úrsula never knew how to sing. On a certain occasion Pilar Ternera volunteered to do the household chores until Úrsula came back. Aureliano, whose mysterious intuition had become sharpened with the misfortune, felt a glow of clairvoyance when he saw her come in. 
Then he knew that in some inexplicable way she was to blame for his brother’s flight and the consequent disappearance of his mother, and he harassed her with a silent and implacable hostility in such a way that the woman did not return to the house. Time put things in their place. José Arcadio Buendía and his son did not know exactly when they returned to the laboratory, dusting things, lighting the water pipe, involved once more in the patient manipulation of the material that had been sleeping for several months in its bed of manure. Even Amaranta, lying in a wicker basket, observed with curiosity the absorbing work of her father and her brother in the small room where the air was rarefied by mercury vapors. On a certain occasion, months after Úrsula’s departure, strange things began to happen. An empty flask that had been forgotten in a cupboard for a long time became so heavy that it could not be moved. A pan of water on the worktable boiled without any fire under it for a half hour until it completely evaporated. José Arcadio Buendía and his son observed those phenomena with startled excitement, unable to explain them but interpreting them as predictions of the material. One day Amaranta’s basket began to move by itself and made a complete turn about the room, to the consternation of Aureliano, who hurried to stop it. But his father did not get upset. He put the basket in its place and tied it to the leg of a table, convinced that the long-awaited event was imminent. It was on that occasion that Aureliano heard him say: “If you don’t fear God, fear him through the metals.” Suddenly, almost five months after her disappearance, Úrsula came back. She arrived exalted, rejuvenated, with new clothes in a style that was unknown in the village. José Arcadio Buendía could barely stand up under the impact. “That was it!” he shouted. “I knew it was going to happen.” And he really believed it, for during his prolonged imprisonment as he manipulated the material, he begged in the depth of his heart that the longed-for miracle should not be the discovery of the philosopher’s stone, or the freeing of the breath that makes metals live, or the faculty to convert the hinges and the locks of the house into gold, but what had just happened: Úrsula’s return. But she did not share his excitement. She gave him a conventional kiss, as if she had been away only an hour, and she told him: “Look out the door.” José Arcadio Buendía took a long time to get out of his perplexity when he went out into the street and saw the crowd. They were not gypsies. They were men and women like them, with straight hair and dark skin, who spoke the same language and complained of the same pains. They had mules loaded down with things to eat, oxcarts with furniture and domestic utensils, pure and simple earthly accessories put on sale without any fuss by peddlers of everyday reality. They came from the other side of the swamp, only two days away, where there were towns that received mail every month in the year and where they were familiar with the implements of good living. Úrsula had not caught up with the gypsies, but she had found the route that her husband had been unable to discover in his frustrated search for the great inventions. PILAR TERNERA’S son was brought to his grandparents’ house two weeks after he was born. 
Úrsula admitted him grudgingly, conquered once more by the obstinacy of her husband, who could not tolerate the idea that an offshoot of his blood should be adrift, but he imposed the condition that the child should never know his true identity. Although he was given the name José Arcadio, they ended up calling him simply Arcadio so as to avoid confusion. At that time there was so much activity in the town and so much bustle in the house that the care of the children was relegated to a secondary level. They were put in the care of Visitación, a Guajiro Indian woman who had arrived in town with a brother in flight from a plague of insomnia that had been scourging their tribe for several years. They were both so docile and willing to help that Úrsula took them on to help her with her household chores. That was how Arcadio and Amaranta came to speak the Guajiro language before Spanish, and they learned to drink lizard broth and eat spider eggs without Úrsula’s knowing it, for she was too busy with a promising business in candy animals. Macondo had changed. The people who had come with Úrsula spread the news of the good quality of its soil and its privileged position with respect to the swamp, so that from the narrow village of past times it changed into an active town with stores and workshops and a permanent commercial route over which the first Arabs arrived with their baggy pants and rings in their ears, swapping glass beads for macaws. José Arcadio Buendía did not have a moment’s rest. Fascinated by an immediate reality that came to be more fantastic than the vast universe of his imagination, he lost all interest in the alchemist’s laboratory, put to rest the material that had become attenuated with months of manipulation, and went back to being the enterprising man of earlier days when he had decided upon the layout of the streets and the location of the new houses so that no one would enjoy privileges that everyone did not have. He acquired such authority among the new arrivals that foundations were not laid or walls built without his being consulted, and it was decided that he should be the one in charge of the distribution of the land. When the acrobat gypsies returned, with their vagabond carnival transformed now into a gigantic organization of games of luck and chance, they were received with great joy, for it was thought that José Arcadio would be coming back with them. But José Arcadio did not return, nor did they come with the snake-man, who, according to what Úrsula thought, was the only one who could tell them about their son, so the gypsies were not allowed to camp in town or set foot in it in the future, for they were considered the bearers of concupiscence and perversion. José Arcadio Buendía, however, was explicit in maintaining that the old tribe of Melquíades, who had contributed so much to the growth of the village with his age-old wisdom and his fabulous inventions, would always find the gates open. But Melquíades’ tribe, according to what the wanderers said, had been wiped off the face of the earth because they had gone beyond the limits of human knowledge. Emancipated for the moment at least from the torment of fantasy, José Arcadio Buendía in a short time set up a system of order and work which allowed for only one bit of license: the freeing of the birds, which, since the time of the founding, had made time merry with their flutes, and installing in their place musical clocks in every house. 
They were wondrous clocks made of carved wood, which the Arabs had traded for macaws and which José Arcadio Buendía had synchronized with such precision that every half hour the town grew merry with the progressive chords of the same song until it reached the climax of a noontime that was as exact and unanimous as a complete waltz. It was also José Arcadio Buendía who decided during those years that they should plant almond trees instead of acacias on the streets, and who discovered, without ever revealing it, a way to make them live forever. Many years later, when Macondo was a field of wooden houses with zinc roofs, the broken and dusty almond trees still stood on the oldest streets, although no one knew who had planted them. While his father was putting the town in order and his mother was increasing their wealth with her marvelous business of candied little roosters and fish, which left the house twice a day strung along sticks of balsa wood, Aureliano spent interminable hours in the abandoned laboratory, learning the art of silverwork by his own experimentation. He had shot up so fast that in a short time the clothing left behind by his brother no longer fit him and he began to wear his father’s, but Visitación had to sew pleats in the shirt and darts in the pants, because Aureliano had not inherited the corpulence of the others. Adolescence had taken away the softness of his voice and had made him silent and definitely solitary, but, on the other hand, it had restored the intense expression that he had had in his eyes when he was born. He concentrated so much on his experiments in silverwork that he scarcely left the laboratory to eat. Worried over his inner withdrawal, José Arcadio Buendía gave him the keys to the house and a little money, thinking that perhaps he needed a woman. But Aureliano spent the money on muriatic acid to prepare some aqua regia and he beautified the keys by plating them with gold. His excesses were hardly comparable to those of Arcadio and Amaranta, who had already begun to get their second teeth and still went about all day clutching at the Indians’ cloaks, stubborn in their decision not to speak Spanish but the Guajiro language. “You shouldn’t complain,” Úrsula told her husband. “Children inherit their parents’ madness.” And as she was lamenting her misfortune, convinced that the wild behavior of her children was something as fearful as a pig’s tail, Aureliano gave her a look that wrapped her in an atmosphere of uncertainty. “Somebody is coming,” he told her. Úrsula, as she did whenever he made a prediction, tried to break it down with her housewifely logic. It was normal for someone to be coming. Dozens of strangers came through Macondo every day without arousing suspicion or secret ideas. Nevertheless, beyond all logic, Aureliano was sure of his prediction. “I don’t know who it will be,” he insisted, “but whoever it is is already on the way.” That Sunday, in fact, Rebeca arrived. She was only eleven years old. She had made the difficult trip from Manaure with some hide dealers who had taken on the task of delivering her along with a letter to José Arcadio Buendía, but they could not explain precisely who the person was who had asked the favor. Her entire baggage consisted of a small trunk, a little rocking chair with small handpainted flowers, and a canvas sack which kept making a cloc-cloc-cloc sound, where she carried her parents’ bones. 
The letter addressed to José Arcadio Buendía was written in very warm terms by someone who still loved him very much in spite of time and distance, and who felt obliged by a basic humanitarian feeling to do the charitable thing and send him that poor unsheltered orphan, who was a second cousin of Úrsula’s and consequently also a relative of José Arcadio Buendía, although farther removed, because she was the daughter of that unforgettable friend Nicanor Ulloa and his very worthy wife Rebeca Montiel, may God keep them in His holy kingdom, whose remains the girl was carrying so that they might be given Christian burial. The names mentioned, as well as the signature on the letter, were perfectly legible, but neither José Arcadio Buendía nor Úrsula remembered having any relatives with those names, nor did they know anyone by the name of the sender of the letter, much less the remote village of Manaure. It was impossible to obtain any further information from the girl. From the moment she arrived she had been sitting in the rocker, sucking her finger and observing everyone with her large, startled eyes without giving any sign of understanding what they were asking her. She wore a diagonally striped dress that had been dyed black, worn by use, and a pair of scaly patent leather boots. Her hair was held behind her ears with bows of black ribbon. She wore a scapular with the images worn away by sweat, and on her right wrist the fang of a carnivorous animal mounted on a backing of copper as an amulet against the evil eye. Her greenish skin, her stomach, round and tense as a drum, revealed poor health and hunger that were older than she was, but when they gave her something to eat she kept the plate on her knees without tasting anything. They even began to think that she was a deaf-mute until the Indians asked her in their language if she wanted some water and she moved her eyes as if she recognized them and said yes with her head. They kept her, because there was nothing else they could do. They decided to call her Rebeca, which according to the letter was her mother’s name, because Aureliano had the patience to read to her the names of all the saints and he did not get a reaction from any one of them. Since there was no cemetery in Macondo at that time, for no one had died up till then, they kept the bag of bones to wait for a worthy place of burial, and for a long time it got in the way everywhere and would be found where least expected, always with its clucking of a broody hen. A long time passed before Rebeca became incorporated into the life of the family. She would sit in her small rocker sucking her finger in the most remote corner of the house. Nothing attracted her attention except the music of the clocks, which she would look for every half hour with her frightened eyes as if she hoped to find it someplace in the air. They could not get her to eat for several days. No one understood why she had not died of hunger until the Indians, who were aware of everything, for they went ceaselessly about the house on their stealthy feet, discovered that Rebeca only liked to eat the damp earth of the courtyard and the cake of whitewash that she picked off the walls with her nails. It was obvious that her parents, or whoever had raised her, had scolded her for that habit because she did it secretively and with a feeling of guilt, trying to put away supplies so that she could eat when no one was looking. From then on they put her under an implacable watch. 
They threw cow gall onto the courtyard and rubbed hot chili on the walls, thinking they could defeat her pernicious vice with those methods, but she showed such signs of astuteness and ingenuity to find some earth that Úrsula found herself forced to use more drastic methods. She put some orange juice and rhubarb into a pan that she left in the dew all night and she gave her the dose the following day on an empty stomach. Although no one had told her that it was the specific remedy for the vice of eating earth, she thought that any bitter substance in an empty stomach would have to make the liver react. Rebeca was so rebellious and strong in spite of her frailness that they had to tie her up like a calf to make her swallow the medicine, and they could barely keep back her kicks or bear up under the strange hieroglyphics that she alternated with her bites and spitting, and that, according to what the scandalized Indians said, were the vilest obscenities that one could ever imagine in their language. When Úrsula discovered that, she added whipping to the treatment. It was never established whether it was the rhubarb or the beatings that had effect, or both of them together, but the truth was that in a few weeks Rebeca began to show signs of recovery. She took part in the games of Arcadio and Amaranta, who treated her like an older sister, and she ate heartily, using the utensils properly. It was soon revealed that she spoke Spanish with as much fluency as the Indian language, that she had a remarkable ability for manual work, and that she could sing the waltz of the clocks with some very funny words that she herself had invented. It did not take long for them to consider her another member of the family. She was more affectionate to Úrsula than any of her own children had been, and she called Arcadio and Amaranta brother and sister, Aureliano uncle, and José Arcadio Buendía grandpa. So that she finally deserved, as much as the others, the name of Rebeca Buendía, the only one that she ever had and that she bore with dignity until her death. One night about the time that Rebeca was cured of the vice of eating earth and was brought to sleep in the other children’s room, the Indian woman, who slept with them, awoke by chance and heard a strange, intermittent sound in the corner. She got up in alarm, thinking that an animal had come into the room, and then she saw Rebeca in the rocker, sucking her finger and with her eyes lighted up in the darkness like those of a cat. Terrified, exhausted by her fate, Visitación recognized in those eyes the symptoms of the sickness whose threat had obliged her and her brother to exile themselves forever from an age-old kingdom where they had been prince and princess. It was the insomnia plague. Cataure, the Indian, was gone from the house by morning. His sister stayed because her fatalistic heart told her that the lethal sickness would follow her, no matter what, to the farthest corner of the earth. No one understood Visitación’s alarm. “If we don’t ever sleep again, so much the better,” José Arcadio Buendía said in good humor. “That way we can get more out of life.” But the Indian woman explained that the most fearsome part of the sickness of insomnia was not the impossibility of sleeping, for the body did not feel any fatigue at all, but its inexorable evolution toward a more critical manifestation: a loss of memory. 
She meant that when the sick person became used to his state of vigil, the recollection of his childhood began to be erased from his memory, then the name and notion of things, and finally the identity of people and even the awareness of his own being, until he sank into a kind of idiocy that had no past. José Arcadio Buendía, dying with laughter, thought that it was just a question of one of the many illnesses invented by the Indians’ superstitions. But Úrsula, just to be safe, took the precaution of isolating Rebeca from the other children. After several weeks, when Visitación’s terror seemed to have died down, José Arcadio Buendía found himself rolling over in bed, unable to fall asleep. Úrsula, who had also awakened, asked him what was wrong, and he answered: “I’m thinking about Prudencio Aguilar again.” They did not sleep a minute, but the following day they felt so rested that they forgot about the bad night. Aureliano commented with surprise at lunchtime that he felt very well in spite of the fact that he had spent the whole night in the laboratory gilding a brooch that he planned to give to Úrsula for her birthday. They did not become alarmed until the third day, when no one felt sleepy at bedtime and they realized that they had gone more than fifty hours without sleeping. “The children are awake too,” the Indian said with her fatalistic conviction. “Once it gets into a house no one can escape the plague.” They had indeed contracted the illness of insomnia. Úrsula, who had learned from her mother the medicinal value of plants, prepared and made them all drink a brew of monkshood, but they could not get to sleep and spent the whole day dreaming on their feet. In that state of hallucinated lucidity, not only did they see the images of their own dreams, but some saw the images dreamed by others. It was as if the house were full of visitors. Sitting in her rocker in a corner of the kitchen, Rebeca dreamed that a man who looked very much like her, dressed in white linen and with his shirt collar closed by a gold button, was bringing her a bouquet of roses. He was accompanied by a woman with delicate hands who took out one rose and put it in the child’s hair. Úrsula understood that the man and woman were Rebeca’s parents, but even though she made a great effort to recognize them, she confirmed her certainty that she had never seen them. In the meantime, through an oversight that José Arcadio Buendía never forgave himself for, the candy animals made in the house were still being sold in the town. Children and adults sucked with delight on the delicious little green roosters of insomnia, the exquisite pink fish of insomnia, and the tender yellow ponies of insomnia, so that dawn on Monday found the whole town awake. No one was alarmed at first. On the contrary, they were happy at not sleeping because there was so much to do in Macondo in those days that there was barely enough time. They worked so hard that soon they had nothing else to do and they could be found at three o’clock in the morning with their arms crossed, counting the notes in the waltz of the clock. Those who wanted to sleep, not from fatigue but because of the nostalgia for dreams, tried all kinds of methods of exhausting themselves. 
They would gather together to converse endlessly, to tell over and over for hours on end the same jokes, to complicate to the limits of exasperation the story about the capon, which was an endless game in which the narrator asked if they wanted him to tell them the story about the capon, and when they answered yes, the narrator would say that he had not asked them to say yes, but whether they wanted him to tell them the story about the capon, and when they answered no, the narrator told them that he had not asked them to say no, but whether they wanted him to tell them the story about the capon, and when they remained silent the narrator told them that he had not asked them to remain silent but whether they wanted him to tell them the story about the capon, and no one could leave because the narrator would say that he had not asked them to leave but whether they wanted him to tell them the story about the capon, and so on and on in a vicious circle that lasted entire nights. When José Arcadio Buendía realized that the plague had invaded the town, he gathered together the heads of families to explain to them what he knew about the sickness of insomnia, and they agreed on methods to prevent the scourge from spreading to other towns in the swamp. That was why they took the bells off the goats, bells that the Arabs had swapped them for macaws, and put them at the entrance to town at the disposal of those who would not listen to the advice and entreaties of the sentinels and insisted on visiting the town. All strangers who passed through the streets of Macondo at that time had to ring their bells so that the sick people would know that they were healthy. They were not allowed to eat or drink anything during their stay, for there was no doubt but that the illness was transmitted by mouth, and all food and drink had been contaminated by insomnia. In that way they kept the plague restricted to the perimeter of the town. So effective was the quarantine that the day came when the emergency situation was accepted as a natural thing and life was organized in such a way that work picked up its rhythm again and no one worried any more about the useless habit of sleeping. It was Aureliano who conceived the formula that was to protect them against loss of memory for several months. He discovered it by chance. An expert insomniac, having been one of the first, he had learned the art of silverwork to perfection. One day he was looking for the small anvil that he used for laminating metals and he could not remember its name. His father told him: “Stake.” Aureliano wrote the name on a piece of paper that he pasted to the base of the small anvil: stake. In that way he was sure of not forgetting it in the future. It did not occur to him that this was the first manifestation of a loss of memory, because the object had a difficult name to remember. But a few days later he discovered that he had trouble remembering almost every object in the laboratory. Then he marked them with their respective names so that all he had to do was read the inscription in order to identify them. When his father told him about his alarm at having forgotten even the most impressive happenings of his childhood, Aureliano explained his method to him, and José Arcadio Buendía put it into practice all through the house and later on imposed it on the whole village. With an inked brush he marked everything with its name: table, chair, clock, door, wall, bed, pan. 
He went to the corral and marked the animals and plants: cow, goat, pig, hen, cassava, caladium, banana. Little by little, studying the infinite possibilities of a loss of memory, he realized that the day might come when things would be recognized by their inscriptions but that no one would remember their use. Then he was more explicit. The sign that he hung on the neck of the cow was an exemplary proof of the way in which the inhabitants of Macondo were prepared to fight against loss of memory: This is the cow. She must be milked every morning so that she will produce milk, and the milk must be boiled in order to be mixed with coffee to make coffee and milk. Thus they went on living in a reality that was slipping away, momentarily captured by words, but which would escape irremediably when they forgot the values of the written letters. At the beginning of the road into the swamp they put up a sign that said MACONDO and another larger one on the main street that said GOD EXISTS. In all the houses keys to memorizing objects and feelings had been written. But the system demanded so much vigilance and moral strength that many succumbed to the spell of an imaginary reality, one invented by themselves, which was less practical for them but more comforting. Pilar Ternera was the one who contributed most to popularize that mystification when she conceived the trick of reading the past in cards as she had read the future before. By means of that recourse the insomniacs began to live in a world built on the uncertain alternatives of the cards, where a father was remembered faintly as the dark man who had arrived at the beginning of April and a mother was remembered only as the dark woman who wore a gold ring on her left hand, and where a birth date was reduced to the last Tuesday on which a lark sang in the laurel tree. Defeated by those practices of consolation, José Arcadio Buendía then decided to build the memory machine that he had desired once in order to remember the marvelous inventions of the gypsies. The artifact was based on the possibility of reviewing every morning, from beginning to end, the totality of knowledge acquired during one’s life. He conceived of it as a spinning dictionary that a person placed on the axis could operate by means of a lever, so that in a very few hours there would pass before his eyes the notions most necessary for life. He had succeeded in writing almost fourteen thousand entries when along the road from the swamp a strange-looking old man with the sad sleepers’ bell appeared, carrying a bulging suitcase tied with a rope and pulling a cart covered with black cloth. He went straight to the house of José Arcadio Buendía. Visitación did not recognize him when she opened the door and she thought he had come with the idea of selling something, unaware that nothing could be sold in a town that was sinking irrevocably into the quicksand of forgetfulness. He was a decrepit man. Although his voice was also broken by uncertainty and his hands seemed to doubt the existence of things, it was evident that he came from the world where men could still sleep and remember. José Arcadio Buendía found him sitting in the living room fanning himself with a patched black hat as he read with compassionate attention the signs pasted to the walls. He greeted him with a broad show of affection, afraid that he had known him at another time and that he did not remember him now. 
But the visitor was aware of his falseness. He felt himself forgotten, not with the irremediable forgetfulness of the heart, but with a different kind of forgetfulness, which was more cruel and irrevocable and which he knew very well because it was the forgetfulness of death. Then he understood. He opened the suitcase crammed with indecipherable objects and from among them he took out a little case with many flasks. He gave José Arcadio Buendía a drink of a gentle color and the light went on in his memory. His eyes became moist from weeping even before he noticed himself in an absurd living room where objects were labeled and before he was ashamed of the solemn nonsense written on the walls, and even before he recognized the newcomer with a dazzling glow of joy. It was Melquíades. While Macondo was celebrating the recovery of its memory, José Arcadio Buendía and Melquíades dusted off their old friendship. The gypsy was inclined to stay in the town. He really had been through death, but he had returned because he could not bear the solitude. Repudiated by his tribe, having lost all of his supernatural faculties because of his faithfulness to life, he decided to take refuge in that corner of the world which had still not been discovered by death, dedicated to the operation of a daguerreotype laboratory. José Arcadio Buendía had never heard of that invention. But when he saw himself and his whole family fastened onto a sheet of iridescent metal for an eternity, he was mute with stupefaction. That was the date of the oxidized daguerreotype in which José Arcadio Buendía appeared with his bristly and graying hair, his cardboard collar attached to his shirt by a copper button, and an expression of startled solemnity, whom Úrsula described, dying with laughter, as a “frightened general.” José Arcadio Buendía was, in fact, frightened on that clear December morning when the daguerreotype was made, for he was thinking that people were slowly wearing away while his image would endure on a metallic plaque. Through a curious reversal of custom, it was Úrsula who got that idea out of his head, as it was also she who forgot her ancient bitterness and decided that Melquíades would stay on in the house, although she never permitted them to make a daguerreotype of her because (according to her very words) she did not want to survive as a laughingstock for her grandchildren. That morning she dressed the children in their best clothes, powdered their faces, and gave a spoonful of marrow syrup to each one so that they would all remain absolutely motionless during the nearly two minutes in front of Melquíades’ fantastic camera. In the family daguerreotype, the only one that ever existed, Aureliano appeared dressed in black velvet between Amaranta and Rebeca. He had the same languor and the same clairvoyant look that he would have years later as he faced the firing squad. But he still had not sensed the premonition of his fate. He was an expert silversmith, praised all over the swampland for the delicacy of his work. In the workshop, which he shared with Melquíades’ mad laboratory, he could barely be heard breathing. He seemed to be taking refuge in some other time, while his father and the gypsy with shouts interpreted the predictions of Nostradamus amidst a noise of flasks and trays and the disaster of spilled acids and silver bromide that was lost in the twists and turns it gave at every instant. 
That dedication to his work, the good judgment with which he directed his attention, had allowed Aureliano to earn in a short time more money than Úrsula had with her delicious candy fauna, but everybody thought it strange that he was now a full-grown man and had not known a woman. It was true that he had never had one. Several months later saw the return of Francisco the Man, an ancient vagabond who was almost two hundred years old and who frequently passed through Macondo distributing songs that he composed himself. In them Francisco the Man told in great detail the things that had happened in the towns along his route, from Manaure to the edge of the swamp, so that if anyone had a message to send or an event to make public, he would pay him two cents to include it in his repertory. That was how Úrsula learned of the death of her mother, as a simple consequence of listening to the songs in the hope that they would say something about her son José Arcadio. Francisco the Man, called that because he had once defeated the devil in a duel of improvisation, and whose real name no one knew, disappeared from Macondo during the insomnia plague and one night he appeared suddenly in Catarino’s store. The whole town went to listen to him to find out what had happened in the world. On that occasion there arrived with him a woman who was so fat that four Indians had to carry her in a rocking chair, and an adolescent mulatto girl with a forlorn look who protected her from the sun with an umbrella. Aureliano went to Catarino’s store that night. He found Francisco the Man, like a monolithic chameleon, sitting in the midst of a circle of bystanders. He was singing the news with his old, out-of-tune voice, accompanying himself with the same archaic accordion that Sir Walter Raleigh had given him in the Guianas and keeping time with his great walking feet that were cracked from saltpeter. In front of a door at the rear through which men were going and coming, the matron of the rocking chair was sitting and fanning herself in silence. Catarino, with a felt rose behind his ear, was selling the gathering mugs of fermented cane juice, and he took advantage of the occasion to go over to the men and put his hand on them where he should not have. Toward midnight the heat was unbearable. Aureliano listened to the news to the end without hearing anything that was of interest to his family. He was getting ready to go home when the matron signaled him with her hand. “You go in too,” she told him. “It only costs twenty cents.” Aureliano threw a coin into the hopper that the matron had in her lap and went into the room without knowing why. The adolescent mulatto girl, with her small bitch’s teats, was naked on the bed. Before Aureliano sixty-three men had passed through the room that night. From being used so much, kneaded with sweat and sighs, the air in the room had begun to turn to mud. The girl took off the soaked sheet and asked Aureliano to hold it by one side. It was as heavy as a piece of canvas. They squeezed it, twisting it at the ends until it regained its natural weight. They turned over the mat and the sweat came out of the other side. Aureliano was anxious for that operation never to end. He knew the theoretical mechanics of love, but he could not stay on his feet because of the weakness of his knees, and although he had goose pimples on his burning skin he could not resist the urgent need to expel the weight of his bowels. 
When the girl finished fixing up the bed and told him to get undressed, he gave her a confused explanation: “They made me come in. They told me to throw twenty cents into the hopper and hurry up.” The girl understood his confusion. “If you throw in twenty cents more when you go out, you can stay a little longer,” she said softly. Aureliano got undressed, tormented by shame, unable to get rid of the idea that his nakedness could not stand comparison with that of his brother. In spite of the girl’s efforts he felt more and more indifferent and terribly alone. “I’ll throw in another twenty cents,” he said with a desolate voice. The girl thanked him in silence. Her back was raw. Her skin was stuck to her ribs and her breathing was forced because of an immeasurable exhaustion. Two years before, far away from there, she had fallen asleep without putting out the candle and had awakened surrounded by flames. The house where she lived with the grandmother who had raised her was reduced to ashes. Since then her grandmother carried her from town to town, putting her to bed for twenty cents in order to make up the value of the burned house. According to the girl’s calculations, she still had ten years of seventy men per night, because she also had to pay the expenses of the trip and food for both of them as well as the pay of the Indians who carried the rocking chair. When the matron knocked on the door the second time, Aureliano left the room without having done anything, troubled by a desire to weep. That night he could not sleep, thinking about the girl, with a mixture of desire and pity. He felt an irresistible need to love her and protect her. At dawn, worn out by insomnia and fever, he made the calm decision to marry her in order to free her from the despotism of her grandmother and to enjoy all the nights of satisfaction that she would give the seventy men. But at ten o’clock in the morning, when he reached Catarino’s store, the girl had left town. Time mitigated his mad proposal, but it aggravated his feelings of frustration. He took refuge in work. He resigned himself to being a womanless man for all his life in order to hide the shame of his uselessness. In the meantime, Melquíades had printed on his plates everything that was printable in Macondo, and he left the daguerreotype laboratory to the fantasies of José Arcadio Buendía who had resolved to use it to obtain scientific proof of the existence of God. Through a complicated process of superimposed exposures taken in different parts of the house, he was sure that sooner or later he would get a daguerreotype of God, if He existed, or put an end once and for all to the supposition of His existence. Melquíades got deeper into his interpretations of Nostradamus. He would stay up until very late, suffocating in his faded velvet vest, scribbling with his tiny sparrow hands, whose rings had lost the glow of former times. One night he thought he had found a prediction of the future of Macondo. It was to be a luminous city with great glass houses where there was no trace remaining of the race of the Buendía. “It’s a mistake,” José Arcadio Buendía thundered. 
“They won’t be houses of glass but of ice, as I dreamed, and there will always be a Buendía, per omnia secula seculorum.” Úrsula fought to preserve common sense in that extravagant house, having broadened her business of little candy animals with an oven that went all night turning out baskets and more baskets of bread and a prodigious variety of puddings, meringues, and cookies, which disappeared in a few hours on the roads winding through the swamp. She had reached an age where she had a right to rest, but she was nonetheless more and more active. So busy was she in her prosperous enterprises that one afternoon she looked distractedly toward the courtyard while the Indian woman helped her sweeten the dough and she saw two unknown and beautiful adolescent girls doing frame embroidery in the light of the sunset. They were Rebeca and Amaranta. As soon as they had taken off the mourning clothes for their grandmother, which they wore with inflexible rigor for three years, their bright clothes seemed to have given them a new place in the world. Rebeca, contrary to what might have been expected, was the more beautiful. She had a light complexion, large and peaceful eyes, and magical hands that seemed to work out the design of the embroidery with invisible threads. Amaranta, the younger, was somewhat graceless, but she had the natural distinction, the inner tightness of her dead grandmother. Next to them, although he was already revealing the physical drive of his father, Arcadio looked like a child. He set about learning the art of silverwork with Aureliano, who had also taught him how to read and write. Úrsula suddenly realized that the house had become full of people, that her children were on the point of marrying and having children, and that they would be obliged to scatter for lack of space. Then she took out the money she had accumulated over long years of hard labor, made some arrangements with her customers, and undertook the enlargement of the house. She had a formal parlor for visits built, another one that was more comfortable and cool for daily use, a dining room with a table with twelve places where the family could sit with all of their guests, nine bedrooms with windows on the courtyard and a long porch protected from the heat of noon by a rose garden with a railing on which to place pots of ferns and begonias. She had the kitchen enlarged to hold two ovens. The granary where Pilar Ternera had read José Arcadio’s future was torn down and another twice as large built so that there would never be a lack of food in the house. She had baths built is the courtyard in the shade of the chestnut tree, one for the women and another for the men, and in the rear a large stable, a fenced-in chicken yard, a shed for the milk cows, and an aviary open to the four winds so that wandering birds could roost there at their pleasure. Followed by dozens of masons and carpenters, as if she had contracted her husband’s hallucinating fever, Úrsula fixed the position of light and heat and distributed space without the least sense of its limitations. The primitive building of the founders became filled with tools and materials, of workmen exhausted by sweat, who asked everybody please not to molest them, exasperated by the sack of bones that followed them everywhere with its dull rattle. 
In that discomfort, breathing quicklime and tar, no one could see very well how from the bowels of the earth there was rising not only the largest house is the town, but the most hospitable and cool house that had ever existed in the region of the swamp. José Buendía, trying to surprise Divine Providence in the midst of the cataclysm, was the one who least understood it. The new house was almost finished when Úrsula drew him out of his chimerical world in order to inform him that she had an order to paint the front blue and not white as they had wanted. She showed him the official document. José Arcadio Buendía, without understanding what his wife was talking about, deciphered the signature. “Who is this fellow?” he asked: “The magistrate,” Úrsula answered disconsolately. They say he’s an authority sent by the government.” Don Apolinar Moscote, the magistrate, had arrived in Macondo very quietly. He put up at the Hotel Jacob—built by one of the first Arabs who came to swap knickknacks for macaws—and on the following day he rented a small room with a door on the street two blocks away from the Buendía house. He set up a table and a chair that he had bought from Jacob, nailed up on the wall the shield of the republic that he had brought with him, and on the door he painted the sign: Magistrate. His first order was for all the houses to be painted blue in celebration of the anniversary of national independence. José Arcadio Buendía, with the copy of the order in his hand, found him taking his nap in a hammock he had set up in the narrow office. “Did you write this paper?” he asked him. Don Apolinar Moscote, a mature man, timid, with a ruddy complexion, said yes. “By what right?” José Arcadio Buendía asked again. Don Apolinar Moscote picked up a paper from the drawer of the table and showed it to him. “I have been named magistrate of this town.” José Arcadio Buendía did not even look at the appointment. “In this town we do not give orders with pieces of paper,” he said without losing his calm. “And so that you know it once and for all, we don’t need any judge here because there’s nothing that needs judging.” Facing Don Apolinar Moscote, still without raising his voice, he gave a detailed account of how they had founded the village, of how they had distributed the land, opened the roads, and introduced the improvements that necessity required without having bothered the government and without anyone having bothered them. “We are so peaceful that none of us has died even of a natural death,” he said. “You can see that we still don’t have any cemetery.” No once was upset that the government had not helped them. On the contrary, they were happy that up until then it had let them grow in peace, and he hoped that it would continue leaving them that way, because they had not founded a town so that the first upstart who came along would tell them what to do. Don Apolinar had put on his denim jacket, white like his trousers, without losing at any moment the elegance of his gestures. “So that if you want to stay here like any other ordinary citizen, you’re quite welcome,” José Arcadio Buendía concluded. “But if you’ve come to cause disorder by making the people paint their houses blue, you can pick up your junk and go back where you came from. Because my house is going to be white, white, like a dove.” Don Apolinar Moscote turned pale. 
He took a step backward and tightened his jaws as he said with a certain affliction: “I must warn you that I’m armed.” José Arcadio Buendía did not know exactly when his hands regained the useful strength with which he used to pull down horses. He grabbed Don Apolinar Moscote by the lapels and lifted him up to the level of his eyes. “I’m doing this,” he said, “because I would rather carry you around alive and not have to keep carrying you around dead for the rest of my life.” In that way he carried him through the middle of the street, suspended by the lapels, until he put him down on his two feet on the swamp road. A week later he was back with six barefoot and ragged soldiers, armed with shotguns, and an oxcart in which his wife and seven daughters were traveling. Two other carts arrived later with the furniture, the baggage, and the household utensils. He settled his family in the Hotel Jacob, while he looked for a house, and he went back to open his office under the protection of the soldiers. The founders of Macondo, resolving to expel the invaders, went with their older sons to put themselves at the disposal of José Arcadio Buendía. But he was against it, as he explained, because it was not manly to make trouble for someone in front of his family, and Don Apolinar had returned with his wife and daughters. So he decided to resolve the situation in a pleasant way. Aureliano went with him. About that time he had begun to cultivate the black mustache with waxed tips and the somewhat stentorian voice that would characterize him in the war. Unarmed, without paying any attention to the guards, they went into the magistrate’s office. Don Apolinar Moscote did not lose his calm. He introduced them to two of his daughters who happened to be there: Amparo, sixteen, dark like her mother, and Remedios, only nine, a pretty little girl with lilycolored skin and green eyes. They were gracious and well-mannered. As soon as the men came in, before being introduced, they gave them chairs to sit on. But they both remained standing. “Very well, my friend,” José Arcadio Buendía said, “you may stay here, not because you have those bandits with shotguns at the door, but out of consideration for your wife and daughters.” Don Apolinar Moscote was upset, but José Arcadio Buendía did not give him time to reply. “We only make two conditions,” he went on. “The first: that everyone can paint his house the color he feels like. The second: that the soldiers leave at once. We will guarantee order for you.” The magistrate raised his right hand with all the fingers extended. “Your word of honor?” “The word of your enemy,” José Arcadio Buendía said. And he added in a bitter tone: “Because I must tell you one thing: you and I are still enemies.” The soldiers left that same afternoon. A few days later José Arcadio Buendía found a house for the magistrate’s family. Everybody was at peace except Aureliano. The image of Remedios, the magistrate’s younger daughter, who, because of her age, could have been his daughter, kept paining him in some part of his body. It was a physical sensation that almost bothered him when he walked, like a pebble in his shoe. THE NEW HOUSE, white, like a dove, was inaugurated with a dance. Úrsula had got that idea from the afternoon when she saw Rebeca and Amaranta changed into adolescents, and it could almost have been said that the main reason behind the construction was a desire to have a proper place for the girls to receive visitors. 
In order that nothing would be lacking in splendor she worked like a galley slave as the repairs were under way, so that before they were finished she had ordered costly necessities for the decorations, the table service, and the marvelous invention that was to arouse the astonishment of the town and the jubilation of the young people: the pianola. They delivered it broken down, packed in several boxes that were unloaded along with the Viennese furniture, the Bohemian crystal, the table service from the Indies Company, the tablecloths from Holland, and a rich variety of lamps and candlesticks, hangings and drapes. The import house sent along at its own expense an Italian expert, Pietro Crespi, to assemble and tune the pianola, to instruct the purchasers in its functioning, and to teach them how to dance the latest music printed on its six paper rolls. Pietro Crespi was young and blond, the most handsome and well mannered man who had ever been seen in Macondo, so scrupulous in his dress that in spite of the suffocating heat he would work in his brocade vest and heavy coat of dark cloth. Soaked in sweat, keeping a reverent distance from the owners of the house, he spent several weeks shut up is the parlor with a dedication much like that of Aureliano in his silverwork. One morning, without opening the door, without calling anyone to witness the miracle, he placed the first roll in the pianola and the tormenting hammering and the constant noise of wooden lathings ceased in a silence that was startled at the order and neatness of the music. They all ran to the parlor. José Arcadio Buendía was as if struck by lightning, not because of the beauty of the melody, but because of the automatic working of the keys of the pianola, and he set up Melquíades’ camera with the hope of getting a daguerreotype of the invisible player. That day the Italian had lunch with them. Rebeca and Amaranta, serving the table, were intimidated by the way in which the angelic man with pale and ringless hands manipulated the utensils. In the living room, next to the parlor, Pietro Crespi taught them how to dance. He showed them the steps without touching them, keeping time with a metronome, under the friendly eye of Úrsula, who did not leave the room for a moment while her daughters had their lesson. Pietro Crespi wore special pants on those days, very elastic and tight, and dancing slippers, “You don’t have to worry so much,” José Arcadio Buendía told her. “The man’s a fairy.” But she did not leave off her vigilance until the apprenticeship was over and the Italian left Macondo. Then they began to organize the party. Úrsula drew up a strict guest list, in which the only ones invited were the descendants of the founders, except for the family of Pilar Ternera, who by then had had two more children by unknown fathers. It was truly a high-class list, except that it was determined by feelings of friendship, for those favored were not only the oldest friends of José Arcadio Buendía’s house since before they undertook the exodus and the founding of Macondo, but also their sons and grandsons, who were the constant companions of Aureliano and Arcadio since infancy, and their daughters, who were the only ones who visited the house to embroider with Rebeca and Amaranta. Don Apolinar Moscote, the benevolent ruler whose activity had been reduced to the maintenance from his scanty resources of two policemen armed with wooden clubs, was a figurehead. 
In older to support the household expenses his daughters had opened a sewing shop, where they made felt flowers as well as guava delicacies, and wrote love notes to order. But in spite of being modest and hard-working, the most beautiful girls in Iowa, and the most skilled at the new dances, they did not manage to be considered for the party. While Úrsula and the girls unpacked furniture, polished silverware, and hung pictures of maidens in boats full of roses, which gave a breath of new life to the naked areas that the masons had built, José Arcadio Buendía stopped his pursuit of the image of God, convinced of His nonexistence, and he took the pianola apart in order to decipher its magical secret. Two days before the party, swamped in a shower of leftover keys and hammers, bungling in the midst of a mix-up of strings that would unroll in one direction and roll up again in the other, he succeeded in a fashion in putting the instrument back together. There had never been as many surprises and as much dashing about as in those days, but the new pitch lamps were lighted on the designated day and hour. The house was opened, still smelling of resin and damp whitewash, and the children and grandchildren of the founders saw the porch with ferns and begonias, the quiet rooms, the garden saturated with the fragrance of the roses, and they gathered together in the parlor, facing the unknown invention that had been covered with a white sheet. Those who were familiar with the piano, popular in other towns in the swamp, felt a little disheartened, but more bitter was Úrsula’s disappointment when she put in the first roll so that Amaranta and Rebeca could begin the dancing and the mechanism did not work. Melquíades, almost blind by then, crumbling with decrepitude, used the arts of his timeless wisdom in an attempt to fix it. Finally José Arcadio Buendía managed, by mistake, to move a device that was stuck and the music came out, first in a burst and then in a flow of mixed-up notes. Beating against the strings that had been put in without order or concert and had been tuned with temerity, the hammers let go. But the stubborn descendants of the twenty-one intrepid people who plowed through the mountains in search of the sea to the west avoided the reefs of the melodic mix-up and the dancing went on until dawn. Pietro Crespi came back to repair the pianola. Rebeca and Amaranta helped him put the strings in order and helped him with their laughter at the mix-up of the melodies. It was extremely pleasant and so chaste in its way that Úrsula ceased her vigilance. On the eve of his departure a farewell dance for him was improvised with the pianola and with Rebeca he put on a skillful demonstration of modern dance, Arcadio and Amaranta matched them in grace and skill. But the exhibition was interrupted because Pilar Ternera, who was at the door with the onlookers, had a fight, biting and hair pulling, with a woman who had dared to comment that Arcadio had a woman’s behind. Toward midnight Pietro Crespi took his leave with a sentimental little speech, and he promised to return very soon. Rebeca accompanied him to the door, and having closed up the house and put out the lamps, she went to her room to weep. It was an inconsolable weeping that lasted for several days, the cause of which was not known even by Amaranta. Her hermetism was not odd. Although she seemed expansive and cordial, she had a solitary character and an impenetrable heart. 
She was a splendid adolescent with long and firm bones, but she still insisted on using the small wooden rocking chair with which she had arrived at the house, reinforced many times and with the arms gone. No one had discovered that even at that age she still had the habit of sucking her finger. That was why she would not lose an opportunity to lock herself in the bathroom and had acquired the habit of sleeping with her face to the wall. On rainy afternoons, embroidering with a group of friends on the begonia porch, she would lose the thread of the conversation and a tear of nostalgia would salt her palate when she saw the strips of damp earth and the piles of mud that the earthworms had pushed up in the garden. Those secret tastes, defeated in the past by oranges and rhubarb, broke out into an irrepressible urge when she began to weep. She went back to eating earth. The first time she did it almost out of curiosity, sure that the bad taste would be the best cure for the temptation. And, in fact, she could not bear the earth in her mouth. But she persevered, overcome by the growing anxiety, and little by little she was getting back her ancestral appetite, the taste of primary minerals, the unbridled satisfaction of what was the original food. She would put handfuls of earth in her pockets, and ate them in small bits without being seen, with a confused feeling of pleasure and rage, as she instructed her girl friends in the most difficult needlepoint and spoke about other men, who did not deserve the sacrifice of having one eat the whitewash on the walls because of them. The handfuls of earth made the only man who deserved that show of degradation less remote and more certain, as if the ground that he walked on with his fine patent leather boots in another part of the world were transmitting to her the weight and the temperature of his blood in a mineral savor that left a harsh aftertaste in her mouth and a sediment of peace in her heart. One afternoon, for no reason, Amparo Moscote asked permission to see the house. Amaranta and Rebeca, disconcerted by the unexpected visit, attended her with a stiff formality. They showed her the remodeled mansion, they had her listen to the rolls on the pianola, and they offered her orange marmalade and crackers. Amparo gave a lesson in dignity, personal charm, and good manners that impressed Úrsula in the few moments that she was present during the visit. After two hours, when the conversation was beginning to wane, Amparo took advantage of Amaranta’s distraction and gave Rebeca a letter. She was able to see the name of the Estimable Señorita Rebeca Buendía, written in the same methodical hand, with the same green ink, and the same delicacy of words with which the instructions for the operation of the pianola were written, and she folded the letter with the tips of her fingers and hid it in her bosom, looking at Amparo Moscote with an expression of endless and unconditional gratitude and a silent promise of complicity unto death. The sudden friendship between Amparo Moscote and Rebeca Buendía awakened the hopes of Aureliano. The memory of little Remedios had not stopped tormenting him, but he had not found a chance to see her. When he would stroll through town with his closest friends, Magnífico Visbal and Gerineldo Márquez—the sons of the founders of the same names—he would look for her in the sewing shop with an anxious glance, but he saw only the older sisters. The presence of Amparo Moscote in the house was like a premonition. 
“She has to come with her,” Aureliano would say to himself in a low voice. “She has to come.” He repeated it so many times and with such conviction that one afternoon when he was putting together a little gold fish in the work shop, he had the certainty that she had answered his call. Indeed, a short time later he heard the childish voice, and when he looked up his heart froze with terror as he saw the girl at the door, dressed in pink organdy and wearing white boots. “You can’t go in there, Remedios, Amparo Moscote said from the hall. They’re working.” But Aureliano did not give her time to respond. He picked up the little fish by the chain that came through its mouth and said to her. “Come in.” Remedios went over and asked some questions about the fish that Aureliano could not answer because he was seized with a sudden attack of asthma. He wanted to stay beside that lily skin forever, beside those emerald eyes, close to that voice that called him “sir” with every question. showing the same respect that she gave her father. Melquíades was in the corner seated at the desk scribbling indecipherable signs. Aureliano hated him. All he could do was tell Remedios that he was going to give her the little fish and the girl was so startled by the offer that she left the workshop as fast as she could. That afternoon Aureliano lost the hidden patience with which he had waited for a chance to see her. He neglected his work. In several desperate efforts of concentration he willed her to appear but Remedios did not respond. He looked for her in her sisters’ shop, behind the window shades in her house, in her father’s office, but he found her only in the image that saturated his private and terrible solitude. He would spend whole hours with Rebeca in the parlor listening to the music on the pianola. She was listening to it because it was the music with which Pietro Crespi had taught them how to dance. Aureliano listened to it simply because everything, even music, reminded him of Remedios. The house became full of loves Aureliano expressed it in poetry that had no beginning or end. He would write it on the harsh pieces of parchment that Melquíades gave him, on the bathroom walls, on the skin of his arms, and in all of it Remedios would appear transfigured: Remedios in the soporific air of two in the afternoon, Remedios in the soft breath of the roses, Remedios in the water-clock secrets of the moths, Remedios in the steaming morning bread, Remedios everywhere and Remedios forever. Rebeca waited for her love at four in the afternoon, embroidering by the window. She knew that the mailman’s mule arrived only every two weeks, but she always waited for him, convinced that he was going to arrive on some other day by mistake. It happened quite the opposite: once the mule did not come on the usual day. Mad with desperation, Rebeca got up in the middle of the night and ate handfuls of earth in the garden with a suicidal drive, weeping with pain and fury, chewing tender earthworms and chipping her teeth on snail shells. She vomited until dawn. She fell into a state of feverish prostration, lost consciousness, and her heart went into a shameless delirium. Úrsula, scandalized, forced the lock on her trunk and found at the bottom, tied together with pink ribbons, the sixteen perfumed letters and the skeletons of leaves and petals preserved in old books and the dried butterflies that turned to powder at the touch. Aureliano was the only one capable of understanding such desolation. 
That afternoon, while Úrsula was trying to rescue Rebeca from the slough of delirium, he went with Magnífico Visbal and Gerineldo Márquez to Catarino’s store. The establishment had been expanded with a gallery of wooden rooms where single women who smelled of dead flowers lived. A group made up of an accordion and drums played the songs of Francisco the Man, who had not been seen in Macondo for several years. The three friends drank fermented cane juice. Magnífico and Gerineldo, contemporaries of Aureliano but more skilled in the ways of the world, drank methodically with the women seated on their laps. One of the women, withered and with goldwork on her teeth, gave Aureliano a caress that made him shudder. He rejected her. He had discovered that the more he drank the more he thought about Remedios, but he could bear the torture of his recollections better. He did not know exactly when he began to float. He saw his friends and the women sailing in a radiant glow, without weight or mass, saying words that did not come out of their mouths and making mysterious signals that did not correspond to their expressions. Catarino put a hand on his shoulder and said to him: “It’s going on eleven.” Aureliano turned his head, saw the enormous disfigured face with a felt flower behind the ear, and then he lost his memory, as during the times of forgetfulness, and he recovered it on a strange dawn and in a room that was completely foreign, where Pilar Ternera stood in her slip, barefoot, her hair down, holding a lamp over him, startled with disbelief. “Aureliano!” Aureliano checked his feet and raised his head. He did not know how he had come there, but he knew what his aim was, because he had carried it hidden since infancy in an inviolable backwater of his heart. “I’ve come to sleep with you,” he said. His clothes were smeared with mud and vomit. Pilar Ternera, who lived alone at that time with her two younger children, did not ask him any questions. She took him to the bed. She cleaned his face with a damp cloth, took of his clothes, and then got completely undressed and lowered the mosquito netting so that her children would not see them if they woke up. She had become tired of waiting for the man who would stay, of the men who left, of the countless men who missed the road to her house, confused by the uncertainty of the cards. During the wait her skin had become wrinkled, her breasts had withered, the coals of her heart had gone out. She felt for Aureliano in the darkness, put her hand on his stomach and kissed him on the neck with a maternal tenderness. “My poor child,” she murmured. Aureliano shuddered. With a calm skill, without the slightest misstep, he left his accumulated grief behind and found Remedios changed into a swamp without horizons, smelling of a raw animal and recently ironed clothes. When he came to the surface he was weeping. First they were involuntary and broken sobs. Then he emptied himself out in an unleashed flow, feeling that something swollen and painful had burst inside of him. She waited, snatching his head with the tips of her fingers, until his body got rid of the dark material that would not let him live. They Pilar Ternera asked him: “Who is it?” And Aureliano told her. She let out a laugh that in other times frightened the doves and that now did not even wake up the children. “You’ll have to raise her first,” she mocked, but underneath the mockery Aureliano found a reservoir of understanding. 
When he went out of the room, leaving behind not only his doubts about his virility but also the bitter weight that his heart had borne for so many months, Pilar Ternera made him a spontaneous promise. “I’m going to talk to the girl,” she told him, “and you’ll see what I’ll serve her on the tray.” She kept her promise. But it was a bad moment, because the house had lost its peace of former days. When she discovered Rebeca’s passion, which was impossible to keep secret because of her shouts, Amaranta suffered an attack of fever. She also suffered from the barb of a lonely love. Shut up in the bathroom, she would release herself from the torment of a hopeless passion by writing feverish letters, which she finally hid in the bottom of her trunk. Úrsula barely had the strength to take care of the two sick girls. She was unable, after prolonged and insidious interrogations, to ascertain the causes of Amaranta’s prostration. Finally, in another moment of inspiration, she forced the lock on the trunk and found the letters tied with a pink ribbon, swollen with fresh lilies and still wet with tears, addressed and never sent to Pietro Crespi. Weeping with rage, she cursed the day that it had occurred to her to buy the pianola, and she forbade the embroidery lessons and decreed a kind of mourning with no one dead which was to be prolonged until the daughters got over their hopes. Useless was the intervention of José Arcadio Buendía, who had modified his first impression of Pietro Crespi and admired his ability in the manipulation of musical machines. So that when Pilar Ternera told Aureliano that Remedios had decided on marriage, he could see that the news would only give his parents more trouble. Invited to the parlor for a formal interview, José Arcadio Buendía and Úrsula listened stonily to their son’s declaration. When he learned the name of the fiancée, however, José Arcadio Buendía grew red with indignation. “Love is a disease,” he thundered. “With so many pretty and decent girls around, the only thing that occurs to you is to get married to the daughter of our enemy.” But Úrsula agreed with the choice. She confessed her affection for the seven Moscote sisters. for their beauty, their ability for work, their modesty, and their good manners, and she celebrated her son’s prudence. Conquered by his wife’s enthusiasm, José Arcadio Buendía then laid down one condition: Rebeca, who was the one he wanted, would marry Pietro Crespi. Úrsula would take Amaranta on a trip to the capital of the province when she had time, so that contact with different people would alleviate her disappointment. Rebeca got her health back just as soon as she heard of the agreement, and she wrote her fiancé a jubilant letter that she submitted to her parents’ approval and put into the mail without the use of any intermediaries. Amaranta pretended to accept the decision and little by little she recovered from her fevers, but she promised herself that Rebeca would marry only over her dead body. The following Saturday José Arcadio Buendía put on his dark suit, his celluloid collar, and the deerskin boots that he had worn for the first time the night of the party, and went to ask for the hand of Remedios Moscote. The magistrate and his wife received him, pleased and worried at the same time, for they did not know the reason for the unexpected visit, and then they thought that he was confused about the name of the intended bride. 
In order to remove the mistake, the mother woke Remedios up and carried her into the living room, still drowsy from sleep. They asked her if it was true that she had decided to get married, and she answered, whimpering, that she only wanted them to let her sleep. José Arcadio Buendía, understanding the distress of the Moscotes, went to clear things up with Aureliano. When he returned, the Moscotes had put on formal clothing, had rearranged the furniture and put fresh flowers in the vases, and were waiting in the company of their older daughters. Overwhelmed by the unpleasantness of the occasion and the bothersome hard collar, José Arcadio Buendía confirmed the fact that Remedios, indeed, was the chosen one. “It doesn’t make sense,” Don Apolinar Moscote said with consternation. “We have six other daughters, all unmarried, and at an age where they deserve it, who would be delighted to be the honorable wife of a gentleman as serious and hard-working as your son, and Aurelito lays his eyes precisely on the one who still wets her bed.” His wife, a well-preserved woman with afflicted eyelids and expression, scolded his mistake. When they finished the fruit punch, they willingly accepted Aureliano’s decision. Except that Señora Moscote begged the favor of speaking to Úrsula alone. Intrigued, protesting that they were involving her in men’s affairs, but really feeling deep emotion, Úrsula went to visit her the next day. A half hour later she returned with the news that Remedios had not reached puberty. Aureliano did not consider that a serious barrier. He had waited so long that he could wait as long as was necessary until his bride reached the age of conception. The newfound harmony was interrupted by the death of Melquíades. Although it was a foreseeable event, the circumstances were not. A few months after his return, a process of aging had taken place in him that was so rapid and critical that soon he was treated as one of those useless great-grandfathers who wander about the bedrooms like shades, dragging their feet, remembering better times aloud, and whom no one bothers about or remembers really until the morning they find them dead in their bed. At first José Arcadio Buendía helped him in his work, enthusiastic over the novelty of the daguerreotypes and the predictions of Nostradamus. But little by little he began abandoning him to his solitude, for communication was becoming Increasingly difficult. He was losing his sight and his hearing, he seemed to confuse the people he was speaking to with others he had known in remote epochs of mankind, and he would answer questions with a complex hodgepodge of languages. He would walk along groping in the air, although he passed between objects with an inexplicable fluidity, as if be were endowed with some instinct of direction based on an immediate prescience. One day he forgot to put in his false teeth, which at night he left in a glass of water beside his bed, and he never put them in again. When Úrsula undertook the enlargement of the house, she had them build him a special room next to Aureliano’s workshop, far from the noise and bustle of the house, with a window flooded with light and a bookcase where she herself put in order the books that were almost destroyed by dust and moths, the flaky stacks of paper covered with indecipherable signs, and the glass with his false teeth, where some aquatic plants with tiny yellow flowers had taken root. 
The new place seemed to please Melquíades, because he was never seen any more, not even in the dining room, He only went to Aureliano’s workshop, where he would spend hours on end scribbling his enigmatic literature on the parchments that he had brought with him and that seemed to have been made out of some dry material that crumpled like puff paste. There he ate the meals that Visitación brought him twice a day, although in the last days he lost his appetite and fed only on vegetables. He soon acquired the forlorn look that one sees in vegetarians. His skin became covered with a thin moss, similar to that which flourished on the antique vest that he never took off, and his breath exhaled the odor of a sleeping animal. Aureliano ended up forgetting about him, absorbed in the composition of his poems, but on one occasion he thought he understood something of what Melquíades was saying in his groping monologues, and he paid attention. In reality, the only thing that could be isolated in the rocky paragraphs was the insistent hammering on the word equinox, equinox, equinox, and the name of Alexander von Humboldt. Arcadio got a little closer to him when he began to help Aureliano in his silverwork. Melquíades answered that effort at communication at times by giving forth with phrases in Spanish that had very little to do with reality. One afternoon, however, he seemed to be illuminated by a sudden emotion. Years later, facing the firing squad, Arcadio would remember the trembling with which Melquíades made him listen to several pages of his impenetrable writing, which of course he did not understand, but which when read aloud were like encyclicals being chanted. Then he smiled for the first time in a long while and said in Spanish: “When I die, burn mercury in my room for three days.” Arcadio told that to José Arcadio Buendía and the latter tried to get more explicit information, but he received only one answer: “I have found immortality.” When Melquíades’ breathing began to smell, Arcadio took him to bathe in the river on Thursday mornings. He seemed to get better. He would undress and get into the water with the boys, and his mysterious sense of orientation would allow him to avoid the deep and dangerous spots. “We come from the water,” he said on a certain occasion. Much time passed in that way without anyone’s seeing him in the house except on the night when he made a pathetic effort to fix the pianola, and when he would go to the river with Arcadio, carrying under his arm a gourd and a bar of palm oil soap wrapped in a towel. One Thursday before they called him to go to the river, Aureliano heard him say: “I have died of fever on the dunes of Singapore.” That day he went into the water at a bad spot and they did not find him until the following day, a few miles downstream, washed up on a bright bend in the river and with a solitary vulture sitting on his stomach. Over the scandalized protests of Úrsula, who wept with more grief than she had had for her own father, José Arcadio Buendía was opposed to their burying him. “He is immortal,” he said, “and he himself revealed the formula of his resurrection.” He brought out the forgotten water pipe and put a kettle of mercury to boil next to the body, which little by little was filling with blue bubbles. Don Apolinar Moscote ventured to remind him that an unburied drowned man was a danger to public health. 
“None of that, because he’s alive,” was the answer of José Arcadio Buendía, who finished the seventy-two hours with the mercurial incense as the body was already beginning to burst with a livid fluorescence, the soft whistles of which impregnated the house with a pestilential vapor. Only then did he permit them to bury him, not in any ordinary way, but with the honors reserved for Macondo’s greatest benefactor. It was the first burial and the bestattended one that was ever seen in the town, only surpassed, a century later, by Big Mama’s funeral carnival. They buried him in a grave dug in the center of the plot destined for the cemetery, with a stone on which they wrote the only thing they knew about him: MELQUÍADES. They gave him his nine nights of wake. In the tumult that gathered in the courtyard to drink coffee, tell jokes, and play cards. Amaranta found a chance to confess her love to Pietro Crespi, who a few weeks before had formalized his promise to Rebeca and had set up a store for musical instruments and mechanical toys in the same section where the Arabs had lingered in other times swapping knickknacks for macaws, and which the people called the Street of the Turks. The Italian, whose head covered with patent leather curls aroused in women an irrepressible need to sigh, dealt with Amaranta as with a capricious little girl who was not worth taking seriously. “I have a younger brother,” he told her. “He’s coming to help me in the store.” Amaranta felt humiliated and told Pietro Crespi with a virulent anger that she was prepared to stop her sister’s wedding even if her own dead body had to lie across the door. The Italian was so impressed by the dramatics of the threat that he could not resist the temptation to mention it to Rebeca. That was how Amaranta’s trip, always put off by Úrsula’s work, was arranged in less than a week. Amaranta put up no resistance, but when she kissed Rebeca good-bye she whispered in her ear: “Don’t get your hopes up. Even if they send me to the ends of the earth I’ll find some way of stopping you from getting married, even if I have to kill you.” With the absence of Úrsula, with the invisible presence of Melquíades, who continued his stealthy shuffling through the rooms, the house seemed enormous and empty. Rebeca took charge of domestic order, while the Indian woman took care of the bakery. At dusk, when Pietro Crespi would arrive, preceded by a cool breath of lavender and always bringing a toy as a gift, his fiancée would receive the visitor in the main parlor with doors and windows open to be safe from any suspicion. It was an unnecessary precaution, for the Italian had shown himself to be so respectful that he did not even touch the hand of the woman who was going to be his wife within the year. Those visits were filling the house with remarkable toys. Mechanical ballerinas, music boxes, acrobatic monkeys, trotting horses, clowns who played the tambourine: the rich and startling mechanical fauna that Pietro Crespi brought dissipated José Arcadio Buendía’s affliction over the death of Melquíades and carried him back to his old days as an alchemist. He lived at that time in a paradise of disemboweled animals, of mechanisms that had been taken apart in an attempt to perfect them with a system of perpetual motion based upon the principles of the pendulum. Aureliano, for his part, had neglected the workshop in order to teach little Remedios to read and write. 
At first the child preferred her dolls to the man who would come every afternoon and who was responsible for her being separated from her toys in order to be bathed and dressed and seated in the parlor to receive the visitor. But Aureliano’s patience and devotion finally won her over, up to the point where she would spend many hours with him studying the meaning of the letters and sketching in a notebook with colored pencils little houses with cows in the corral and round suns with yellow rays that hid behind the hills. Only Rebeca was unhappy, because of Amaranta’s threat. She knew her sister’s character, the haughtiness of her spirit, and she was frightened by the virulence of her anger. She would spend whole hours sucking her finger in the bathroom, holding herself back with an exhausting iron will so as not to eat earth. In search of some relief for her uncertainty, she called Pilar Ternera to read her future. After a string of conventional vagaries, Pilar Ternera predicted: “You will not be happy as long as your parents remain unburied.” Rebeca shuddered. As in the memory of a dream she saw herself entering the house as a very small girl, with the trunk and the little rocker, and a bag whose contents she had never known. She remembered a bald gentleman dressed in linen and with his collar closed by a gold button, who had nothing to do with the king of hearts. She remembered a very young and beautiful woman with warm and perfumed hands, who had nothing in common with the jack of diamonds and his rheumatic hands, and who used to put flowers in her hair and take her out walking in the afternoon through a town with green streets. “I don’t understand,” she said. Pilar Ternera seemed disconcerted: “I don’t either, but that’s what the cards say.” Rebeca was so preoccupied with the enigma that she told it to José Arcadio Buendía, and he scolded her for believing in the predictions of the cards, but he undertook the silent task of searching closets and trunks, moving furniture and turning over beds and floorboards looking for the bag of bones. He remembered that he had not seen it since the time of the rebuilding. He secretly summoned the masons and one of them revealed that he had walled up the bag in some bedroom because it bothered him in his work. After several days of listening, with their ears against the walls, they perceived the deep cloc-cloc. They penetrated the wall and there were the bones in the intact bag. They buried it the same day in a grave without a stone next to that of Melquíades, and José Arcadio Buendía returned home free of a burden that for a moment had weighed on his conscience as much as the memory of Prudencio Aguilar. When he went through the kitchen he kissed Rebeca on the forehead. “Get those bad thoughts out of your head,” he told her. “You’re going to be happy.” The friendship with Rebeca opened up to Pilar Ternera the doors of the house, closed by Úrsula since the birth of Arcadio. She would arrive at any hour of the day, like a flock of goats, and would unleash her feverish energy in the hardest tasks. Sometimes she would go into the workshop and help Arcadio sensitize the daguerreotype plates with an efficiency and a tenderness that ended up by confusing him. That woman bothered him. The tan of her skin, her smell of smoke, the disorder of her laughter in the darkroom distracted his attention and made him bump into things. 
On a certain occasion Aureliano was there working on his silver, and Pilar Ternera leaned over the table to admire his laborious patience. Suddenly it happened. Aureliano made sure that Arcadio was in the darkroom before raising his eyes and meeting those of Pilar Ternera, whose thought was perfectly visible, as if exposed to the light of noon. “Well,” Aureliano said. “Tell me what it is.” Pilar Ternera bit her lips with a sad smile. “That you’d be good in a war,” she said. “Where you put your eye, you put your bullet.” Aureliano relaxed with the proof of the omen. He went back to concentrate on his work as if nothing had happened, and his voice took on a restful strength. “I will recognize him,” he said. “He’ll bear my name.” José Arcadio Buendía finally got what he was looking for: he connected the mechanism of the clock to a mechanical ballerina, and the toy danced uninterruptedly to the rhythm of her own music for three days. That discovery excited him much more than any of his other harebrained undertakings. He stopped eating. He stopped sleeping. Only the vigilance and care of Rebeca kept him from being dragged off by his imagination into a state of perpetual delirium from which he would not recover. He would spend the nights walking around the room thinking aloud, searching for a way to apply the principles of the pendulum to oxcarts, to harrows, to everything that was useful when put into motion. The fever of insomnia fatigued him so much that one dawn he could not recognize the old man with white hair and uncertain gestures who came into his bedroom. It was Prudencio Aguilar. When he finally identified him, startled that the dead also aged, José Arcadio Buendía felt himself shaken by nostalgia. “Prudencio,” he exclaimed. “You’ve come from a long way off!” After many years of death the yearning for the living was so intense, the need for company so pressing, so terrifying the neatness of that other death which exists within death, that Prudencio Aguilar had ended up loving his worst enemy. He had spent a great deal of time looking for him. He asked the dead from Riohacha about him, the dead who came from the Upar Valley, those who came from the swamp, and no one could tell him because Macondo was a town that was unknown to the dead until Melquíades arrived and marked it with a small black dot on the motley maps of death. José Arcadio Buendía conversed with Prudencio Aguilar until dawn. A few hours later, worn out by the vigil, he went into Aureliano’s workshop and asked him: “What day is today?” Aureliano told him that it was Tuesday. “I was thinking the same thing,” José Arcadio Buendía said, “but suddenly I realized that it’s still Monday, like yesterday. Look at the sky, look at the walls, look at the begonias. Today is Monday too.” Used to his manias, Aureliano paid no attention to him. On the next day, Wednesday, José Arcadio Buendía went back to the workshop. “This is a disaster,” he said. “Look at the air, listen to the buzzing of the sun, the same as yesterday and the day before. Today is Monday too.” That night Pietro Crespi found him on the porch, weeping for Prudencio Aguilar, for Melquíades, for Rebeca’s parents, for his mother and father, for all of those he could remember and who were now alone in death. He gave him a mechanical bear that walked on its hind legs on a tightrope, but he could not distract him from his obsession. 
He asked him what had happened to the project he had explained to him a few days before about the possibility of building a pendulum machine that would help men to fly and he answered that it was impossible because a pendulum could lift anything into the air but it could not lift itself. On Thursday he appeared in the workshop again with the painful look of plowed ground. “The time machine has broken,” he almost sobbed, “and Úrsula and Amaranta so far away!” Aureliano scolded him like a child and he adopted a contrite air. He spent six hours examining things, trying to find a difference from their appearance on the previous day in the hope of discovering in them some change that would reveal the passage of time. He spent the whole night in bed with his eyes open, calling to Prudencio Aguilar, to Melquíades, to all the dead, so that they would share his distress. But no one came. On Friday, before anyone arose, he watched the appearance of nature again until he did not have the slightest doubt but that it was Monday. Then he grabbed the bar from a door and with the savage violence of his uncommon strength he smashed to dust the equipment in the alchemy laboratory, the daguerreotype room, the silver workshop, shouting like a man possessed in some high-sounding and fluent but completely incomprehensible language. He was about to finish off the rest of the house when Aureliano asked the neighbors for help. Ten men were needed to get him down, fourteen to tie him up, twenty to drag him to the chestnut tree in the courtyard, where they left him tied up, barking in the strange language and giving off a green froth at the mouth. When Úrsula and Amaranta returned he was still tied to the trunk of the chestnut tree by his hands and feet, soaked with rain and in a state of total innocence. They spoke to him and he looked at them without recognizing them, saying things they did not understand. Úrsula untied his wrists and ankles, lacerated by the pressure of the rope, and left him tied only by the waist. Later on they built him a shelter of palm branches to protect him from the sun and the rain. -+""" -+ -+ input_ids = tokenizer.encode(prompt, return_tensors="pt") -+ # print("old input_ids.shape:"+ str(input_ids.shape)) -+ -+ # Truncate the input to input_len tokens -+ input_ids = input_ids[:, :input_len] -+ # print("latest input_ids.shape:"+ str(input_ids.shape)) -+ -+ # Decode the truncated prompt back to a string -+ true_str = tokenizer.batch_decode(input_ids)[0] -+ prompt = true_str -+ -+ for i in range(num_prompts): -+ # prompt = tokenizer.decode(prefix_token_ids + -+ # [(offsets[i] + i + j) % tokenizer.vocab_size -+ # for j in range(input_lens[i])]) -+ -+ input_requests.append((prompt, int(prefix_len + input_lens[i]), -+ int(output_lens[i]), None)) -+ -+ return input_requests -+ -+ -+async def get_request( -+ input_requests: List[Tuple[str, int, int]], -+ request_rate: float, -+ burstiness: float = 1.0, -+) -> AsyncGenerator[Tuple[str, int, int], None]: -+ """ -+ Asynchronously generates requests at a specified rate -+ with OPTIONAL burstiness. -+ -+ Args: -+ input_requests: -+ A list of input requests, each represented as a tuple. -+ request_rate: -+ The rate at which requests are generated (requests/s). -+ burstiness (optional): -+ The burstiness factor of the request generation. -+ Only takes effect when request_rate is not inf. -+ Default value is 1, which follows a Poisson process. -+ Otherwise, the request intervals follow a gamma distribution. 
-+ A lower burstiness value (0 < burstiness < 1) results -+ in more bursty requests, while a higher burstiness value -+ (burstiness > 1) results in a more uniform arrival of requests. -+ """ -+ input_requests = iter(input_requests) -+ -+ # Calculate scale parameter theta to maintain the desired request_rate. -+ assert burstiness > 0, ( -+ f"A positive burstiness factor is expected, but given {burstiness}.") -+ theta = 1.0 / (request_rate * burstiness) -+ -+ for request in input_requests: -+ yield request -+ -+ if request_rate == float("inf"): -+ # If the request rate is infinity, then we don't need to wait. -+ continue -+ -+ # Sample the request interval from the gamma distribution. -+ # If burstiness is 1, it follows exponential distribution. -+ interval = np.random.gamma(shape=burstiness, scale=theta) -+ # The next request will be sent after the interval. -+ await asyncio.sleep(interval) -+ -+ -+def calculate_metrics( -+ input_requests: List[Tuple[str, int, int]], -+ outputs: List[RequestFuncOutput], -+ dur_s: float, -+ tokenizer: PreTrainedTokenizerBase, -+ selected_percentile_metrics: List[str], -+ selected_percentiles: List[float], -+ gootput_config_dict: Dict[str, float], -+) -> Tuple[BenchmarkMetrics, List[int]]: -+ actual_output_lens: List[int] = [] -+ total_input = 0 -+ completed = 0 -+ good_completed = 0 -+ itls: List[float] = [] -+ tpots: List[float] = [] -+ all_tpots: List[float] = [] -+ ttfts: List[float] = [] -+ e2els: List[float] = [] -+ for i in range(len(outputs)): -+ if outputs[i].success: -+ # We use the tokenizer to count the number of output tokens for all -+ # serving backends instead of looking at len(outputs[i].itl) since -+ # multiple output tokens may be bundled together -+ # Note : this may inflate the output token count slightly -+ output_len = len( -+ tokenizer(outputs[i].generated_text, -+ add_special_tokens=False).input_ids) -+ actual_output_lens.append(output_len) -+ total_input += input_requests[i][1] -+ tpot = 0 -+ if output_len > 1: -+ tpot = (outputs[i].latency - outputs[i].ttft) / (output_len - -+ 1) -+ tpots.append(tpot) -+ # Note: if output_len <= 1, we regard tpot as 0 for goodput -+ all_tpots.append(tpot) -+ itls += outputs[i].itl -+ ttfts.append(outputs[i].ttft) -+ e2els.append(outputs[i].latency) -+ completed += 1 -+ else: -+ actual_output_lens.append(0) -+ -+ if gootput_config_dict: -+ valid_metrics = [] -+ slo_values = [] -+ -+ if "ttft" in gootput_config_dict: -+ valid_metrics.append(ttfts) -+ slo_values.append(gootput_config_dict["ttft"] / -+ MILLISECONDS_TO_SECONDS_CONVERSION) -+ if "tpot" in gootput_config_dict: -+ valid_metrics.append(all_tpots) -+ slo_values.append(gootput_config_dict["tpot"] / -+ MILLISECONDS_TO_SECONDS_CONVERSION) -+ if "e2el" in gootput_config_dict: -+ valid_metrics.append(e2els) -+ slo_values.append(gootput_config_dict["e2el"] / -+ MILLISECONDS_TO_SECONDS_CONVERSION) -+ -+ for req_metric in zip(*valid_metrics): -+ is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)]) -+ if is_good_req: -+ good_completed += 1 -+ -+ if completed == 0: -+ warnings.warn( -+ "All requests failed. 
This is likely due to a misconfiguration " -+ "on the benchmark arguments.", -+ stacklevel=2) -+ metrics = BenchmarkMetrics( -+ completed=completed, -+ total_input=total_input, -+ total_output=sum(actual_output_lens), -+ request_throughput=completed / dur_s, -+ request_goodput=good_completed / dur_s, -+ output_throughput=sum(actual_output_lens) / dur_s, -+ total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, -+ mean_ttft_ms=np.mean(ttfts or 0) * -+ 1000, # ttfts is empty if streaming is not supported by backend -+ std_ttft_ms=np.std(ttfts or 0) * 1000, -+ median_ttft_ms=np.median(ttfts or 0) * 1000, -+ percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) -+ for p in selected_percentiles], -+ mean_tpot_ms=np.mean(tpots or 0) * 1000, -+ std_tpot_ms=np.std(tpots or 0) * 1000, -+ median_tpot_ms=np.median(tpots or 0) * 1000, -+ percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) -+ for p in selected_percentiles], -+ mean_itl_ms=np.mean(itls or 0) * 1000, -+ std_itl_ms=np.std(itls or 0) * 1000, -+ median_itl_ms=np.median(itls or 0) * 1000, -+ percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) -+ for p in selected_percentiles], -+ mean_e2el_ms=np.mean(e2els or 0) * 1000, -+ std_e2el_ms=np.std(e2els or 0) * 1000, -+ median_e2el_ms=np.median(e2els or 0) * 1000, -+ percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) -+ for p in selected_percentiles], -+ ) -+ -+ return metrics, actual_output_lens -+ -+ -+async def benchmark( -+ backend: str, -+ api_url: str, -+ base_url: str, -+ model_id: str, -+ tokenizer: PreTrainedTokenizerBase, -+ input_requests: List[Tuple[str, int, int]], -+ logprobs: Optional[int], -+ best_of: int, -+ request_rate: float, -+ burstiness: float, -+ disable_tqdm: bool, -+ profile: bool, -+ selected_percentile_metrics: List[str], -+ selected_percentiles: List[str], -+ ignore_eos: bool, -+ gootput_config_dict: Dict[str, float], -+ max_concurrency: Optional[int], -+): -+ if backend in ASYNC_REQUEST_FUNCS: -+ request_func = ASYNC_REQUEST_FUNCS[backend] -+ else: -+ raise ValueError(f"Unknown backend: {backend}") -+ -+ print("Starting initial single prompt test run...") -+ test_prompt, test_prompt_len, test_output_len, test_mm_content = ( -+ input_requests[0]) -+ if backend != "openai-chat" and test_mm_content is not None: -+ # multi-modal benchmark is only available on OpenAI Chat backend. -+ raise ValueError( -+ "Multi-modal content is only supported on 'openai-chat' backend.") -+ test_input = RequestFuncInput( -+ model=model_id, -+ prompt=test_prompt, -+ api_url=api_url, -+ prompt_len=test_prompt_len, -+ output_len=test_output_len, -+ logprobs=logprobs, -+ best_of=best_of, -+ multi_modal_content=test_mm_content, -+ ignore_eos=ignore_eos, -+ ) -+ test_output = await request_func(request_func_input=test_input) -+ if not test_output.success: -+ raise ValueError( -+ "Initial test run failed - Please make sure benchmark arguments " -+ f"are correctly specified. Error: {test_output.error}") -+ else: -+ print("Initial test run completed. 
Starting main benchmark run...") -+ -+ if profile: -+ print("Starting profiler...") -+ profile_input = RequestFuncInput(model=model_id, -+ prompt=test_prompt, -+ api_url=base_url + "/start_profile", -+ prompt_len=test_prompt_len, -+ output_len=test_output_len, -+ logprobs=logprobs, -+ best_of=best_of, -+ multi_modal_content=test_mm_content, -+ ignore_eos=ignore_eos) -+ profile_output = await request_func(request_func_input=profile_input) -+ if profile_output.success: -+ print("Profiler started") -+ -+ if burstiness == 1.0: -+ distribution = "Poisson process" -+ else: -+ distribution = "Gamma distribution" -+ -+ print(f"Traffic request rate: {request_rate}") -+ print(f"Burstiness factor: {burstiness} ({distribution})") -+ print(f"Maximum request concurrency: {max_concurrency}") -+ -+ pbar = None if disable_tqdm else tqdm(total=len(input_requests)) -+ -+ # This can be used once the minimum Python version is 3.10 or higher, -+ # and it will simplify the code in limited_request_func. -+ # semaphore = (asyncio.Semaphore(max_concurrency) -+ # if max_concurrency else contextlib.nullcontext()) -+ semaphore = (asyncio.Semaphore(max_concurrency) -+ if max_concurrency else None) -+ -+ async def limited_request_func(request_func_input, pbar): -+ if semaphore is None: -+ return await request_func(request_func_input=request_func_input, -+ pbar=pbar) -+ async with semaphore: -+ return await request_func(request_func_input=request_func_input, -+ pbar=pbar) -+ -+ benchmark_start_time = time.perf_counter() -+ tasks: List[asyncio.Task] = [] -+ async for request in get_request(input_requests, request_rate, burstiness): -+ prompt, prompt_len, output_len, mm_content = request -+ request_func_input = RequestFuncInput(model=model_id, -+ prompt=prompt, -+ api_url=api_url, -+ prompt_len=prompt_len, -+ output_len=output_len, -+ logprobs=logprobs, -+ best_of=best_of, -+ multi_modal_content=mm_content, -+ ignore_eos=ignore_eos) -+ tasks.append( -+ asyncio.create_task( -+ limited_request_func(request_func_input=request_func_input, -+ pbar=pbar))) -+ outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) -+ -+ if profile: -+ print("Stopping profiler...") -+ profile_input = RequestFuncInput( -+ model=model_id, -+ prompt=test_prompt, -+ api_url=base_url + "/stop_profile", -+ prompt_len=test_prompt_len, -+ output_len=test_output_len, -+ logprobs=logprobs, -+ best_of=best_of, -+ ) -+ profile_output = await request_func(request_func_input=profile_input) -+ if profile_output.success: -+ print("Profiler stopped") -+ -+ if pbar is not None: -+ pbar.close() -+ -+ benchmark_duration = time.perf_counter() - benchmark_start_time -+ -+ metrics, actual_output_lens = calculate_metrics( -+ input_requests=input_requests, -+ outputs=outputs, -+ dur_s=benchmark_duration, -+ tokenizer=tokenizer, -+ selected_percentile_metrics=selected_percentile_metrics, -+ selected_percentiles=selected_percentiles, -+ gootput_config_dict=gootput_config_dict, -+ ) -+ -+ print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) -+ print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) -+ print("{:<40} {:<10.2f}".format("Benchmark duration (s):", -+ benchmark_duration)) -+ print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) -+ print("{:<40} {:<10}".format("Total generated tokens:", -+ metrics.total_output)) -+ print("{:<40} {:<10.2f}".format("Request throughput (req/s):", -+ metrics.request_throughput)) -+ if gootput_config_dict: -+ print("{:<40} {:<10.2f}".format("Request goodput (req/s):", -+ 
metrics.request_goodput)) -+ print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", -+ metrics.output_throughput)) -+ print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", -+ metrics.total_token_throughput)) -+ -+ result = { -+ "duration": benchmark_duration, -+ "completed": metrics.completed, -+ "total_input_tokens": metrics.total_input, -+ "total_output_tokens": metrics.total_output, -+ "request_throughput": metrics.request_throughput, -+ "request_goodput:": -+ metrics.request_goodput if gootput_config_dict else None, -+ "output_throughput": metrics.output_throughput, -+ "total_token_throughput": metrics.total_token_throughput, -+ "input_lens": [output.prompt_len for output in outputs], -+ "output_lens": actual_output_lens, -+ "ttfts": [output.ttft for output in outputs], -+ "itls": [output.itl for output in outputs], -+ "generated_texts": [output.generated_text for output in outputs], -+ "errors": [output.error for output in outputs], -+ } -+ -+ def process_one_metric( -+ # E.g., "ttft" -+ metric_attribute_name: str, -+ # E.g., "TTFT" -+ metric_name: str, -+ # E.g., "Time to First Token" -+ metric_header: str, -+ ): -+ # This function prints and adds statistics of the specified -+ # metric. -+ if metric_attribute_name not in selected_percentile_metrics: -+ return -+ print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-')) -+ print("{:<40} {:<10.2f}".format( -+ f"Mean {metric_name} (ms):", -+ getattr(metrics, f"mean_{metric_attribute_name}_ms"))) -+ print("{:<40} {:<10.2f}".format( -+ f"Median {metric_name} (ms):", -+ getattr(metrics, f"median_{metric_attribute_name}_ms"))) -+ result[f"mean_{metric_attribute_name}_ms"] = getattr( -+ metrics, f"mean_{metric_attribute_name}_ms") -+ result[f"median_{metric_attribute_name}_ms"] = getattr( -+ metrics, f"median_{metric_attribute_name}_ms") -+ result[f"std_{metric_attribute_name}_ms"] = getattr( -+ metrics, f"std_{metric_attribute_name}_ms") -+ for p, value in getattr(metrics, -+ f"percentiles_{metric_attribute_name}_ms"): -+ p_word = str(int(p)) if int(p) == p else str(p) -+ print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", -+ value)) -+ result[f"p{p_word}_{metric_attribute_name}_ms"] = value -+ -+ process_one_metric("ttft", "TTFT", "Time to First Token") -+ process_one_metric("tpot", "TPOT", -+ "Time per Output Token (excl. 1st token)") -+ process_one_metric("itl", "ITL", "Inter-token Latency") -+ process_one_metric("e2el", "E2EL", "End-to-end Latency") -+ -+ print("=" * 50) -+ -+ return result -+ -+ -+def check_goodput_args(args): -+ # Check and parse goodput arguments -+ gootput_config_dict = {} -+ VALID_NAMES = ["ttft", "tpot", "e2el"] -+ if args.goodput: -+ gootput_config_dict = parse_goodput(args.goodput) -+ for slo_name, slo_val in gootput_config_dict.items(): -+ if slo_name not in VALID_NAMES: -+ raise ValueError( -+ f"Invalid metric name found, {slo_name}: {slo_val}. " -+ "The service level objective name should be one of " -+ f"{str(VALID_NAMES)}. ") -+ if slo_val < 0: -+ raise ValueError( -+ f"Invalid value found, {slo_name}: {slo_val}. " -+ "The service level objective value should be " -+ "non-negative.") -+ return gootput_config_dict -+ -+ -+def parse_goodput(slo_pairs): -+ gootput_config_dict = {} -+ try: -+ for slo_pair in slo_pairs: -+ slo_name, slo_val = slo_pair.split(":") -+ gootput_config_dict[slo_name] = float(slo_val) -+ except ValueError as err: -+ raise argparse.ArgumentTypeError( -+ "Invalid format found for service level objectives. 
" -+ "Specify service level objectives for goodput as \"KEY:VALUE\" " -+ "pairs, where the key is a metric name, and the value is a " -+ "number in milliseconds.") from err -+ return gootput_config_dict -+ -+ -+def main(args: argparse.Namespace): -+ print(args) -+ random.seed(args.seed) -+ np.random.seed(args.seed) -+ -+ backend = args.backend -+ # model_id = args.model -+ model_id = args.model.split('/')[-1] -+ tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model -+ tokenizer_mode = args.tokenizer_mode -+ -+ if args.base_url is not None: -+ api_url = f"{args.base_url}{args.endpoint}" -+ base_url = f"{args.base_url}" -+ else: -+ api_url = f"http://{args.host}:{args.port}{args.endpoint}" -+ base_url = f"http://{args.host}:{args.port}" -+ -+ tokenizer = get_tokenizer(tokenizer_id, -+ tokenizer_mode=tokenizer_mode, -+ trust_remote_code=args.trust_remote_code) -+ -+ if args.dataset is not None: -+ warnings.warn( -+ "The '--dataset' argument will be deprecated in the next " -+ "release. Please use '--dataset-name' and " -+ "'--dataset-path' in the future runs.", -+ stacklevel=2) -+ input_requests = sample_sharegpt_requests( -+ dataset_path=args.dataset, -+ num_requests=args.num_prompts, -+ tokenizer=tokenizer, -+ fixed_output_len=args.sharegpt_output_len, -+ ) -+ -+ elif args.dataset_name == "sharegpt": -+ input_requests = sample_sharegpt_requests( -+ dataset_path=args.dataset_path, -+ num_requests=args.num_prompts, -+ tokenizer=tokenizer, -+ fixed_output_len=args.sharegpt_output_len, -+ ) -+ -+ elif args.dataset_name == "sonnet": -+ # Do not format the prompt, pass to message directly -+ if args.backend == "openai-chat": -+ input_requests = sample_sonnet_requests( -+ dataset_path=args.dataset_path, -+ num_requests=args.num_prompts, -+ input_len=args.sonnet_input_len, -+ output_len=args.sonnet_output_len, -+ prefix_len=args.sonnet_prefix_len, -+ tokenizer=tokenizer, -+ ) -+ input_requests = [(prompt, prompt_len, output_len, None) -+ for prompt, prompt_formatted, prompt_len, -+ output_len, _ in input_requests] -+ else: -+ assert ( -+ tokenizer.chat_template or tokenizer.default_chat_template -+ ), "Tokenizer/model must have chat template for sonnet dataset." 
-+ input_requests = sample_sonnet_requests( -+ dataset_path=args.dataset_path, -+ num_requests=args.num_prompts, -+ input_len=args.sonnet_input_len, -+ output_len=args.sonnet_output_len, -+ prefix_len=args.sonnet_prefix_len, -+ tokenizer=tokenizer, -+ ) -+ input_requests = [(prompt_formatted, prompt_len, output_len, None) -+ for prompt, prompt_formatted, prompt_len, -+ output_len, _ in input_requests] -+ -+ elif args.dataset_name == "hf": -+ input_requests = sample_hf_requests( -+ dataset_path=args.dataset_path, -+ dataset_subset=args.hf_subset, -+ dataset_split=args.hf_split, -+ num_requests=args.num_prompts, -+ tokenizer=tokenizer, -+ random_seed=args.seed, -+ fixed_output_len=args.hf_output_len, -+ ) -+ -+ elif args.dataset_name == "random": -+ input_requests = sample_random_requests( -+ prefix_len=args.random_prefix_len, -+ input_len=args.random_input_len, -+ output_len=args.random_output_len, -+ num_prompts=args.num_prompts, -+ range_ratio=args.random_range_ratio, -+ tokenizer=tokenizer, -+ ) -+ -+ else: -+ raise ValueError(f"Unknown dataset: {args.dataset_name}") -+ -+ gootput_config_dict = check_goodput_args(args) -+ -+ benchmark_result = asyncio.run( -+ benchmark( -+ backend=backend, -+ api_url=api_url, -+ base_url=base_url, -+ model_id=model_id, -+ tokenizer=tokenizer, -+ input_requests=input_requests, -+ logprobs=args.logprobs, -+ best_of=args.best_of, -+ request_rate=args.request_rate, -+ burstiness=args.burstiness, -+ disable_tqdm=args.disable_tqdm, -+ profile=args.profile, -+ selected_percentile_metrics=args.percentile_metrics.split(","), -+ selected_percentiles=[ -+ float(p) for p in args.metric_percentiles.split(",") -+ ], -+ ignore_eos=args.ignore_eos, -+ gootput_config_dict=gootput_config_dict, -+ max_concurrency=args.max_concurrency, -+ )) -+ -+ # Save config and results to json -+ if args.save_result: -+ result_json: Dict[str, Any] = {} -+ -+ # Setup -+ current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") -+ result_json["date"] = current_dt -+ result_json["backend"] = backend -+ result_json["model_id"] = model_id -+ result_json["tokenizer_id"] = tokenizer_id -+ result_json["best_of"] = args.best_of -+ result_json["num_prompts"] = args.num_prompts -+ -+ # Metadata -+ if args.metadata: -+ for item in args.metadata: -+ if "=" in item: -+ kvstring = item.split("=") -+ result_json[kvstring[0].strip()] = kvstring[1].strip() -+ else: -+ raise ValueError( -+ "Invalid metadata format. Please use KEY=VALUE format." 
-+ ) -+ -+ # Traffic -+ result_json["request_rate"] = ( -+ args.request_rate if args.request_rate < float("inf") else "inf") -+ result_json["burstiness"] = args.burstiness -+ result_json["max_concurrency"] = args.max_concurrency -+ -+ # Merge with benchmark result -+ result_json = {**result_json, **benchmark_result} -+ -+ # Save to file -+ base_model_id = model_id.split("/")[-1] -+ max_concurrency_str = (f"-concurrency{args.max_concurrency}" -+ if args.max_concurrency is not None else "") -+ file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" #noqa -+ if args.result_filename: -+ file_name = args.result_filename -+ if args.result_dir: -+ file_name = os.path.join(args.result_dir, file_name) -+ with open(file_name, "w", encoding='utf-8') as outfile: -+ json.dump(result_json, outfile) -+ -+ -+if __name__ == "__main__": -+ parser = FlexibleArgumentParser( -+ description="Benchmark the online serving throughput.") -+ parser.add_argument( -+ "--backend", -+ type=str, -+ default="vllm", -+ choices=list(ASYNC_REQUEST_FUNCS.keys()), -+ ) -+ parser.add_argument( -+ "--base-url", -+ type=str, -+ default=None, -+ help="Server or API base url if not using http host and port.", -+ ) -+ parser.add_argument("--host", type=str, default="localhost") -+ parser.add_argument("--port", type=int, default=8000) -+ parser.add_argument( -+ "--endpoint", -+ type=str, -+ default="/v1/completions", -+ help="API endpoint.", -+ ) -+ parser.add_argument( -+ "--dataset", -+ type=str, -+ default=None, -+ help="Path to the ShareGPT dataset, will be deprecated in the " -+ "next release.", -+ ) -+ parser.add_argument( -+ "--dataset-name", -+ type=str, -+ default="sharegpt", -+ choices=["sharegpt", "sonnet", "random", "hf"], -+ help="Name of the dataset to benchmark on.", -+ ) -+ parser.add_argument("--dataset-path", -+ type=str, -+ default=None, -+ help="Path to the sharegpt/sonnet dataset. " -+ "Or the huggingface dataset ID if using HF dataset.") -+ parser.add_argument( -+ "--max-concurrency", -+ type=int, -+ default=None, -+ help="Maximum number of concurrent requests. This can be used " -+ "to help simulate an environment where a higher level component " -+ "is enforcing a maximum number of concurrent requests. While the " -+ "--request-rate argument controls the rate at which requests are " -+ "initiated, this argument will control how many are actually allowed " -+ "to execute at a time. This means that when used in combination, the " -+ "actual request rate may be lower than specified with --request-rate, " -+ "if the server is not processing requests fast enough to keep up.") -+ -+ parser.add_argument( -+ "--model", -+ type=str, -+ required=True, -+ help="Name of the model.", -+ ) -+ parser.add_argument( -+ "--tokenizer", -+ type=str, -+ help= -+ "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 -+ ) -+ parser.add_argument( -+ "--best-of", -+ type=int, -+ default=1, -+ help="Generates `best_of` sequences per prompt and " -+ "returns the best one.", -+ ) -+ parser.add_argument("--use-beam-search", action="store_true") -+ parser.add_argument( -+ "--num-prompts", -+ type=int, -+ default=1000, -+ help="Number of prompts to process.", -+ ) -+ parser.add_argument( -+ "--logprobs", -+ type=int, -+ default=None, -+ help=("Number of logprobs-per-token to compute & return as part of " -+ "the request. 
If unspecified, then either (1) if beam search " -+ "is disabled, no logprobs are computed & a single dummy " -+ "logprob is returned for each token; or (2) if beam search " -+ "is enabled 1 logprob per token is computed"), -+ ) -+ parser.add_argument( -+ "--request-rate", -+ type=float, -+ default=float("inf"), -+ help="Number of requests per second. If this is inf, " -+ "then all the requests are sent at time 0. " -+ "Otherwise, we use Poisson process or gamma distribution " -+ "to synthesize the request arrival times.", -+ ) -+ parser.add_argument( -+ "--burstiness", -+ type=float, -+ default=1.0, -+ help="Burstiness factor of the request generation. " -+ "Only take effect when request_rate is not inf. " -+ "Default value is 1, which follows Poisson process. " -+ "Otherwise, the request intervals follow a gamma distribution. " -+ "A lower burstiness value (0 < burstiness < 1) results in more " -+ "bursty requests. A higher burstiness value (burstiness > 1) " -+ "results in a more uniform arrival of requests.", -+ ) -+ parser.add_argument("--seed", type=int, default=0) -+ parser.add_argument( -+ "--trust-remote-code", -+ action="store_true", -+ help="Trust remote code from huggingface", -+ ) -+ parser.add_argument( -+ "--disable-tqdm", -+ action="store_true", -+ help="Specify to disable tqdm progress bar.", -+ ) -+ parser.add_argument( -+ "--profile", -+ action="store_true", -+ help="Use Torch Profiler. The endpoint must be launched with " -+ "VLLM_TORCH_PROFILER_DIR to enable profiler.", -+ ) -+ parser.add_argument( -+ "--save-result", -+ action="store_true", -+ help="Specify to save benchmark results to a json file", -+ ) -+ parser.add_argument( -+ "--metadata", -+ metavar="KEY=VALUE", -+ nargs="*", -+ help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) " -+ "for metadata of this run to be saved in the result JSON file " -+ "for record keeping purposes.", -+ ) -+ parser.add_argument( -+ "--result-dir", -+ type=str, -+ default=None, -+ help="Specify directory to save benchmark json results." -+ "If not specified, results are saved in the current directory.", -+ ) -+ parser.add_argument( -+ "--result-filename", -+ type=str, -+ default=None, -+ help="Specify the filename to save benchmark json results." -+ "If not specified, results will be saved in " -+ "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" -+ " format.", -+ ) -+ parser.add_argument( -+ "--ignore-eos", -+ action="store_true", -+ help="Set ignore_eos flag when sending the benchmark request." -+ "Warning: ignore_eos is not supported in deepspeed_mii and tgi.") -+ parser.add_argument( -+ "--percentile-metrics", -+ type=str, -+ default="ttft,tpot,itl", -+ help="Comma-seperated list of selected metrics to report percentils. " -+ "This argument specifies the metrics to report percentiles. " -+ "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". " -+ "Default value is \"ttft,tpot,itl\".") -+ parser.add_argument( -+ "--metric-percentiles", -+ type=str, -+ default="99", -+ help="Comma-seperated list of percentiles for selected metrics. " -+ "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " -+ "Default value is \"99\". " -+ "Use \"--percentile-metrics\" to select metrics.", -+ ) -+ parser.add_argument( -+ "--goodput", -+ nargs="+", -+ required=False, -+ help="Specify service level objectives for goodput as \"KEY:VALUE\" " -+ "pairs, where the key is a metric name, and the value is in " -+ "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, " -+ "separated by spaces. 
Allowed request level metric names are " -+ "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of " -+ "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " -+ "and the blog: https://hao-ai-lab.github.io/blogs/distserve") -+ -+ # group for dataset specific arguments -+ sonnet_group = parser.add_argument_group("sonnet dataset options") -+ sonnet_group.add_argument( -+ "--sonnet-input-len", -+ type=int, -+ default=550, -+ help= -+ "Number of input tokens per request, used only for sonnet dataset.", -+ ) -+ sonnet_group.add_argument( -+ "--sonnet-output-len", -+ type=int, -+ default=150, -+ help= -+ "Number of output tokens per request, used only for sonnet dataset.", -+ ) -+ sonnet_group.add_argument( -+ "--sonnet-prefix-len", -+ type=int, -+ default=200, -+ help= -+ "Number of prefix tokens per request, used only for sonnet dataset.", -+ ) -+ -+ sharegpt_group = parser.add_argument_group("sharegpt dataset options") -+ sharegpt_group.add_argument( -+ "--sharegpt-output-len", -+ type=int, -+ default=None, -+ help="Output length for each request. Overrides the output length " -+ "from the ShareGPT dataset.") -+ -+ random_group = parser.add_argument_group("random dataset options") -+ random_group.add_argument( -+ "--random-input-len", -+ type=int, -+ default=1024, -+ help= -+ "Number of input tokens per request, used only for random sampling.", -+ ) -+ random_group.add_argument( -+ "--random-output-len", -+ type=int, -+ default=128, -+ help= -+ "Number of output tokens per request, used only for random sampling.", -+ ) -+ random_group.add_argument( -+ "--random-range-ratio", -+ type=float, -+ default=1.0, -+ help="Range of sampled ratio of input/output length, " -+ "used only for random sampling.", -+ ) -+ random_group.add_argument( -+ "--random-prefix-len", -+ type=int, -+ default=0, -+ help="Number of fixed prefix tokens before random " -+ " context. The length range of context in a random " -+ " request is [random-prefix-len, " -+ " random-prefix-len + random-prefix-len * random-range-ratio).") -+ -+ hf_group = parser.add_argument_group("hf dataset options") -+ hf_group.add_argument("--hf-subset", -+ type=str, -+ default=None, -+ help="Subset of the HF dataset.") -+ hf_group.add_argument("--hf-split", -+ type=str, -+ default=None, -+ help="Split of the HF dataset.") -+ hf_group.add_argument( -+ "--hf-output-len", -+ type=int, -+ default=None, -+ help="Output length for each request. Overrides the output lengths " -+ "from the sampled HF dataset.", -+ ) -+ -+ parser.add_argument( -+ '--tokenizer-mode', -+ type=str, -+ default="auto", -+ choices=['auto', 'slow', 'mistral'], -+ help='The tokenizer mode.\n\n* "auto" will use the ' -+ 'fast tokenizer if available.\n* "slow" will ' -+ 'always use the slow tokenizer. \n* ' -+ '"mistral" will always use the `mistral_common` tokenizer.') -+ -+ args = parser.parse_args() -+ main(args) diff --git a/cmake/utils.cmake b/cmake/utils.cmake -index 40430dae1..76efeda6c 100644 +index c9cd099b8..7c6c46f38 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake -@@ -379,7 +379,7 @@ function (define_gpu_extension_target GPU_MOD_NAME) +@@ -400,7 +400,7 @@ function (define_gpu_extension_target GPU_MOD_NAME) GPU "WITH_SOABI" "DESTINATION;LANGUAGE;USE_SABI" @@ -1950,7 +103,7 @@ index 40430dae1..76efeda6c 100644 # Add hipify preprocessing step when building with HIP/ROCm. 
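A quick aside on the `--request-rate` and `--burstiness` options defined in the benchmark hunk above, before the remaining cmake and kernel changes: request intervals are drawn from a gamma distribution with shape `burstiness` and scale `1 / (request_rate * burstiness)`, so the mean interval stays at `1 / request_rate` regardless of burstiness, and `burstiness == 1` reduces to an exponential distribution (Poisson arrivals). A self-contained sketch (the numbers are arbitrary examples):

    # Sketch only -- not part of the patch.
    import numpy as np

    request_rate = 5.0  # requests per second (example value)
    for burstiness in (0.5, 1.0, 2.0):
        theta = 1.0 / (request_rate * burstiness)
        intervals = np.random.gamma(shape=burstiness, scale=theta, size=100_000)
        print(f"burstiness={burstiness}: mean interval {intervals.mean():.3f}s "
              f"(expected {1.0 / request_rate:.3f}s)")
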
if (GPU_LANGUAGE STREQUAL "HIP") -@@ -421,6 +421,11 @@ function (define_gpu_extension_target GPU_MOD_NAME) +@@ -442,6 +442,11 @@ function (define_gpu_extension_target GPU_MOD_NAME) target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES}) @@ -1964,10 +117,10 @@ index 40430dae1..76efeda6c 100644 if (GPU_LANGUAGE STREQUAL "CUDA") diff --git a/cmake/xpu_extension.cmake b/cmake/xpu_extension.cmake new file mode 100644 -index 000000000..a99dcd5a3 +index 000000000..fd671a6bf --- /dev/null +++ b/cmake/xpu_extension.cmake -@@ -0,0 +1,61 @@ +@@ -0,0 +1,62 @@ +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# @@ -2010,6 +163,7 @@ index 000000000..a99dcd5a3 + "csrc/xpu/layernorm_xpu.cpp" + "csrc/xpu/pos_encoding_xpu.cpp" + "csrc/xpu/utils.cpp" ++ "csrc/xpu/fused_moe.cpp" + "csrc/xpu/pybind.cpp") + +define_gpu_extension_target( @@ -2029,1133 +183,6 @@ index 000000000..a99dcd5a3 +message(STATUS "Enabling C extension.") +add_dependencies(default_xpu _C) + -diff --git a/csrc/attention/paged_attention_v1.cu b/csrc/attention/paged_attention_v1.cu -index cb1a06994..27321148f 100644 ---- a/csrc/attention/paged_attention_v1.cu -+++ b/csrc/attention/paged_attention_v1.cu -@@ -53,7 +53,7 @@ void paged_attention_v1_launcher( - torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, -- const c10::optional& alibi_slopes, float k_scale, -+ const std::optional& alibi_slopes, float k_scale, - float v_scale, const int tp_rank, const int blocksparse_local_blocks, - const int blocksparse_vert_stride, const int blocksparse_block_size, - const int blocksparse_head_sliding_step) { -@@ -176,7 +176,7 @@ void paged_attention_v1( - torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] - torch::Tensor& seq_lens, // [num_seqs] - int64_t block_size, int64_t max_seq_len, -- const c10::optional& alibi_slopes, -+ const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, - const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, -diff --git a/csrc/attention/paged_attention_v2.cu b/csrc/attention/paged_attention_v2.cu -index c457bdb89..a453b2243 100644 ---- a/csrc/attention/paged_attention_v2.cu -+++ b/csrc/attention/paged_attention_v2.cu -@@ -54,7 +54,7 @@ void paged_attention_v2_launcher( - torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, -- const c10::optional& alibi_slopes, float k_scale, -+ const std::optional& alibi_slopes, float k_scale, - float v_scale, const int tp_rank, const int blocksparse_local_blocks, - const int blocksparse_vert_stride, const int blocksparse_block_size, - const int blocksparse_head_sliding_step) { -@@ -187,7 +187,7 @@ void paged_attention_v2( - torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] - torch::Tensor& seq_lens, // [num_seqs] - int64_t block_size, int64_t max_seq_len, -- const c10::optional& alibi_slopes, -+ const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, - const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, -diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp -index 
e21832ba7..ef5b14088 100644 ---- a/csrc/cpu/attention.cpp -+++ b/csrc/cpu/attention.cpp -@@ -386,7 +386,7 @@ void paged_attention_v1_impl_launcher( - torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, -- const c10::optional& alibi_slopes) { -+ const std::optional& alibi_slopes) { - int num_seqs = query.size(0); - int num_heads = query.size(1); - int head_size = query.size(2); -@@ -459,7 +459,7 @@ void paged_attention_v1( - torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int64_t num_kv_heads, double scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, -- int64_t max_seq_len, const c10::optional& alibi_slopes, -+ int64_t max_seq_len, const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, - const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, -@@ -702,7 +702,7 @@ void paged_attention_v2_impl_launcher( - torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size, -- int max_seq_len, const c10::optional& alibi_slopes) { -+ int max_seq_len, const std::optional& alibi_slopes) { - int num_seqs = query.size(0); - int num_heads = query.size(1); - int head_size = query.size(2); -@@ -781,7 +781,7 @@ void paged_attention_v2( - torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int64_t num_kv_heads, double scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, -- int64_t max_seq_len, const c10::optional& alibi_slopes, -+ int64_t max_seq_len, const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, - const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, -diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp -index d9aed657a..33b163783 100644 ---- a/csrc/cpu/quant.cpp -+++ b/csrc/cpu/quant.cpp -@@ -359,7 +359,7 @@ void int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major - const torch::Tensor& b, // [IC, OC], column-major - const torch::Tensor& a_scales, // [1] or [M] - const torch::Tensor& b_scales, // [1] or [OC] -- const c10::optional& bias // [OC] -+ const std::optional& bias // [OC] - ) { - CPU_KERNEL_GUARD_IN(cutlass_scaled_mm) - // Checks for conformality -@@ -442,8 +442,8 @@ void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major - const torch::Tensor& a_scales, // [1] or [M] - const torch::Tensor& b_scales, // [1] or [OC] - const torch::Tensor& azp_adj, // [OC] -- const c10::optional& azp, // [1] or [M] -- const c10::optional& bias // [OC] -+ const std::optional& azp, // [1] or [M] -+ const std::optional& bias // [OC] - ) { - CPU_KERNEL_GUARD_IN(cutlass_scaled_mm_azp) - // Checks for conformality -@@ -561,7 +561,7 @@ void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major - void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] - const torch::Tensor& input, // [..., hidden_size] - const torch::Tensor& scale, -- c10::optional const& azp) { -+ std::optional const& azp) { - CPU_KERNEL_GUARD_IN(static_scaled_int8_quant) - 
TORCH_CHECK(input.is_contiguous()); - TORCH_CHECK(out.is_contiguous()); -@@ -590,7 +590,7 @@ void dynamic_scaled_int8_quant( - torch::Tensor& out, // [..., hidden_size] - const torch::Tensor& input, // [..., hidden_size] - torch::Tensor& scale, // [..., 1] -- c10::optional const& azp) { -+ std::optional const& azp) { - CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant) - TORCH_CHECK(input.is_contiguous()); - TORCH_CHECK(out.is_contiguous()); -diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp -index 03beefbc6..74e4d8189 100644 ---- a/csrc/cpu/torch_bindings.cpp -+++ b/csrc/cpu/torch_bindings.cpp -@@ -9,14 +9,14 @@ std::string init_cpu_threads_env(const std::string& cpu_ids); - void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a, - const torch::Tensor& b, const torch::Tensor& a_scales, - const torch::Tensor& b_scales, -- const c10::optional& bias); -+ const std::optional& bias); - - void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a, - const torch::Tensor& b, const torch::Tensor& a_scales, - const torch::Tensor& b_scales, - const torch::Tensor& azp_adj, -- const c10::optional& azp, -- const c10::optional& bias); -+ const std::optional& azp, -+ const std::optional& bias); - - TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { - // vLLM custom ops -diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp -index 26f7423fd..ef413e6dd 100644 ---- a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp -+++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp -@@ -68,7 +68,7 @@ struct ScaledEpilogueBase { - // This overload handles the case where there might not be a tensor, in which - // case a nullptr is passed and a constant (0) is used. - template -- static auto args_from_tensor(c10::optional const& tensor) { -+ static auto args_from_tensor(std::optional const& tensor) { - static_assert(std::is_same_v>); - using Arguments = typename Descriptor::Arguments; - auto* data_ptr = tensor ? static_cast(tensor->data_ptr()) : nullptr; -@@ -223,7 +223,7 @@ struct ScaledEpilogueBiasAzp - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& azp_adj, -- c10::optional const& bias) { -+ std::optional const& bias) { - auto a_args = SUPER::template args_from_tensor(a_scales); - auto b_args = SUPER::template args_from_tensor(b_scales); - auto bias_args = SUPER::template args_from_tensor(bias); -@@ -301,7 +301,7 @@ struct ScaledEpilogueBiasAzpToken - torch::Tensor const& b_scales, - torch::Tensor const& azp_adj, - torch::Tensor const& azp, -- c10::optional const& bias) { -+ std::optional const& bias) { - auto a_args = SUPER::template args_from_tensor(a_scales); - auto b_args = SUPER::template args_from_tensor(b_scales); - auto bias_args = SUPER::template args_from_tensor(bias); -diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp -index c723adf12..c590c66a6 100644 ---- a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp -+++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp -@@ -67,7 +67,7 @@ struct ScaledEpilogueBase { - // This overload handles the case where there might not be a tensor, in which - // case a nullptr is passed and a constant (0) is used. 
- template -- static auto args_from_tensor(c10::optional const& tensor) { -+ static auto args_from_tensor(std::optional const& tensor) { - using Arguments = typename Descriptor::Arguments; - auto* data_ptr = tensor ? static_cast(tensor->data_ptr()) : nullptr; - static_assert(std::is_same_v> || -@@ -223,7 +223,7 @@ struct ScaledEpilogueBiasAzp - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& azp_adj, -- c10::optional const& bias) { -+ std::optional const& bias) { - auto a_args = SUPER::template args_from_tensor(a_scales); - auto b_args = SUPER::template args_from_tensor(b_scales); - auto bias_args = SUPER::template args_from_tensor(bias); -@@ -299,7 +299,7 @@ struct ScaledEpilogueBiasAzpToken - torch::Tensor const& b_scales, - torch::Tensor const& azp_adj, - torch::Tensor const& azp, -- c10::optional const& bias) { -+ std::optional const& bias) { - auto a_args = SUPER::template args_from_tensor(a_scales); - auto b_args = SUPER::template args_from_tensor(b_scales); - auto bias_args = SUPER::template args_from_tensor(bias); -diff --git a/csrc/cutlass_extensions/torch_utils.hpp b/csrc/cutlass_extensions/torch_utils.hpp -index 2c7857252..a1ff933cc 100644 ---- a/csrc/cutlass_extensions/torch_utils.hpp -+++ b/csrc/cutlass_extensions/torch_utils.hpp -@@ -97,7 +97,7 @@ static inline auto make_cute_layout(torch::Tensor const& tensor, - - template - static inline auto maybe_make_cute_layout( -- c10::optional const& tensor, -+ std::optional const& tensor, - std::string_view name = "tensor") { - using Layout = decltype(make_cute_layout(*tensor)); - -diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py -index a5beea1a3..b401736c9 100644 ---- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py -+++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py -@@ -14,9 +14,9 @@ class VLLMDataType(enum.Enum): - - - class MixedInputKernelScheduleType(enum.Enum): -- TmaWarpSpecializedMixedInput = enum_auto() -- TmaWarpSpecializedPingpongMixedInput = enum_auto() -- TmaWarpSpecializedCooperativeMixedInput = enum_auto() -+ TmaWarpSpecialized = enum_auto() -+ TmaWarpSpecializedPingpong = enum_auto() -+ TmaWarpSpecializedCooperative = enum_auto() - - - VLLMDataTypeNames: Dict[Union[VLLMDataType, DataType], str] = { -@@ -68,11 +68,11 @@ VLLMKernelScheduleTag: Dict[Union[ - MixedInputKernelScheduleType, KernelScheduleType], str] = { - **KernelScheduleTag, # type: ignore - **{ -- MixedInputKernelScheduleType.TmaWarpSpecializedMixedInput: -- "cutlass::gemm::KernelTmaWarpSpecializedMixedInput", -- MixedInputKernelScheduleType.TmaWarpSpecializedPingpongMixedInput: -- "cutlass::gemm::KernelTmaWarpSpecializedPingpongMixedInput", -- MixedInputKernelScheduleType.TmaWarpSpecializedCooperativeMixedInput: -- "cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput", -+ MixedInputKernelScheduleType.TmaWarpSpecialized: -+ "cutlass::gemm::KernelTmaWarpSpecialized", -+ MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: -+ "cutlass::gemm::KernelTmaWarpSpecializedPingpong", -+ MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: -+ "cutlass::gemm::KernelTmaWarpSpecializedCooperative", - } - } -diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.cu b/csrc/mamba/causal_conv1d/causal_conv1d.cu -index dd1e6de2e..f0e5533bc 100644 ---- a/csrc/mamba/causal_conv1d/causal_conv1d.cu -+++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu -@@ -53,12 +53,12 @@ void 
set_conv_params_fwd(ConvParamsBase ¶ms, - const at::Tensor x, - const at::Tensor weight, - const at::Tensor out, -- const c10::optional& bias, -+ const std::optional& bias, - bool silu_activation, - int64_t pad_slot_id, -- const c10::optional& query_start_loc = std::nullopt, -- const c10::optional& cache_indices = std::nullopt, -- const c10::optional& has_initial_state = std::nullopt) { -+ const std::optional& query_start_loc = std::nullopt, -+ const std::optional& cache_indices = std::nullopt, -+ const std::optional& has_initial_state = std::nullopt) { - - // Reset the parameters - memset(¶ms, 0, sizeof(params)); -@@ -93,11 +93,11 @@ void set_conv_params_fwd(ConvParamsBase ¶ms, - - - void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight, -- const c10::optional &bias_, -- const c10::optional &conv_states, -- const c10::optional &query_start_loc, -- const c10::optional &cache_indices, -- const c10::optional &has_initial_state, -+ const std::optional &bias_, -+ const std::optional &conv_states, -+ const std::optional &query_start_loc, -+ const std::optional &cache_indices, -+ const std::optional &has_initial_state, - bool silu_activation, - // used to identify padding entries if cache_indices provided - // in case of padding, the kernel will return early -@@ -194,10 +194,10 @@ void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight, - void causal_conv1d_update(const at::Tensor &x, - const at::Tensor &conv_state, - const at::Tensor &weight, -- const c10::optional &bias_, -+ const std::optional &bias_, - bool silu_activation, -- const c10::optional &cache_seqlens_, -- const c10::optional &conv_state_indices_, -+ const std::optional &cache_seqlens_, -+ const std::optional &conv_state_indices_, - // used to identify padding entries if cache_indices provided - // in case of padding, the kernel will return early - int64_t pad_slot_id) { -diff --git a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu -index 716246963..bd0a34119 100644 ---- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu -+++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu -@@ -402,14 +402,14 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms, - const torch::Tensor out, - const torch::Tensor z, - const torch::Tensor out_z, -- const c10::optional& D, -- const c10::optional& delta_bias, -+ const std::optional& D, -+ const std::optional& delta_bias, - const torch::Tensor ssm_states, - bool has_z, - bool delta_softplus, -- const c10::optional& query_start_loc, -- const c10::optional& cache_indices, -- const c10::optional& has_initial_state, -+ const std::optional& query_start_loc, -+ const std::optional& cache_indices, -+ const std::optional& has_initial_state, - bool varlen, - int64_t pad_slot_id) { - -@@ -504,13 +504,13 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms, - - void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta, - const torch::Tensor &A, const torch::Tensor &B, const torch::Tensor &C, -- const c10::optional &D_, -- const c10::optional &z_, -- const c10::optional &delta_bias_, -+ const std::optional &D_, -+ const std::optional &z_, -+ const std::optional &delta_bias_, - bool delta_softplus, -- const c10::optional &query_start_loc, -- const c10::optional &cache_indices, -- const c10::optional &has_initial_state, -+ const std::optional &query_start_loc, -+ const std::optional &cache_indices, -+ const std::optional &has_initial_state, - const torch::Tensor &ssm_states, - // used to identify padding entries if cache_indices provided - // in case of 
padding, the kernel will return early -diff --git a/csrc/ops.h b/csrc/ops.h -index 347c50284..9efd9b0c2 100644 ---- a/csrc/ops.h -+++ b/csrc/ops.h -@@ -33,7 +33,7 @@ void paged_attention_v1( - torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int64_t num_kv_heads, double scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, -- int64_t max_seq_len, const c10::optional& alibi_slopes, -+ int64_t max_seq_len, const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, - const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, -@@ -44,7 +44,7 @@ void paged_attention_v2( - torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int64_t num_kv_heads, double scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, -- int64_t max_seq_len, const c10::optional& alibi_slopes, -+ int64_t max_seq_len, const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, - const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, -@@ -153,15 +153,15 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability); - void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, torch::Tensor const& a_scales, - torch::Tensor const& b_scales, -- c10::optional const& bias); -+ std::optional const& bias); - - void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& azp_adj, -- c10::optional const& azp, -- c10::optional const& bias); -+ std::optional const& azp, -+ std::optional const& bias); - - bool cutlass_sparse_scaled_mm_supported(int64_t cuda_device_capability); - -@@ -169,7 +169,7 @@ void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, torch::Tensor const& e, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales, -- c10::optional const& bias); -+ std::optional const& bias); - - bool cutlass_sparse_compress_entry(torch::Tensor& a_compressed, - torch::Tensor& e, torch::Tensor const& a); -@@ -177,11 +177,11 @@ bool cutlass_sparse_compress_entry(torch::Tensor& a_compressed, - - void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, - torch::Tensor const& scale, -- c10::optional const& azp); -+ std::optional const& azp); - - void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, - torch::Tensor& scales, -- c10::optional const& azp); -+ std::optional const& azp); - - torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight, - torch::Tensor b_gptq_qzeros, -@@ -198,34 +198,34 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input, - - void dynamic_per_token_scaled_fp8_quant( - torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale, -- c10::optional const& scale_ub); -+ std::optional const& scale_ub); - - void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta, - const torch::Tensor& A, const torch::Tensor& B, - const torch::Tensor& C, -- const c10::optional& D_, -- const c10::optional& z_, -- const c10::optional& delta_bias_, -+ const std::optional& D_, -+ const std::optional& z_, -+ const 
std::optional& delta_bias_, - bool delta_softplus, -- const c10::optional& query_start_loc, -- const c10::optional& cache_indices, -- const c10::optional& has_initial_state, -+ const std::optional& query_start_loc, -+ const std::optional& cache_indices, -+ const std::optional& has_initial_state, - const torch::Tensor& ssm_states, int64_t pad_slot_id); - - void causal_conv1d_update(const at::Tensor& x, const at::Tensor& conv_state, - const at::Tensor& weight, -- const c10::optional& bias_, -+ const std::optional& bias_, - bool silu_activation, -- const c10::optional& cache_seqlens_, -- const c10::optional& conv_state_indices_, -+ const std::optional& cache_seqlens_, -+ const std::optional& conv_state_indices_, - int64_t pad_slot_id); - - void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight, -- const c10::optional& bias_, -- const c10::optional& conv_states, -- const c10::optional& query_start_loc, -- const c10::optional& cache_indices, -- const c10::optional& has_initial_state, -+ const std::optional& bias_, -+ const std::optional& conv_states, -+ const std::optional& query_start_loc, -+ const std::optional& cache_indices, -+ const std::optional& has_initial_state, - bool silu_activation, int64_t pad_slot_id); - - #ifndef USE_ROCM -diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu -index e9987535b..e79785827 100644 ---- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu -+++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu -@@ -226,7 +226,7 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel( - void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] - torch::Tensor const& input, // [..., hidden_size] - torch::Tensor const& scale, -- c10::optional const& azp) { -+ std::optional const& azp) { - TORCH_CHECK(input.is_contiguous()); - TORCH_CHECK(out.is_contiguous()); - TORCH_CHECK(scale.numel() == 1); -@@ -257,7 +257,7 @@ void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] - void dynamic_scaled_int8_quant( - torch::Tensor& out, // [..., hidden_size] - torch::Tensor const& input, // [..., hidden_size] -- torch::Tensor& scales, c10::optional const& azp) { -+ torch::Tensor& scales, std::optional const& azp) { - TORCH_CHECK(input.is_contiguous()); - TORCH_CHECK(out.is_contiguous()); - TORCH_CHECK(scales.is_contiguous()); -diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu -index dbb72e8bb..865fef5ae 100644 ---- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu -+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu -@@ -39,7 +39,7 @@ void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales, -- c10::optional const& bias) { -+ std::optional const& bias) { - TORCH_CHECK(a_scales.dtype() == torch::kFloat32); - TORCH_CHECK(b_scales.dtype() == torch::kFloat32); - if (bias) { -@@ -58,8 +58,8 @@ void cutlass_scaled_mm_azp_sm75(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& azp_adj, -- c10::optional const& azp, -- c10::optional const& bias) { -+ std::optional const& azp, -+ std::optional const& bias) { - TORCH_CHECK(a_scales.dtype() == torch::kFloat32); - TORCH_CHECK(b_scales.dtype() == torch::kFloat32); - -@@ -94,7 +94,7 @@ void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a, - 
torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales, -- c10::optional const& bias) { -+ std::optional const& bias) { - TORCH_CHECK(a_scales.dtype() == torch::kFloat32); - TORCH_CHECK(b_scales.dtype() == torch::kFloat32); - if (bias) { -@@ -113,8 +113,8 @@ void cutlass_scaled_mm_azp_sm80(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& azp_adj, -- c10::optional const& azp, -- c10::optional const& bias) { -+ std::optional const& azp, -+ std::optional const& bias) { - TORCH_CHECK(a_scales.dtype() == torch::kFloat32); - TORCH_CHECK(b_scales.dtype() == torch::kFloat32); - -@@ -165,7 +165,7 @@ void cutlass_scaled_mm_sm89(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales, -- c10::optional const& bias) { -+ std::optional const& bias) { - TORCH_CHECK(a_scales.dtype() == torch::kFloat32); - TORCH_CHECK(b_scales.dtype() == torch::kFloat32); - if (bias) { -@@ -184,8 +184,8 @@ void cutlass_scaled_mm_azp_sm89(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& azp_adj, -- c10::optional const& azp, -- c10::optional const& bias) { -+ std::optional const& azp, -+ std::optional const& bias) { - TORCH_CHECK(a_scales.dtype() == torch::kFloat32); - TORCH_CHECK(b_scales.dtype() == torch::kFloat32); - -diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu -index 123f4359c..e18d7d79e 100644 ---- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu -+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu -@@ -51,7 +51,7 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales, -- c10::optional const& bias) { -+ std::optional const& bias) { - TORCH_CHECK(a_scales.dtype() == torch::kFloat32); - TORCH_CHECK(b_scales.dtype() == torch::kFloat32); - if (bias) { -@@ -70,8 +70,8 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& azp_adj, -- c10::optional const& azp, -- c10::optional const& bias) { -+ std::optional const& azp, -+ std::optional const& bias) { - TORCH_CHECK(a_scales.dtype() == torch::kFloat32); - TORCH_CHECK(b_scales.dtype() == torch::kFloat32); - -diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu -index 4f7b6588e..3f2b52624 100644 ---- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu -+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu -@@ -9,26 +9,26 @@ void cutlass_scaled_mm_sm75(torch::Tensor& c, torch::Tensor const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales, -- c10::optional const& bias); -+ std::optional const& bias); - - void cutlass_scaled_mm_sm80(torch::Tensor& c, torch::Tensor const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales, -- c10::optional const& bias); -+ std::optional const& bias); - - void cutlass_scaled_mm_sm89(torch::Tensor& c, torch::Tensor const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales, -- c10::optional const& bias); -+ std::optional const& bias); - - #if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X - 
void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales, -- c10::optional const& bias); -+ std::optional const& bias); - #endif - - void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a, -@@ -36,24 +36,24 @@ void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& azp_adj, -- c10::optional const& azp, -- c10::optional const& bias); -+ std::optional const& azp, -+ std::optional const& bias); - - void cutlass_scaled_mm_azp_sm80(torch::Tensor& c, torch::Tensor const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& azp_adj, -- c10::optional const& azp, -- c10::optional const& bias); -+ std::optional const& azp, -+ std::optional const& bias); - - void cutlass_scaled_mm_azp_sm89(torch::Tensor& c, torch::Tensor const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& azp_adj, -- c10::optional const& azp, -- c10::optional const& bias); -+ std::optional const& azp, -+ std::optional const& bias); - - #if defined CUDA_VERSION && CUDA_VERSION >= 12000 - void cutlass_scaled_mm_azp_sm90(torch::Tensor& c, torch::Tensor const& a, -@@ -61,8 +61,8 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& c, torch::Tensor const& a, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& azp_adj, -- c10::optional const& azp, -- c10::optional const& bias); -+ std::optional const& azp, -+ std::optional const& bias); - #endif - - bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) { -@@ -84,7 +84,7 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) { - void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a, - torch::Tensor const& b, torch::Tensor const& a_scales, - torch::Tensor const& b_scales, -- c10::optional const& bias) { -+ std::optional const& bias) { - // Checks for conformality - TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); - TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && -@@ -148,8 +148,8 @@ void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& azp_adj, -- c10::optional const& azp, -- c10::optional const& bias) { -+ std::optional const& azp, -+ std::optional const& bias) { - // Checks for conformality - TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); - TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && -diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu -index 0c698ced7..04ef842fb 100644 ---- a/csrc/quantization/gptq_marlin/gptq_marlin.cu -+++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu -@@ -834,6 +834,7 @@ __global__ void Marlin( - int4* sh_g_idx = sh_b + (stages * b_sh_stage); - int4* sh_zp = sh_g_idx + (stages * g_idx_stage); - int4* sh_s = sh_zp + (stages * zp_sh_stage); -+ int4* sh_red = sh_s + (stages * s_sh_stage); - - // Register storage for double buffer of shared memory reads. 
- FragA frag_a[2][thread_m_blocks]; -@@ -932,11 +933,11 @@ __global__ void Marlin( - int4* sh_s_stage = sh_s + s_sh_stage * pipe; - - if constexpr (group_blocks >= thread_k_blocks) { -+ if (s_sh_wr_pred) { -+ cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]); -+ } - // Only fetch scales if this tile starts a new group -- if (pipe % (group_blocks / thread_k_blocks) == 0) { -- if (s_sh_wr_pred) { -- cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]); -- } -+ if ((pipe + 1) % (group_blocks / thread_k_blocks) == 0) { - s_gl_rd += s_gl_rd_delta; - } - } else { -@@ -1038,9 +1039,7 @@ __global__ void Marlin( - // No act-order case - if constexpr (group_blocks != -1) { - if constexpr (group_blocks >= thread_k_blocks) { -- int4* sh_s_stage = -- sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) * -- (pipe / (group_blocks / thread_k_blocks))); -+ int4* sh_s_stage = sh_s + s_sh_stage * pipe; - reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd]; - } else { - int warp_id = threadIdx.x / 32; -@@ -1339,15 +1338,15 @@ __global__ void Marlin( - int red_sh_wr = - red_sh_delta * j + (red_sh_rd - red_sh_stride * i); - if (i < red_off) { -- float* c_rd = -- reinterpret_cast(&sh[red_sh_delta * j + red_sh_rd]); -- float* c_wr = reinterpret_cast(&sh[red_sh_wr]); -+ float* c_rd = reinterpret_cast( -+ &sh_red[red_sh_delta * j + red_sh_rd]); -+ float* c_wr = reinterpret_cast(&sh_red[red_sh_wr]); - #pragma unroll - for (int k = 0; k < 4; k++) - reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += - c_rd[k] + c_wr[k]; - } -- sh[red_sh_wr] = -+ sh_red[red_sh_wr] = - reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; - } - } -@@ -1357,7 +1356,7 @@ __global__ void Marlin( - #pragma unroll - for (int i = 0; i < 4 * 2; i++) { - float* c_rd = -- reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); -+ reinterpret_cast(&sh_red[red_sh_delta * i + red_sh_rd]); - #pragma unroll - for (int j = 0; j < 4; j++) - reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += -@@ -1397,7 +1396,7 @@ __global__ void Marlin( - #pragma unroll - for (int i = 0; i < thread_m_blocks * 4; i++) { - cp_async4_pred( -- &sh[c_sh_wr + c_sh_wr_delta * i], -+ &sh_red[c_sh_wr + c_sh_wr_delta * i], - &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + - c_gl_wr_delta_i * (i % 2)], - i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m); -@@ -1410,7 +1409,7 @@ __global__ void Marlin( - for (int i = 0; i < thread_m_blocks * 4; i++) { - if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) { - if (!first) { -- int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta]; -+ int4 c_red = sh_red[c_sh_wr + i * c_sh_wr_delta]; - #pragma unroll - for (int j = 0; j < 2 * 4; j++) { - reinterpret_cast( -@@ -1461,10 +1460,10 @@ __global__ void Marlin( - float* frag_c_ptr = reinterpret_cast(&frag_c); - #pragma unroll - for (int k = 0; k < th_size; k++) { -- sh[threadIdx.x] = -+ sh_red[threadIdx.x] = - C_tmp[c_cur_offset + active_threads * k + threadIdx.x]; - -- float* sh_c_ptr = reinterpret_cast(&sh[threadIdx.x]); -+ float* sh_c_ptr = reinterpret_cast(&sh_red[threadIdx.x]); - #pragma unroll - for (int f = 0; f < 4; f++) { - frag_c_ptr[k * 4 + f] += sh_c_ptr[f]; -@@ -1515,7 +1514,7 @@ __global__ void Marlin( - res = __hmul2(res, s[0]); - } - -- ((scalar_t2*)sh)[idx] = res; -+ ((scalar_t2*)sh_red)[idx] = res; - }; - - if (threadIdx.x / 32 < thread_n_blocks / 4) { -@@ -1543,7 +1542,7 @@ __global__ void Marlin( - i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); - i++) { - if (c_gl_wr < c_gl_wr_end) { -- C[c_gl_wr] = sh[c_sh_rd]; -+ C[c_gl_wr] = 
sh_red[c_sh_rd]; - c_gl_wr += c_gl_wr_delta; - c_sh_rd += c_sh_rd_delta; - } -@@ -1865,9 +1864,12 @@ bool is_valid_cache_size(thread_config_t const& th_config, int max_m_blocks, - - float pipe_size = (a_size + b_size) * pipe_stages; - -+ float reduce_size = max(th_config.num_threads * 32 * 4, -+ (tb_n / 64) * 32 * (tb_max_m / 16) * 4 * 2 * 4 * 2); -+ - TORCH_CHECK(max_shared_mem / 2 > scales_cache_size); // Sanity - -- return pipe_size < 0.95f * (max_shared_mem - scales_cache_size); -+ return pipe_size + reduce_size < 0.95f * (max_shared_mem - scales_cache_size); - } - - bool is_valid_config(thread_config_t const& th_config, int max_m_blocks, -diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py -index ac63afe79..a9b5ddf4c 100644 ---- a/csrc/quantization/machete/generate.py -+++ b/csrc/quantization/machete/generate.py -@@ -63,7 +63,7 @@ torch::Tensor mm_dispatch_{{type_sig}}(MMArgs args) { - - - static inline std::optional maybe_scalartype( -- c10::optional const& t) { -+ std::optional const& t) { - if (!t) { - return std::nullopt; - } else { -@@ -189,7 +189,7 @@ using Kernel_{{type_sig}} = MacheteKernelTemplate< - {{DataTypeTag[t.b_group_zeropoint]}}, // GroupZeroT - {{DataTypeTag[t.b_channel_scale]}}, // ChannelScaleT - {{DataTypeTag[t.a_token_scale]}}, // TokenScaleT -- cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput, -+ cutlass::gemm::KernelTmaWarpSpecializedCooperative, - Sch>; - - {% for sch in schs %} -@@ -223,7 +223,7 @@ torch::Tensor prepack_B_dispatch(PrepackBArgs args) { - {{DataTypeTag[t.convert]}}, // ElementConvert - {{DataTypeTag[t.accumulator]}}, // Accumulator - cutlass::layout::ColumnMajor, -- cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput> -+ cutlass::gemm::KernelTmaWarpSpecializedCooperative> - >(args.B); - } - {%- endfor %} -@@ -239,7 +239,7 @@ torch::Tensor prepack_B_dispatch(PrepackBArgs args) { - }; // namespace machete - """ - --TmaMI = MixedInputKernelScheduleType.TmaWarpSpecializedCooperativeMixedInput -+TmaMI = MixedInputKernelScheduleType.TmaWarpSpecializedCooperative - TmaCoop = EpilogueScheduleType.TmaWarpSpecializedCooperative - - -@@ -300,7 +300,7 @@ def generate_sch_sig(schedule_config: ScheduleConfig) -> str: - # mostly unique shorter sch_sig - def generate_terse_sch_sig(schedule_config: ScheduleConfig) -> str: - kernel_terse_names_replace = { -- "KernelTmaWarpSpecializedCooperativeMixedInput_": "TmaMI_", -+ "KernelTmaWarpSpecializedCooperative": "TmaMI_", - "TmaWarpSpecializedCooperative_": "TmaCoop_", - "StreamKScheduler": "streamK", - } -diff --git a/csrc/quantization/machete/machete_collective_builder.cuh b/csrc/quantization/machete/machete_collective_builder.cuh -index a74cf8b2d..ee825583d 100644 ---- a/csrc/quantization/machete/machete_collective_builder.cuh -+++ b/csrc/quantization/machete/machete_collective_builder.cuh -@@ -18,16 +18,14 @@ struct VLLMCollectiveBuilder< - ElementAccumulator, TileShape_MNK, ClusterShape_MNK, StageCountType, - KernelScheduleType, - cute::enable_if_t<( -+ cute::is_same_v || -+ cute::is_same_v || - cute::is_same_v || -- cute::is_same_v || -- cute::is_same_v)>> { -+ KernelTmaWarpSpecializedCooperative>)>> { - using CollectiveOp = machete::MacheteCollectiveMma< - ElementPairA_, GmemLayoutA_, AlignmentA, ElementPairB_, GmemLayoutB_, - AlignmentB, ElementAccumulator, TileShape_MNK, ClusterShape_MNK, - StageCountType, KernelScheduleType>; - }; - --}; // namespace cutlass::gemm::collective -\ No newline at end of file -+}; // namespace cutlass::gemm::collective 
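[Editor's note] For reference, the updated shared-memory budget check in `is_valid_cache_size` (see the `gptq_marlin.cu` hunk above, which also introduces the dedicated `sh_red` reduction buffer) can be read as follows. The sketch below is not vLLM source; it only re-expresses the patch's arithmetic in Python, with the parameter names (`num_threads`, `tb_n`, `tb_max_m`) taken from the diff and the concrete numbers purely illustrative.

```python
# Illustrative sketch only -- not vLLM code. It mirrors the updated
# is_valid_cache_size() logic: besides the A/B pipeline buffers, the dedicated
# reduction buffer (sh_red) added by this patch must now also fit in shared memory.

def reduce_buffer_bytes(num_threads: int, tb_n: int, tb_max_m: int) -> int:
    # Same sizing formula as the patch; all quantities are in bytes.
    return max(num_threads * 32 * 4,
               (tb_n // 64) * 32 * (tb_max_m // 16) * 4 * 2 * 4 * 2)

def fits_in_shared_mem(pipe_size: float, reduce_size: float,
                       scales_cache_size: float, max_shared_mem: int) -> bool:
    # The old check only budgeted pipe_size; the reduction buffer now counts too.
    return pipe_size + reduce_size < 0.95 * (max_shared_mem - scales_cache_size)

# Example with made-up numbers (256 threads, a 256x64 tile, ~100 KB of smem):
reduce_size = reduce_buffer_bytes(num_threads=256, tb_n=256, tb_max_m=64)
print(fits_in_shared_mem(pipe_size=50_000, reduce_size=reduce_size,
                         scales_cache_size=4_096, max_shared_mem=101_376))
```

The extra `reduce_size` term prevents choosing a thread configuration whose pipeline buffers alone would fit but whose reduction buffer would push the total past the shared-memory limit.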
-diff --git a/csrc/quantization/machete/machete_mainloop.cuh b/csrc/quantization/machete/machete_mainloop.cuh -index 816f33a10..4071b19a3 100644 ---- a/csrc/quantization/machete/machete_mainloop.cuh -+++ b/csrc/quantization/machete/machete_mainloop.cuh -@@ -66,13 +66,11 @@ struct MacheteCollectiveMma { - using Schedule = KernelScheduleType; - static_assert( - cute::is_same_v || -- cute::is_same_v || -+ cute::is_same_v || -+ cute::is_same_v || - cute::is_same_v || -- cute::is_same_v || - cute::is_same_v || -- cute::is_same_v, -+ cute::is_same_v, - "KernelSchedule must be one of the warp specialized policies"); - - public: -@@ -113,8 +111,7 @@ struct MacheteCollectiveMma { - // For coop schedules we have two warp groups cooperatively issuing wgmma - // instructions so we use 2 atoms along the M dim (one for each warpgroup) - using AtomLayoutMNK = cute::conditional_t< -- cute::is_same_v, -+ cute::is_same_v, - Layout>, Layout>>; - - using TiledMma = decltype(cute::make_tiled_mma( -diff --git a/csrc/quantization/machete/machete_mm_kernel.cuh b/csrc/quantization/machete/machete_mm_kernel.cuh -index d4d19ae5d..e4af06791 100644 ---- a/csrc/quantization/machete/machete_mm_kernel.cuh -+++ b/csrc/quantization/machete/machete_mm_kernel.cuh -@@ -183,11 +183,11 @@ struct MacheteKernelTemplate { - torch::Tensor const& A, // MxK matrix - torch::Tensor const& B, // KxN prepacked matrix - torch::Tensor& D, // MxN matrix -- c10::optional const& maybe_g_scales, // scale_KxN matrix -- c10::optional const& maybe_g_zeros, // scale_KxN matrix -- c10::optional maybe_group_size, -- c10::optional const& maybe_ch_scales, // len N vector -- c10::optional const& maybe_tok_scales) // len M vector -+ std::optional const& maybe_g_scales, // scale_KxN matrix -+ std::optional const& maybe_g_zeros, // scale_KxN matrix -+ std::optional maybe_group_size, -+ std::optional const& maybe_ch_scales, // len N vector -+ std::optional const& maybe_tok_scales) // len M vector - { - static_assert(!with_group_zeropoints || with_group_scales); - -diff --git a/csrc/quantization/machete/machete_mm_launcher.cuh b/csrc/quantization/machete/machete_mm_launcher.cuh -index 4b0da5b30..cabe0af46 100644 ---- a/csrc/quantization/machete/machete_mm_launcher.cuh -+++ b/csrc/quantization/machete/machete_mm_launcher.cuh -@@ -13,23 +13,23 @@ struct MMArgs { - torch::Tensor const& A; - torch::Tensor const& B; - vllm::ScalarType const& b_type; -- c10::optional const& maybe_out_type; -- c10::optional const& maybe_group_scales; -- c10::optional const& maybe_group_zeros; -- c10::optional maybe_group_size; -- c10::optional const& maybe_channel_scales; -- c10::optional const& maybe_token_scales; -- c10::optional maybe_schedule; -+ std::optional const& maybe_out_type; -+ std::optional const& maybe_group_scales; -+ std::optional const& maybe_group_zeros; -+ std::optional maybe_group_size; -+ std::optional const& maybe_channel_scales; -+ std::optional const& maybe_token_scales; -+ std::optional maybe_schedule; - }; - - struct SupportedSchedulesArgs { - at::ScalarType a_type; - vllm::ScalarType b_type; -- c10::optional maybe_group_scales_type; -- c10::optional maybe_group_zeros_type; -- c10::optional maybe_channel_scales_type; -- c10::optional maybe_token_scales_type; -- c10::optional maybe_out_type; -+ std::optional maybe_group_scales_type; -+ std::optional maybe_group_zeros_type; -+ std::optional maybe_channel_scales_type; -+ std::optional maybe_token_scales_type; -+ std::optional maybe_out_type; - }; - - torch::Tensor mm_dispatch(MMArgs args); -diff --git 
a/csrc/quantization/machete/machete_prepack_launcher.cuh b/csrc/quantization/machete/machete_prepack_launcher.cuh -index 3486d28be..634b651a4 100644 ---- a/csrc/quantization/machete/machete_prepack_launcher.cuh -+++ b/csrc/quantization/machete/machete_prepack_launcher.cuh -@@ -10,7 +10,7 @@ struct PrepackBArgs { - torch::Tensor const& B; - at::ScalarType a_type; - vllm::ScalarType b_type; -- c10::optional maybe_group_scales_type; -+ std::optional maybe_group_scales_type; - }; - - template -diff --git a/csrc/quantization/machete/machete_prepacked_layout.cuh b/csrc/quantization/machete/machete_prepacked_layout.cuh -index 680a858a8..81aaa6c4f 100644 ---- a/csrc/quantization/machete/machete_prepacked_layout.cuh -+++ b/csrc/quantization/machete/machete_prepacked_layout.cuh -@@ -98,8 +98,7 @@ struct PrepackedLayoutBTemplate { - // For coop schedules we have two warp groups cooperatively issuing wgmma - // instructions so we use 2 atoms along the M dim (one for each warpgroup) - using AtomLayoutMNK = cute::conditional_t< -- cute::is_same_v, -+ cute::is_same_v, - Layout>, Layout>>; - - using TiledMma = decltype(cute::make_tiled_mma( -@@ -247,4 +246,4 @@ struct PrepackedLayoutBTemplate { - } - }; - --}; // namespace machete -\ No newline at end of file -+}; // namespace machete -diff --git a/csrc/quantization/machete/machete_pytorch.cu b/csrc/quantization/machete/machete_pytorch.cu -index da2c2fb0d..05a51ee21 100644 ---- a/csrc/quantization/machete/machete_pytorch.cu -+++ b/csrc/quantization/machete/machete_pytorch.cu -@@ -10,11 +10,11 @@ using namespace vllm; - - std::vector supported_schedules( - at::ScalarType a_type, int64_t b_type_id, -- c10::optional maybe_group_scales_type, -- c10::optional maybe_group_zeros_type, -- c10::optional maybe_channel_scales_type, -- c10::optional maybe_token_scales_type, -- c10::optional maybe_out_type) { -+ std::optional maybe_group_scales_type, -+ std::optional maybe_group_zeros_type, -+ std::optional maybe_channel_scales_type, -+ std::optional maybe_token_scales_type, -+ std::optional maybe_out_type) { - ScalarType const b_type = ScalarType::from_id(b_type_id); - return supported_schedules_dispatch({ - .a_type = a_type, -@@ -29,13 +29,13 @@ std::vector supported_schedules( - - torch::Tensor mm(torch::Tensor const& A, torch::Tensor const& B, - int64_t b_type_id, -- c10::optional const& maybe_out_type, -- c10::optional const& maybe_group_scales, -- c10::optional const& maybe_group_zeros, -- c10::optional maybe_group_size, -- c10::optional const& maybe_channel_scales, -- c10::optional const& maybe_token_scales, -- c10::optional maybe_schedule) { -+ std::optional const& maybe_out_type, -+ std::optional const& maybe_group_scales, -+ std::optional const& maybe_group_zeros, -+ std::optional maybe_group_size, -+ std::optional const& maybe_channel_scales, -+ std::optional const& maybe_token_scales, -+ std::optional maybe_schedule) { - ScalarType const b_type = ScalarType::from_id(b_type_id); - return mm_dispatch({.A = A, - .B = B, -@@ -51,7 +51,7 @@ torch::Tensor mm(torch::Tensor const& A, torch::Tensor const& B, - - torch::Tensor prepack_B( - torch::Tensor const& B, at::ScalarType const& a_type, int64_t b_type_id, -- c10::optional const& maybe_group_scales_type) { -+ std::optional const& maybe_group_scales_type) { - ScalarType const b_type = ScalarType::from_id(b_type_id); - return prepack_B_dispatch( - {.B = B, -diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu -index b48348a51..0fec9624c 100644 ---- a/csrc/rocm/attention.cu -+++ b/csrc/rocm/attention.cu 
-@@ -928,7 +928,7 @@ void paged_attention_custom_launcher( - torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, const int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& context_lens, -- int max_context_len, const c10::optional& alibi_slopes, -+ int max_context_len, const std::optional& alibi_slopes, - float k_scale, float v_scale) { - int num_seqs = query.size(0); - int num_heads = query.size(1); -@@ -1086,7 +1086,7 @@ void paged_attention( - torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] - torch::Tensor& context_lens, // [num_seqs] - int64_t block_size, int64_t max_context_len, -- const c10::optional& alibi_slopes, -+ const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale) { - const int head_size = query.size(2); - if (kv_cache_dtype == "auto") { -diff --git a/csrc/rocm/ops.h b/csrc/rocm/ops.h -index 9f085115a..34b2f9ce8 100644 ---- a/csrc/rocm/ops.h -+++ b/csrc/rocm/ops.h -@@ -9,6 +9,6 @@ void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums, - double scale, torch::Tensor& block_tables, - torch::Tensor& context_lens, int64_t block_size, - int64_t max_context_len, -- const c10::optional& alibi_slopes, -+ const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, - double v_scale); -diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu -index 6223dc8cc..5a1879787 100644 ---- a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu -+++ b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu -@@ -286,7 +286,7 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& bt_meta, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales, -- c10::optional const& bias) { -+ std::optional const& bias) { - TORCH_CHECK(a_scales.dtype() == torch::kFloat32); - TORCH_CHECK(b_scales.dtype() == torch::kFloat32); - if (bias) { -diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu -index d464b045b..371de0950 100644 ---- a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu -+++ b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu -@@ -22,7 +22,7 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& c, torch::Tensor const& a, - torch::Tensor const& e, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales, -- c10::optional const& bias); -+ std::optional const& bias); - #endif - - void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a, -@@ -30,7 +30,7 @@ void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a, - torch::Tensor const& bt_meta, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales, -- c10::optional const& bias) { -+ std::optional const& bias) { - // Checks for conformality - TORCH_CHECK(a.dim() == 2 && bt_nzs.dim() == 2 && c.dim() == 2); - TORCH_CHECK(c.size(1) == bt_nzs.size(0) && bt_nzs.size(1) * 2 == a.size(1) && diff --git a/csrc/xpu/activation_xpu.cpp b/csrc/xpu/activation_xpu.cpp new file mode 100644 index 000000000..6f98ddbb3 @@ -6879,6 +3906,130 @@ index 000000000..a2ea5819b + o_accs.stride(0), o_accs.stride(1), output.stride(0), output.stride(1), + bsz, num_heads, row_block_num, query.device()); +} +diff --git a/csrc/xpu/base.hpp b/csrc/xpu/base.hpp +new file mode 100644 +index 000000000..c364c62e6 +--- /dev/null ++++ b/csrc/xpu/base.hpp +@@ -0,0 +1,118 @@ ++#pragma once ++ ++#include ++#include ++ ++#include "common.h" ++ ++using 
namespace sycl::ext::intel::esimd; ++using fp16 = sycl::half; ++ ++constexpr int QK = 64; ++constexpr int SBS = 4; ++ ++constexpr int BLOCK_SIZES[GGML_TYPE_COUNT] = { ++ [GGML_TYPE_Q4_0] = QK / 2, ++ [GGML_TYPE_Q4_0_WOQ] = QK / 2, ++ [GGML_TYPE_FP8E5] = QK, ++}; ++ ++constexpr int SCALE_SIZES[GGML_TYPE_COUNT] = { ++ [GGML_TYPE_Q4_0] = sizeof(fp16), ++ [GGML_TYPE_Q4_0_WOQ] = sizeof(fp16), ++ [GGML_TYPE_FP8E5] = 0, ++}; ++ ++template ++ESIMD_INLINE auto load_qblocks(const uint8_t * weight, const uint8_t * scale); ++ ++template<> ++ESIMD_INLINE auto load_qblocks(const uint8_t * weight, const uint8_t * scale) { ++ constexpr int BLOCK_SIZE = BLOCK_SIZES[GGML_TYPE_Q4_0]; ++ simd ybytes = block_load(weight); ++ const simd scales = block_load((const fp16 *)scale); ++ ++ simd yvs; ++ #pragma unroll ++ for (int i = 0; i < SBS; ++i) { ++ simd uyv; ++ uyv.select(0) = ybytes.template select(i * QK / 2) & (uint8_t)0xF; ++ uyv.select(QK / 2) = ybytes.template select(i * QK / 2) >> (uint8_t)4; ++ yvs.template select(i * QK) = (uyv.bit_cast_view() - (int8_t)8) * scales[i]; ++ } ++ return yvs; ++} ++ ++template<> ++ESIMD_INLINE auto load_qblocks(const uint8_t * weight, const uint8_t * scale) { ++ constexpr int BLOCK_SIZE = BLOCK_SIZES[GGML_TYPE_Q4_0_WOQ]; ++ simd ybytes = block_load(weight); ++ const simd scales = block_load((const fp16 *)scale); ++ ++ simd yvs; ++ #pragma unroll ++ for (int i = 0; i < SBS; ++i) { ++ simd uyv; ++ uyv.select(0) = ybytes.template select(i * QK / 2) & (uint8_t)0xF; ++ uyv.select(1) = ybytes.template select(i * QK / 2) >> (uint8_t)4; ++ yvs.template select(i * QK) = (uyv.bit_cast_view() - (int8_t)8) * scales[i]; ++ } ++ return yvs; ++} ++ ++ ++template<> ++ESIMD_INLINE auto load_qblocks(const uint8_t * weight, const uint8_t * scale) { ++ constexpr int BLOCK_SIZE = BLOCK_SIZES[GGML_TYPE_FP8E5]; ++ simd ybytes = block_load(weight); ++ ++ simd yvs; ++ yvs.template bit_cast_view().template select(0) = 0x80; ++ yvs.template bit_cast_view().template select(1) = ybytes; ++ return yvs; ++} ++ ++ ++// C++ doesn't support function template partial specialization, so write a new version for SBS=1 ++template ++ESIMD_INLINE auto load_qblock(const uint8_t * weight, const uint8_t * scale); ++ ++template<> ++ESIMD_INLINE auto load_qblock(const uint8_t * weight, const uint8_t * scale) { ++ constexpr int BLOCK_SIZE = BLOCK_SIZES[GGML_TYPE_Q4_0]; ++ simd ybytes = block_load(weight); ++ fp16 scales = *(const fp16 *)scale; ++ ++ simd uyv; ++ uyv.select(0) = ybytes & (uint8_t)0xF; ++ uyv.select(QK / 2) = ybytes >> (uint8_t)4; ++ simd yv = (uyv.bit_cast_view() - (int8_t)8) * scales; ++ ++ return yv; ++} ++ ++template<> ++ESIMD_INLINE auto load_qblock(const uint8_t * weight, const uint8_t * scale) { ++ constexpr int BLOCK_SIZE = BLOCK_SIZES[GGML_TYPE_Q4_0_WOQ]; ++ simd ybytes = block_load(weight); ++ fp16 scales = *(const fp16 *)scale; ++ ++ simd uyv; ++ uyv.select(0) = ybytes & (uint8_t)0xF; ++ uyv.select(1) = ybytes >> (uint8_t)4; ++ simd yv = (uyv.bit_cast_view() - (int8_t)8) * scales; ++ ++ return yv; ++} ++ ++ ++template<> ++ESIMD_INLINE auto load_qblock(const uint8_t * weight, const uint8_t * scale) { ++ constexpr int BLOCK_SIZE = BLOCK_SIZES[GGML_TYPE_FP8E5]; ++ simd ybytes = block_load(weight); ++ ++ simd yvs; ++ yvs.template bit_cast_view().template select(0) = 0x80; ++ yvs.template bit_cast_view().template select(1) = ybytes; ++ return yvs; ++} diff --git a/csrc/xpu/cache_ops_xpu.cpp b/csrc/xpu/cache_ops_xpu.cpp new file mode 100644 index 000000000..a3451c0e7 @@ -7466,7 +4617,7 @@ index 
000000000..a3451c0e7 +} diff --git a/csrc/xpu/cache_ops_xpu_fp8.cpp b/csrc/xpu/cache_ops_xpu_fp8.cpp new file mode 100644 -index 000000000..cbfb7eea1 +index 000000000..e4a0001fe --- /dev/null +++ b/csrc/xpu/cache_ops_xpu_fp8.cpp @@ -0,0 +1,170 @@ @@ -7497,12 +4648,12 @@ index 000000000..cbfb7eea1 + uint8_t * __restrict__ value_cache, // [num_blocks, num_kv_heads, + // block_size, head_size] + const int64_t* __restrict__ slot_mapping, // [num_tokens] -+ const int key_stride, const int value_stride, ++ const int key_stride, const int value_stride, + const int key_head_stride, const int value_head_stride, + const int num_heads, + const int head_size, const int block_size, const int x, + const sycl::nd_item<3>& item_ct1) { -+ ++ + // New Implementation // + const size_t token_idx = item_ct1.get_global_id(0); + const size_t head_idx = item_ct1.get_global_id(1); @@ -7517,11 +4668,11 @@ index 000000000..cbfb7eea1 + + const scalar_t * value_head = value + token_idx * value_stride + head_idx * value_head_stride; + -+ uint8_t * key_output_head = key_cache + block_idx * num_heads * head_size * block_size + ++ uint8_t * key_output_head = key_cache + block_idx * num_heads * head_size * block_size + + head_idx * head_size * block_size + block_offset * head_size; -+ uint8_t * value_output_head = value_cache + block_idx * num_heads * head_size * block_size + ++ uint8_t * value_output_head = value_cache + block_idx * num_heads * head_size * block_size + + head_idx * head_size * block_size + block_offset * head_size; -+ ++ + simd key_row = block_load(key_head); + simd key_result = quantize_key_row(key_row); + block_store(key_output_head, key_result); @@ -7537,7 +4688,7 @@ index 000000000..cbfb7eea1 + const scalar_t* __restrict__ key, const scalar_t* __restrict__ value, + uint8_t* __restrict__ key_cache, uint8_t* __restrict__ value_cache, + const int64_t* __restrict__ slot_mapping, const int num_tokens, -+ const int key_stride, const int value_stride, ++ const int key_stride, const int value_stride, + const int key_head_stride, const int value_head_stride, + const int num_heads, + const int head_size, const int block_size, const int x) { @@ -7640,6 +4791,324 @@ index 000000000..cbfb7eea1 + + + +diff --git a/csrc/xpu/common.h b/csrc/xpu/common.h +new file mode 100644 +index 000000000..17d6ef643 +--- /dev/null ++++ b/csrc/xpu/common.h +@@ -0,0 +1,312 @@ ++#pragma once ++ ++#include ++#include ++ ++typedef union half_t { ++ uint16_t u; ++ sycl::half f; ++} __half_t; ++ ++typedef union ufloat32 { ++ unsigned u; ++ float f; ++} __float_t; ++ ++#define QK4_0 64 ++#define QR4_0 2 ++#define QK4_1 64 ++#define QR4_1 2 ++#define QK5_0 64 ++#define QR5_0 2 ++#define QK5_1 64 ++#define QR5_1 2 ++#define QK8_0 64 ++#define QR8_0 1 ++#define QK8_1 32 ++#define QR8_1 1 ++#define QI8_1 (QK8_1 / (4 * QR8_1)) // 8 ++#define QKFP8 64 ++#define QRFP8 1 ++#define QKFP6 64 ++// for iq2 quantization ++#define WARP_SIZE 32 ++#define QK_K 256 ++#define QK4_K 32 ++#define QR4_K 2 ++#define QK6_K 16 ++#define QKFP6_K 16 ++#define QR2_XXS 8 ++#define QI2_XXS (QK_K / (4*QR2_XXS)) // 8 ++#define QR2_XS 8 ++#define QI2_XS (QK_K / (4*QR2_XS)) // 8 ++#define QR2_K 4 ++#define QI2_K (QK_K / (4*QR2_K)) // 16 ++#define QR1_S 8 ++#define QI1_S (QK_K / (4*QR1_S)) // 8 ++ ++typedef struct { ++ sycl::half d; // delta ++ uint8_t qs[QK4_0 / 2]; // nibbles / quants ++} block_q4_0; ++ ++typedef struct { ++ uint8_t qs[QK4_0 / 2]; // nibbles / quants ++} block_q4_0_qs; ++ ++typedef struct { ++ uint8_t qs[QK4_1 / 2]; // nibbles / quants ++} 
block_q4_1_qs; ++ ++typedef struct { ++ sycl::half d; // delta ++ sycl::half m; // min ++ uint8_t qs[QK4_1 / 2]; // nibbles / quants ++} block_q4_1; ++ ++typedef struct { ++ sycl::half d; ++ uint8_t qh[8]; ++ uint8_t qs[QK5_0 / 2]; ++} block_q5_0; ++ ++typedef struct { ++ sycl::half d; // delta ++ sycl::half m; // min ++ uint8_t qh[8]; // 5-th bit of quants ++ uint8_t qs[QK5_1 / 2]; // nibbles / quants ++} block_q5_1; ++ ++typedef struct { ++ sycl::half d; // delta ++ uint8_t qh[8]; // 3-th bit of quants ++ uint8_t qs[QK4_0 / 4]; // nibbles / quants ++} block_nf3; ++ ++typedef struct { ++ uint8_t qh[8]; // 3-th bit of quants ++ uint8_t qs[QK4_0 / 4]; // nibbles / quants ++} block_nf3_qs; ++ ++typedef struct { ++ float d; // delta ++ int8_t qs[QK8_0]; // quants ++} block_q8_0; ++ ++typedef struct { ++ int8_t qs[QK8_0]; // quants ++} block_q8_0_qs; ++ ++typedef struct { ++ sycl::half d; ++ sycl::half sum; ++ int8_t qs[QK8_1]; // quants ++} block_q8_1; ++ ++typedef struct { ++ uint8_t qs[QKFP8]; ++} block_fp8_qs; ++ ++typedef struct { ++ float d; ++ uint8_t qs[QKFP8]; ++} block_fp8; ++ ++typedef struct { ++ sycl::half d; ++ uint16_t qs[QK_K/8]; // 32 ++} block_iq2_xxs; ++ ++typedef struct { ++ sycl::half d; ++ uint16_t qs[QK_K/8]; // 32 ++ uint8_t scales[QK_K/32]; // 8 ++} block_iq2_xs; ++ ++typedef struct { ++ uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits ++ uint8_t qs[QK_K/4]; // quants ++ sycl::half d; // super-block scale for quantized scales ++ sycl::half min; // super-block min for quantized mins ++} block_q2_K; ++ ++typedef struct { ++ sycl::half d; // super-block scale for quantized scales ++ sycl::half dmin; // super-block scale for quantized mins ++ uint8_t scales[16]; // scales and mins, quantized with 8 bits ++ uint8_t qs[QK_K/2]; // 4--bit quants ++} block_q4_K; ++ ++typedef struct { ++ uint8_t qs[QK_K/2]; // 4-bit quants ++} block_q4_K_qs; ++ ++typedef struct { ++ uint8_t qs[QK4_K/2]; // 4-bit quants ++} block_q4_K_qs_block; ++ ++typedef struct { ++ uint8_t scales[16]; // scales and mins, quantized with 8 bits ++} block_q4_K_scales; ++ ++typedef struct { ++ sycl::half d; // super-block scale for quantized scales ++ sycl::half dmin; // super-block scale for quantized mins ++ uint8_t scales[12]; // scales and mins, quantized with 6 bits ++ uint8_t qh[QK_K/8]; // quants, high bit ++ uint8_t qs[QK_K/2]; // quants, low 4 bits ++} block_q5_K; ++ ++typedef struct { ++ uint8_t ql[QK_K/2]; // quants, lower 4 bits ++ uint8_t qh[QK_K/4]; // quants, upper 2 bits ++ int8_t scales[QK_K/16]; // scales ++ sycl::half d; // delta ++} block_q6_K; ++ ++typedef struct { ++ uint32_t qh[QK_K/16]; // quants, upper 2 bits ++} block_q6_K_qh; ++ ++typedef struct { ++ uint32_t ql[QK_K/8]; // quants, lower 4 bits ++} block_q6_K_ql; ++ ++typedef struct { ++ int8_t scales[QK_K/16]; // scales, quantized with 8 bits ++} block_q6_K_scales; ++ ++typedef struct { ++ uint8_t ql[QK_K/2]; // quants, lower 4 bits ++ uint8_t qh[QK_K/4]; // quants, upper 2 bits ++ int8_t scales[QK_K/16]; // scales, quantized with 8 bits ++ sycl::half d; // super-block scale ++} block_fp6_K; ++static_assert(sizeof(block_fp6_K) == sizeof(sycl::half) + QK_K / 16 + 3*QK_K/4, "wrong fp6_K block size/padding"); ++ ++typedef struct { ++ uint32_t ql[QK_K/8]; // quants, lower 4 bits ++} block_fp6_k_ql; ++ ++typedef struct { ++ uint32_t qh[QK_K/16]; // quants, upper 2 bits ++} block_fp6_k_qh; ++ ++typedef struct { ++ int8_t scales[QK_K/16]; // scales, quantized with 8 bits, 16 ++} block_fp6_k_scales; ++ ++typedef struct { ++ 
uint32_t ql[QKFP6_K/8]; // upper 2 bits, 2 ++} block_base_fp6_k_ql; ++ ++typedef struct { ++ uint32_t qh[QKFP6_K/16]; // upper 2 bits, 1 ++} block_base_fp6_k_qh; ++ ++#define NGRID_IQ1S 2048 ++#define IQ1S_DELTA 0.125f ++#define IQ1M_DELTA 0.125f ++ ++typedef struct { ++ sycl::half d; ++ uint8_t qs[QK_K/8]; ++ uint16_t qh[QK_K/32]; ++} block_iq1_s; ++ ++// 1.8125 bpw ++typedef struct { ++ uint8_t qs[QK_K/8]; // grid index, low 8 bits ++ uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8) ++ uint8_t scales[QK_K/32]; // 4-bit block scales ++} block_iq1_m; ++ ++typedef struct { ++ uint8_t ql[QKFP6/2]; // lower 4 bits, 32 ++ uint8_t qh[QKFP6/4]; // upper 2 bits, 16 ++ sycl::half d; // delta ++} block_fp6; ++ ++typedef struct { ++ uint32_t qh[QKFP6/16]; // upper 2 bits, 4 ++} block_fp6_32_qh; ++ ++typedef struct { ++ uint32_t ql[QKFP6/8]; // lower 4 bits, 8 ++} block_fp6_32_ql; ++ ++enum ggml_type { ++ GGML_TYPE_Q4_0 = 2, ++ GGML_TYPE_Q4_1 = 3, ++ GGML_TYPE_Q5_0 = 6, ++ GGML_TYPE_Q5_1 = 7, ++ GGML_TYPE_Q8_0 = 8, ++ GGML_TYPE_Q8_1 = 9, ++ GGML_TYPE_NF4 = 10, ++ GGML_TYPE_NF3 = 11, ++ GGML_TYPE_FP8E4 = 15, ++ GGML_TYPE_FP4 = 16, ++ GGML_TYPE_FP8E5 = 19, ++ GGML_TYPE_IQ2_XXS = 21, ++ GGML_TYPE_IQ2_XS = 22, ++ GGML_TYPE_Q2_K = 23, ++ GGML_TYPE_IQ1_S = 24, ++ GGML_TYPE_IQ1_M = 25, ++ GGML_TYPE_Q6_K = 26, ++ GGML_TYPE_Q4_K = 27, ++ GGML_TYPE_Q5_K = 28, ++ GGML_TYPE_FP6 = 29, ++ GGML_TYPE_FP6_K = 30, ++ GGML_TYPE_Q4_0_WOQ = 34, ++ GGML_TYPE_COUNT ++}; ++ ++static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { ++ [GGML_TYPE_Q4_0] = QK4_0, ++ [GGML_TYPE_Q4_1] = QK4_1, ++ [GGML_TYPE_Q5_0] = QK5_0, ++ [GGML_TYPE_Q5_1] = QK5_1, ++ [GGML_TYPE_NF4] = QK4_0, ++ [GGML_TYPE_NF3] = QK4_0, ++ [GGML_TYPE_Q8_0] = QK8_0, ++ [GGML_TYPE_Q8_1] = QK8_1, ++ [GGML_TYPE_FP8E4] = QKFP8, ++ [GGML_TYPE_FP4] = QK4_0, ++ [GGML_TYPE_FP6] = QKFP6, ++ [GGML_TYPE_FP8E5] = QKFP8, ++ [GGML_TYPE_IQ2_XXS] = QK_K, ++ [GGML_TYPE_IQ2_XS] = QK_K, ++ [GGML_TYPE_Q2_K] = QK_K, ++ [GGML_TYPE_IQ1_S] = QK_K, ++ [GGML_TYPE_IQ1_M] = QK_K, ++ [GGML_TYPE_Q6_K] = QK_K, ++ [GGML_TYPE_Q4_K] = QK_K, ++ [GGML_TYPE_Q5_K] = QK_K, ++ [GGML_TYPE_FP6_K] = QK_K, ++ [GGML_TYPE_Q4_0_WOQ] = QK4_0, ++}; ++ ++static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { ++ [GGML_TYPE_Q4_0] = sizeof(block_q4_0), ++ [GGML_TYPE_Q4_1] = sizeof(block_q4_1), ++ [GGML_TYPE_Q5_0] = sizeof(block_q5_1), ++ [GGML_TYPE_Q5_1] = sizeof(block_q5_1), ++ [GGML_TYPE_NF4] = sizeof(block_q4_0), ++ [GGML_TYPE_NF3] = sizeof(block_nf3), ++ [GGML_TYPE_Q8_0] = sizeof(block_q8_0), ++ [GGML_TYPE_Q8_1] = sizeof(block_q8_1), ++ [GGML_TYPE_FP8E4]= sizeof(block_fp8), ++ [GGML_TYPE_FP4] = sizeof(block_q4_0), ++ [GGML_TYPE_FP6] = sizeof(block_fp6), ++ [GGML_TYPE_FP8E5] = sizeof(block_fp8), ++ [GGML_TYPE_IQ2_XXS] = sizeof(block_iq2_xxs), ++ [GGML_TYPE_IQ2_XS] = sizeof(block_iq2_xs), ++ [GGML_TYPE_Q2_K] = sizeof(block_q2_K), ++ [GGML_TYPE_IQ1_S] = sizeof(block_iq1_s), ++ [GGML_TYPE_IQ1_M] = sizeof(block_iq1_m), ++ [GGML_TYPE_Q6_K] = sizeof(block_q6_K), ++ [GGML_TYPE_Q4_K] = sizeof(block_q4_K), ++ [GGML_TYPE_Q5_K] = sizeof(block_q5_K), ++ [GGML_TYPE_FP6_K] = sizeof(block_fp6_K), ++ [GGML_TYPE_Q4_0_WOQ] = sizeof(block_q4_0), ++}; diff --git a/csrc/xpu/dequantize.h b/csrc/xpu/dequantize.h new file mode 100644 index 000000000..9a967312e @@ -8461,6 +5930,281 @@ index 000000000..7b70e4efc + +} // namespace vllm \ No newline at end of file +diff --git a/csrc/xpu/fused_moe.cpp b/csrc/xpu/fused_moe.cpp +new file mode 100644 +index 000000000..3a39d0e13 +--- /dev/null ++++ 
b/csrc/xpu/fused_moe.cpp +@@ -0,0 +1,269 @@ ++#include "utils.h" ++#include "base.hpp" ++ ++using ST = at::ScalarType; ++ ++#include ++#include "xpu_types.h" ++#include ++ ++template ++__inline__ T silu_xpu(const T& x) { ++ // x * sigmoid(x) ++ return (T)(((float)x) / (1.0f + sycl::exp((float)-x))); ++} ++ ++template ++void silu_and_mul_kernel( ++ scalar_t* __restrict__ out, // [..., d] ++ const scalar_t* __restrict__ input, // [..., 2, d] ++ const int d, ++ const sycl::nd_item<3>& item_ct1) { ++ const int64_t token_idx = item_ct1.get_group(2); ++ for (int64_t idx = item_ct1.get_local_id(2); idx < d; ++ idx += item_ct1.get_local_range(2)) { ++ const scalar_t x = input[token_idx * 2 * d + idx]; ++ const scalar_t y = input[token_idx * 2 * d + d + idx]; ++ out[token_idx * d + idx] = silu_xpu(x) * y; ++ } ++} ++ ++template ++void call_silu_and_mul_kernel( ++ int num_tokens, ++ int d, ++ const scalar_t* __restrict__ input, ++ scalar_t* __restrict__ output) { ++ using sycl_t = vllm::xpu::SyclTypeTrait::Type; ++ sycl::range<3> grid(1, 1, num_tokens); ++ sycl::range<3> block(1, 1, std::min(d, 1024)); ++ auto& queue = vllm::xpu::vllmGetQueue(); ++ queue.submit([&](sycl::handler& cgh) { ++ cgh.parallel_for( ++ sycl::nd_range<3>(grid * block, block), [=](sycl::nd_item<3> item_ct1) { ++ silu_and_mul_kernel( ++ (sycl_t*)output, (const sycl_t*)input, d, item_ct1); ++ }); ++ }); ++} ++ ++void _silu_and_mul(torch::Tensor& out, torch::Tensor& input) { ++ int num_tokens = input.numel() / input.size(-1); ++ int d = input.size(-1) / 2; ++ ++ VLLM_XPU_DISPATCH_FLOATING_TYPES( ++ input.scalar_type(), "call_silu_and_mul_kernel", [&] { ++ call_silu_and_mul_kernel( ++ num_tokens, ++ d, ++ input.data_ptr(), ++ out.data_ptr()); ++ }); ++} ++ ++template ++static void moe_forward_kernel( ++ const void* input_ptr, ++ const int64_t* indexs, ++ const uint64_t* qweights, ++ void * output_ptr, ++ const int num_tokens, ++ const int state_size, ++ const int output_size, ++ at::Device device ++) { ++ static_assert(ES == 8 || ES == 16 || ES == 32); ++ assert(output_size % VS == 0); ++ ++ const int nb = state_size / QK; ++ const int nsb = nb / SBS; ++ ++ constexpr int BLOCK_SIZE = BLOCK_SIZES[QTYPE]; ++ constexpr int SCALE_SIZE = SCALE_SIZES[QTYPE]; ++ ++ sycl::range<2> global_size(num_tokens, output_size / VS * GS); ++ sycl::range<2> local_size(1, GS); ++ ++ auto cgf = [&](sycl::handler& handle) { ++ handle.parallel_for( ++ sycl::nd_range<2>(global_size, local_size), ++ [=](sycl::nd_item<2> item) SYCL_ESIMD_KERNEL { ++ slm_init(); ++ ++ const int eid = item.get_global_id(0); ++ const int tid = item.get_local_id(1); ++ const int vid = item.get_group(1) * VS; ++ ++ if (indexs[eid] >= 0) { ++ const uint8_t* weight = (const uint8_t *)(qweights[indexs[eid]]); ++ const uint8_t* scales = weight + (int64_t)output_size * nb * BLOCK_SIZE; ++ const IT* input = static_cast(input_ptr) + eid * state_size; ++ IT* output = static_cast(output_ptr) + eid * output_size; ++ ++ const uint8_t * weight_base = weight + nb * BLOCK_SIZE * vid; ++ const uint8_t * scale_base = scales + nb * SCALE_SIZE * vid; ++ ++ simd accvs{}; ++ ++ for (int s = tid; s < nsb; s += GS) { ++ simd xvs = block_load(input + s * SBS * QK); ++ ++ #pragma unroll ++ for (int v = 0; v < VS; ++v) { ++ simd yvs = load_qblocks( ++ weight_base + v * nb * BLOCK_SIZE + s * SBS * BLOCK_SIZE, ++ scale_base + v * nb * SCALE_SIZE + s * SBS * SCALE_SIZE ++ ); ++ ++ #pragma unroll ++ for (int i = 0; i < SBS * QK; i += ES) { ++ accvs.template select(v * ES) += ++ xvs.template select(i) * ++ 
yvs.template select(i); ++ } ++ } ++ } ++ ++ for (int b = nsb * SBS + tid; b < nb; b += GS) { ++ simd xv = block_load(input + b * QK); ++ ++ #pragma unroll ++ for (int v = 0; v < VS; ++v) { ++ simd yv = load_qblock( ++ weight_base + v * nb * BLOCK_SIZE + b * BLOCK_SIZE, ++ scale_base + v * nb * SCALE_SIZE + b * SCALE_SIZE ++ ); ++ ++ #pragma unroll ++ for (int i = 0; i < QK; i += ES) { ++ accvs.template select(v * ES) += ++ xv.template select(i) * ++ yv.template select(i); ++ } ++ } ++ } ++ ++ simd accs; ++ #pragma unroll ++ for(int v = 0; v < VS; ++v) { ++ accs[v] = sycl::ext::intel::esimd::detail::sum( ++ accvs.template select(v * ES) ++ ); ++ } ++ ++ slm_block_store(tid * VS * sizeof(float), accs); ++ ++ barrier(); ++ ++ if (tid == 0) { ++ #pragma unroll ++ for (int i = 1; i < GS; ++i) { ++ accs += slm_block_load(i * VS * sizeof(float)); ++ } ++ ++ block_store(output + vid, accs); ++ } ++ } ++ ++ ++ } ++ ); ++ }; ++ ++ utils::submit_kernel(cgf, device, "moe forward down kernel"); ++} ++ ++ ++template ++static auto dispatch_moe_forward(ST scalar_t) { ++ switch (scalar_t) { ++ case ST::Float: return std::make_tuple(moe_forward_kernel); ++ case ST::Half: return std::make_tuple(moe_forward_kernel); ++ default: throw std::runtime_error("unsupported dtype, only fp32 and fp16 are supported"); ++ } ++} ++ ++ ++torch::Tensor moe_forward( ++ torch::Tensor input, ++ torch::Tensor indexs, ++ torch::Tensor qweights_attr, ++ int64_t state_size, ++ int64_t output_size, ++ int64_t qtype ++) { ++ auto [func] = [&] () { ++ switch (qtype) { ++ case GGML_TYPE_Q4_0: ++ return dispatch_moe_forward(input.scalar_type()); ++ case GGML_TYPE_Q4_0_WOQ: ++ return dispatch_moe_forward(input.scalar_type()); ++ case GGML_TYPE_FP8E5: ++ return dispatch_moe_forward(input.scalar_type()); ++ default: throw std::runtime_error("unsupported qtype: " + std::to_string(qtype)); ++ } ++ } (); ++ ++ int64_t num_tokens = indexs.numel(); ++ ++ torch::Tensor output = torch::zeros({num_tokens, output_size}, ++ torch::device(input.device()).dtype(input.dtype())); ++ ++ func( ++ input.data_ptr(), indexs.data_ptr(), ++ qweights_attr.data_ptr(), output.data_ptr(), ++ num_tokens, state_size, output_size, input.device() ++ ); ++ ++ return output; ++} ++ ++ ++torch::Tensor fused_moe_forward( ++ torch::Tensor input, ++ torch::Tensor indexs, ++ torch::Tensor qweights1_attr, ++ torch::Tensor qweights2_attr, ++ int64_t hidden_size, ++ int64_t intermediate_size, ++ int64_t qtype ++) { ++ auto [gmm_func] = [&] () { ++ switch (qtype) { ++ case GGML_TYPE_Q4_0: ++ return dispatch_moe_forward(input.scalar_type()); ++ case GGML_TYPE_Q4_0_WOQ: ++ return dispatch_moe_forward(input.scalar_type()); ++ case GGML_TYPE_FP8E5: ++ return dispatch_moe_forward(input.scalar_type()); ++ default: throw std::runtime_error("unsupported qtype: " + std::to_string(qtype)); ++ } ++ } (); ++ ++ int64_t num_tokens = indexs.numel(); ++ ++ torch::Tensor w1_output = torch::zeros({num_tokens, intermediate_size * 2}, ++ torch::device(input.device()).dtype(input.dtype())); ++ ++ torch::Tensor tmp = torch::zeros({num_tokens, intermediate_size}, ++ torch::device(input.device()).dtype(input.dtype())); ++ ++ torch::Tensor w2_output = torch::zeros({num_tokens, hidden_size}, ++ torch::device(input.device()).dtype(input.dtype())); ++ ++ gmm_func( ++ input.data_ptr(), indexs.data_ptr(), ++ qweights1_attr.data_ptr(), w1_output.data_ptr(), ++ num_tokens, hidden_size, intermediate_size * 2, input.device() ++ ); ++ ++ _silu_and_mul(tmp, w1_output); ++ ++ gmm_func( ++ tmp.data_ptr(), 
indexs.data_ptr(), ++ qweights2_attr.data_ptr(), w2_output.data_ptr(), ++ num_tokens, intermediate_size, hidden_size, input.device() ++ ); ++ ++ return w2_output; ++} diff --git a/csrc/xpu/gemm_kernels_xpu.cpp b/csrc/xpu/gemm_kernels_xpu.cpp new file mode 100644 index 000000000..d96aa5880 @@ -9212,10 +6956,10 @@ index 000000000..3232cacbc \ No newline at end of file diff --git a/csrc/xpu/pybind.cpp b/csrc/xpu/pybind.cpp new file mode 100644 -index 000000000..55b29cb1e +index 000000000..bf9e94612 --- /dev/null +++ b/csrc/xpu/pybind.cpp -@@ -0,0 +1,101 @@ +@@ -0,0 +1,112 @@ +// #include "cache.h" +#include "xpu_ops.h" +#include @@ -9316,6 +7060,17 @@ index 000000000..55b29cb1e + &awq_dequantize, + "dequant method for awq"); + ++ ++ ops.def( ++ "moe_forward", ++ &moe_forward, ++ "PagedAttention GQA."); ++ ++ ops.def( ++ "fused_moe_forward", ++ &fused_moe_forward, ++ "PagedAttention GQA."); ++ +} diff --git a/csrc/xpu/reduction_utils.h b/csrc/xpu/reduction_utils.h new file mode 100644 @@ -9323,64 +7078,64 @@ index 000000000..93c64d759 --- /dev/null +++ b/csrc/xpu/reduction_utils.h @@ -0,0 +1,56 @@ -+/* -+ * Copyright (c) 2023, The vLLM team. -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+ */ -+#pragma once -+ -+#include -+#include -+#include -+ -+namespace vllm { -+ -+template -+__inline__ T warpReduceSum(T val, const sycl::nd_item<3>& item_ct1) { -+#pragma unroll -+ for (int mask = 16; mask > 0; mask >>= 1) -+ val += dpct::permute_sub_group_by_xor( -+ item_ct1.get_sub_group(), val, mask, 32); -+ return val; -+} -+ -+/* Calculate the sum of all elements in a block */ -+template -+__inline__ T blockReduceSum(T val, const sycl::nd_item<3> &item_ct1, T *shared) { -+ -+ int lane = item_ct1.get_local_id(2) & 0x1f; -+ int wid = item_ct1.get_local_id(2) >> 5; -+ -+ val = warpReduceSum(val, item_ct1); -+ -+ if (lane == 0) { -+ shared[wid] = val; -+ } -+ item_ct1.barrier(); -+ -+ // Modify from blockDim.x << 5 to blockDim.x / 32. to prevent -+ // blockDim.x is not divided by 32 -+ val = (item_ct1.get_local_id(2) < (item_ct1.get_local_range(2) / 32.f)) -+ ? shared[lane] -+ : (T)(0.0f); -+ val = warpReduceSum(val, item_ct1); -+ return val; -+} -+ -+} // namespace vllm -\ No newline at end of file -diff --git a/csrc/xpu/utils.cpp b/csrc/xpu/utils.cpp ++/* ++ * Copyright (c) 2023, The vLLM team. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++#pragma once ++ ++#include ++#include ++#include ++ ++namespace vllm { ++ ++template ++__inline__ T warpReduceSum(T val, const sycl::nd_item<3>& item_ct1) { ++#pragma unroll ++ for (int mask = 16; mask > 0; mask >>= 1) ++ val += dpct::permute_sub_group_by_xor( ++ item_ct1.get_sub_group(), val, mask, 32); ++ return val; ++} ++ ++/* Calculate the sum of all elements in a block */ ++template ++__inline__ T blockReduceSum(T val, const sycl::nd_item<3> &item_ct1, T *shared) { ++ ++ int lane = item_ct1.get_local_id(2) & 0x1f; ++ int wid = item_ct1.get_local_id(2) >> 5; ++ ++ val = warpReduceSum(val, item_ct1); ++ ++ if (lane == 0) { ++ shared[wid] = val; ++ } ++ item_ct1.barrier(); ++ ++ // Modify from blockDim.x << 5 to blockDim.x / 32. to prevent ++ // blockDim.x is not divided by 32 ++ val = (item_ct1.get_local_id(2) < (item_ct1.get_local_range(2) / 32.f)) ++ ? shared[lane] ++ : (T)(0.0f); ++ val = warpReduceSum(val, item_ct1); ++ return val; ++} ++ ++} // namespace vllm +\ No newline at end of file +diff --git a/csrc/xpu/utils.cpp b/csrc/xpu/utils.cpp new file mode 100644 index 000000000..5f613af55 --- /dev/null @@ -9510,10 +7265,11 @@ index 000000000..fa3ead51c +} diff --git a/csrc/xpu/xpu_ops.h b/csrc/xpu/xpu_ops.h new file mode 100644 -index 000000000..e78cc59a1 +index 000000000..603d4f23d --- /dev/null +++ b/csrc/xpu/xpu_ops.h -@@ -0,0 +1,174 @@ +@@ -0,0 +1,194 @@ ++#pragma once +#include + +void rotary_embedding(torch::Tensor &positions, torch::Tensor &query, @@ -9682,13 +7438,31 @@ index 000000000..e78cc59a1 + int max_seq_len +); + ++ ++torch::Tensor moe_forward( ++ torch::Tensor input, ++ torch::Tensor indexs, ++ torch::Tensor qweights_attr, ++ int64_t state_size, ++ int64_t output_size, ++ int64_t qtype ++); ++ ++torch::Tensor fused_moe_forward( ++ torch::Tensor input, ++ torch::Tensor indexs, ++ torch::Tensor qweights1_attr, ++ torch::Tensor qweights2_attr, ++ int64_t hidden_size, ++ int64_t intermediate_size, ++ int64_t qtype ++); +void paged_attention_gqa_fp8(torch::Tensor output, torch::Tensor query, + torch::Tensor key_cache, torch::Tensor value_cache, + int64_t bsz, int64_t num_heads, int64_t num_kv_heads, + float scale, torch::Tensor& block_tables, + torch::Tensor& context_lens, int block_size, + int64_t head_dim, int max_seq_len); -\ No newline at end of file diff --git a/csrc/xpu/xpu_types.h b/csrc/xpu/xpu_types.h new file mode 100644 index 000000000..23f5b805c @@ -9721,3486 +7495,115 @@ index 000000000..23f5b805c + +#endif \ No newline at end of file -diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt -index 4859c8ac0..25a700033 100644 ---- a/docs/requirements-docs.txt -+++ b/docs/requirements-docs.txt -@@ -19,3 +19,4 @@ openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entr - fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args - partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args - requests -+zmq -diff --git a/docs/source/conf.py b/docs/source/conf.py -index 1fe047463..71394c530 100644 ---- a/docs/source/conf.py -+++ b/docs/source/conf.py -@@ -191,6 +191,7 @@ def linkcode_resolve(domain, info): - - # Mock out external dependencies here, otherwise the autodoc pages may be blank. 
- autodoc_mock_imports = [ -+ "blake3", - "compressed_tensors", - "cpuinfo", - "cv2", -@@ -207,7 +208,7 @@ autodoc_mock_imports = [ - "tensorizer", - "pynvml", - "outlines", -- "xgrammar," -+ "xgrammar", - "librosa", - "soundfile", - "gguf", -diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md -index 6535414a7..7ffec8333 100644 ---- a/docs/source/contributing/dockerfile/dockerfile.md -+++ b/docs/source/contributing/dockerfile/dockerfile.md -@@ -11,11 +11,11 @@ Below is a visual representation of the multi-stage Dockerfile. The build graph - - The edges of the build graph represent: - --- FROM ... dependencies (with a solid line and a full arrow head) -+- `FROM ...` dependencies (with a solid line and a full arrow head) - --- COPY --from=... dependencies (with a dashed line and an empty arrow head) -+- `COPY --from=...` dependencies (with a dashed line and an empty arrow head) - --- RUN --mount=(.\*)from=... dependencies (with a dotted line and an empty diamond arrow head) -+- `RUN --mount=(.\*)from=...` dependencies (with a dotted line and an empty diamond arrow head) - - > ```{figure} ../../assets/dev/dockerfile-stages-dependency.png - > :align: center -diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md -index 9dac41cff..c960790f4 100644 ---- a/docs/source/contributing/overview.md -+++ b/docs/source/contributing/overview.md -@@ -34,7 +34,7 @@ pytest tests/ - ``` - - ```{note} --Currently, the repository does not pass the `mypy` tests. -+Currently, the repository is not fully checked by `mypy`. - ``` - - # Contribution Guidelines -diff --git a/docs/source/design/arch_overview.md b/docs/source/design/arch_overview.md -index 475a3e5fa..2f1280c04 100644 ---- a/docs/source/design/arch_overview.md -+++ b/docs/source/design/arch_overview.md -@@ -77,8 +77,7 @@ python -m vllm.entrypoints.openai.api_server --model - - That code can be found in . - --More details on the API server can be found in the {doc}`OpenAI Compatible --Server ` document. -+More details on the API server can be found in the [OpenAI-Compatible Server](#openai-compatible-server) document. - - ## LLM Engine - -diff --git a/docs/source/design/multimodal/multimodal_index.md b/docs/source/design/multimodal/multimodal_index.md -index 88af07afc..e4f2171e8 100644 ---- a/docs/source/design/multimodal/multimodal_index.md -+++ b/docs/source/design/multimodal/multimodal_index.md -@@ -45,39 +45,39 @@ adding_multimodal_plugin - ### Base Classes - - ```{eval-rst} --.. autodata:: vllm.multimodal.NestedTensors -+.. automodule:: vllm.multimodal.base -+ :members: -+ :show-inheritance: - ``` - --```{eval-rst} --.. autodata:: vllm.multimodal.BatchedTensorInputs --``` -+### Input Classes - - ```{eval-rst} --.. autoclass:: vllm.multimodal.MultiModalDataBuiltins -+.. automodule:: vllm.multimodal.inputs - :members: - :show-inheritance: - ``` - --```{eval-rst} --.. autodata:: vllm.multimodal.MultiModalDataDict --``` -+### Audio Classes - - ```{eval-rst} --.. autoclass:: vllm.multimodal.MultiModalKwargs -+.. automodule:: vllm.multimodal.audio - :members: - :show-inheritance: - ``` - -+### Image Classes -+ - ```{eval-rst} --.. autoclass:: vllm.multimodal.MultiModalPlugin -+.. automodule:: vllm.multimodal.image - :members: - :show-inheritance: - ``` - --### Image Classes -+### Video Classes - - ```{eval-rst} --.. automodule:: vllm.multimodal.image -+.. 
automodule:: vllm.multimodal.video - :members: - :show-inheritance: - ``` -diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md -index 34564413b..da87638e5 100644 ---- a/docs/source/design/multiprocessing.md -+++ b/docs/source/design/multiprocessing.md -@@ -2,7 +2,7 @@ - - ## Debugging - --Please see the [Debugging Tips](#debugging-python-multiprocessing) -+Please see the [Troubleshooting](#troubleshooting-python-multiprocessing) - page for information on known issues and how to solve them. - - ## Introduction -diff --git a/docs/source/design/plugin_system.md b/docs/source/design/plugin_system.md -index 79aff7575..225030885 100644 ---- a/docs/source/design/plugin_system.md -+++ b/docs/source/design/plugin_system.md -@@ -41,9 +41,11 @@ Every plugin has three parts: - 2. **Plugin name**: The name of the plugin. This is the value in the dictionary of the `entry_points` dictionary. In the example above, the plugin name is `register_dummy_model`. Plugins can be filtered by their names using the `VLLM_PLUGINS` environment variable. To load only a specific plugin, set `VLLM_PLUGINS` to the plugin name. - 3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is `vllm_add_dummy_model:register`, which refers to a function named `register` in the `vllm_add_dummy_model` module. - --## What Can Plugins Do? -+## Types of supported plugins - --Currently, the primary use case for plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model. In the future, the plugin system may be extended to support more features, such as swapping in custom implementations for certain classes in vLLM. -+- **General plugins** (with group name `vllm.general_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model inside the plugin function. -+ -+- **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported. - - ## Guidelines for Writing Plugins - -diff --git a/docs/source/usage/faq.md b/docs/source/getting_started/faq.md -similarity index 100% -rename from docs/source/usage/faq.md -rename to docs/source/getting_started/faq.md -diff --git a/docs/source/getting_started/arm-installation.md b/docs/source/getting_started/installation/cpu-arm.md -similarity index 88% -rename from docs/source/getting_started/arm-installation.md -rename to docs/source/getting_started/installation/cpu-arm.md -index de807e198..a46e2c010 100644 ---- a/docs/source/getting_started/arm-installation.md -+++ b/docs/source/getting_started/installation/cpu-arm.md -@@ -2,7 +2,7 @@ - - # Installation for ARM CPUs - --vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the x86 platform documentation covering: -+vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. 
This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering: - - - CPU backend inference capabilities - - Relevant runtime environment variables -@@ -20,7 +20,7 @@ Contents: - ## Requirements - - - **Operating System**: Linux or macOS --- **Compiler**: gcc/g++ >= 12.3.0 (optional, but recommended) -+- **Compiler**: `gcc/g++ >= 12.3.0` (optional, but recommended) - - **Instruction Set Architecture (ISA)**: NEON support is required - - (arm-backend-quick-start-dockerfile)= -diff --git a/docs/source/getting_started/cpu-installation.md b/docs/source/getting_started/installation/cpu-x86.md -similarity index 94% -rename from docs/source/getting_started/cpu-installation.md -rename to docs/source/getting_started/installation/cpu-x86.md -index b6f181ace..bbb2d1872 100644 ---- a/docs/source/getting_started/cpu-installation.md -+++ b/docs/source/getting_started/installation/cpu-x86.md -@@ -1,6 +1,6 @@ --(installation-cpu)= -+(installation-x86)= - --# Installation with CPU -+# Installation for x86 CPUs - - vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features: - -@@ -24,7 +24,7 @@ Table of contents: - ## Requirements - - - OS: Linux --- Compiler: gcc/g++>=12.3.0 (optional, recommended) -+- Compiler: `gcc/g++>=12.3.0` (optional, recommended) - - Instruction set architecture (ISA) requirement: AVX512 (optional, recommended) - - (cpu-backend-quick-start-dockerfile)= -@@ -69,7 +69,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install - - ```{note} - - AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. --- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building. -+- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building. - ``` - - (env-intro)= -@@ -151,4 +151,4 @@ $ python examples/offline_inference.py - $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp - ``` - -- - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](../serving/deploying_with_nginx.md) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md). -+ - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md). 
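[Editor's note] As a companion to the data-parallel serving note above, the sketch below shows the simplest possible request dispatcher. It is not part of the patch and not a real load balancer: it assumes two OpenAI-compatible vLLM servers are already running (for example, one per NUMA node) on the hypothetical ports 8000 and 8001, and simply alternates completion requests between them. A production setup would use Nginx, HAProxy, or Ray Serve as described above.

```python
# Minimal round-robin dispatch sketch across assumed data-parallel vLLM endpoints.
import itertools
import requests

ENDPOINTS = ["http://localhost:8000", "http://localhost:8001"]  # assumed ports
_next_endpoint = itertools.cycle(ENDPOINTS)

def complete(prompt: str, model: str = "meta-llama/Llama-2-7b-chat-hf") -> str:
    # Pick the next endpoint in round-robin order and issue a completion request
    # against the OpenAI-compatible /v1/completions route.
    base = next(_next_endpoint)
    resp = requests.post(
        f"{base}/v1/completions",
        json={"model": model, "prompt": prompt, "max_tokens": 64},
        timeout=300,
    )
    resp.raise_for_status()
    return resp.json()["choices"][0]["text"]

print(complete("Hello, my name is"))
```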
-diff --git a/docs/source/getting_started/installation.md b/docs/source/getting_started/installation/gpu-cuda.md -similarity index 99% -rename from docs/source/getting_started/installation.md -rename to docs/source/getting_started/installation/gpu-cuda.md -index 996fb346f..7ea10bb8b 100644 ---- a/docs/source/getting_started/installation.md -+++ b/docs/source/getting_started/installation/gpu-cuda.md -@@ -1,6 +1,6 @@ --(installation)= -+(installation-cuda)= - --# Installation -+# Installation for CUDA - - vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries. - -diff --git a/docs/source/getting_started/amd-installation.md b/docs/source/getting_started/installation/gpu-rocm.md -similarity index 99% -rename from docs/source/getting_started/amd-installation.md -rename to docs/source/getting_started/installation/gpu-rocm.md -index 6d01efbbf..796911d73 100644 ---- a/docs/source/getting_started/amd-installation.md -+++ b/docs/source/getting_started/installation/gpu-rocm.md -@@ -1,6 +1,6 @@ - (installation-rocm)= - --# Installation with ROCm -+# Installation for ROCm - - vLLM supports AMD GPUs with ROCm 6.2. - -diff --git a/docs/source/getting_started/gaudi-installation.md b/docs/source/getting_started/installation/hpu-gaudi.md -similarity index 96% -rename from docs/source/getting_started/gaudi-installation.md -rename to docs/source/getting_started/installation/hpu-gaudi.md -index acf42f210..94de169f5 100644 ---- a/docs/source/getting_started/gaudi-installation.md -+++ b/docs/source/getting_started/installation/hpu-gaudi.md -@@ -1,4 +1,6 @@ --# Installation with Intel® Gaudi® AI Accelerators -+(installation-gaudi)= -+ -+# Installation for Intel® Gaudi® - - This README provides instructions on running vLLM with Intel Gaudi devices. - -@@ -141,32 +143,33 @@ Gaudi2 devices. Configurations that are not listed may or may not work. - - Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag. - --```{eval-rst} --.. list-table:: vLLM execution modes -- :widths: 25 25 50 -- :header-rows: 1 -- -- * - ``PT_HPU_LAZY_MODE`` -- - ``enforce_eager`` -- - execution mode -- * - 0 -- - 0 -- - torch.compile -- * - 0 -- - 1 -- - PyTorch eager mode -- * - 1 -- - 0 -- - HPU Graphs -- * - 1 -- - 1 -- - PyTorch lazy mode -+```{list-table} vLLM execution modes -+:widths: 25 25 50 -+:header-rows: 1 -+ -+* - `PT_HPU_LAZY_MODE` -+ - `enforce_eager` -+ - execution mode -+* - 0 -+ - 0 -+ - torch.compile -+* - 0 -+ - 1 -+ - PyTorch eager mode -+* - 1 -+ - 0 -+ - HPU Graphs -+* - 1 -+ - 1 -+ - PyTorch lazy mode - ``` - - ```{warning} - In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. - ``` - -+(gaudi-bucketing-mechanism)= -+ - ### Bucketing mechanism - - Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. 
In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. -@@ -185,7 +188,7 @@ INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, ma - INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] - ``` - --`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` has special handling - `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes. -+`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` has special handling -- `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes. - - Example (with ramp-up) - -@@ -214,7 +217,7 @@ If a request exceeds maximum bucket size in any dimension, it will be processed - As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as `(4, 512)` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a `(2, 512)` bucket, or context length increases above 512 tokens, in which case it will become `(4, 640)` bucket. - - ```{note} --Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. -+Bucketing is transparent to a client -- padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. - ``` - - ### Warmup -@@ -235,7 +238,7 @@ INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size - INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB - ``` - --This example uses the same buckets as in *Bucketing mechanism* section. Each output line corresponds to execution of a single bucket. 
When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. -+This example uses the same buckets as in the [Bucketing Mechanism](#gaudi-bucketing-mechanism) section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. - - ```{tip} - Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. -diff --git a/docs/source/getting_started/installation/index.md b/docs/source/getting_started/installation/index.md -new file mode 100644 -index 000000000..83de1aff4 ---- /dev/null -+++ b/docs/source/getting_started/installation/index.md -@@ -0,0 +1,19 @@ -+(installation-index)= -+ -+# Installation -+ -+vLLM supports the following hardware platforms: -+ -+```{toctree} -+:maxdepth: 1 -+ -+gpu-cuda -+gpu-rocm -+cpu-x86 -+cpu-arm -+hpu-gaudi -+tpu -+xpu -+openvino -+neuron -+``` -diff --git a/docs/source/getting_started/neuron-installation.md b/docs/source/getting_started/installation/neuron.md -similarity index 95% -rename from docs/source/getting_started/neuron-installation.md -rename to docs/source/getting_started/installation/neuron.md -index d6de5760c..431f90537 100644 ---- a/docs/source/getting_started/neuron-installation.md -+++ b/docs/source/getting_started/installation/neuron.md -@@ -1,6 +1,6 @@ - (installation-neuron)= - --# Installation with Neuron -+# Installation for Neuron - - vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching. - Paged Attention and Chunked Prefill are currently in development and will be available soon. -@@ -26,7 +26,7 @@ Installation steps: - (build-from-source-neuron)= - - ```{note} --The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with vLLM >= 0.5.3. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. -+The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. 
- ``` - - ## Build from source -diff --git a/docs/source/getting_started/openvino-installation.md b/docs/source/getting_started/installation/openvino.md -similarity index 90% -rename from docs/source/getting_started/openvino-installation.md -rename to docs/source/getting_started/installation/openvino.md -index 8b43c0a90..60f95fd1c 100644 ---- a/docs/source/getting_started/openvino-installation.md -+++ b/docs/source/getting_started/installation/openvino.md -@@ -1,8 +1,8 @@ - (installation-openvino)= - --# Installation with OpenVINO -+# Installation for OpenVINO - --vLLM powered by OpenVINO supports all LLM models from {doc}`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). OpenVINO vLLM backend supports the following advanced vLLM features: -+vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). OpenVINO vLLM backend supports the following advanced vLLM features: - - - Prefix caching (`--enable-prefix-caching`) - - Chunked prefill (`--enable-chunked-prefill`) -diff --git a/docs/source/getting_started/tpu-installation.md b/docs/source/getting_started/installation/tpu.md -similarity index 79% -rename from docs/source/getting_started/tpu-installation.md -rename to docs/source/getting_started/installation/tpu.md -index f2a949e72..bc93c44fe 100644 ---- a/docs/source/getting_started/tpu-installation.md -+++ b/docs/source/getting_started/installation/tpu.md -@@ -1,6 +1,6 @@ - (installation-tpu)= - --# Installation with TPU -+# Installation for TPUs - - Tensor Processing Units (TPUs) are Google's custom-developed application-specific - integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs -@@ -68,33 +68,32 @@ gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \ - --service-account SERVICE_ACCOUNT - ``` - --```{eval-rst} --.. list-table:: Parameter descriptions -- :header-rows: 1 -- -- * - Parameter name -- - Description -- * - QUEUED_RESOURCE_ID -- - The user-assigned ID of the queued resource request. -- * - TPU_NAME -- - The user-assigned name of the TPU which is created when the queued -- resource request is allocated. -- * - PROJECT_ID -- - Your Google Cloud project -- * - ZONE -- - The GCP zone where you want to create your Cloud TPU. The value you use -- depends on the version of TPUs you are using. For more information, see -- `TPU regions and zones `_ -- * - ACCELERATOR_TYPE -- - The TPU version you want to use. Specify the TPU version, for example -- `v5litepod-4` specifies a v5e TPU with 4 cores. For more information, -- see `TPU versions `_. -- * - RUNTIME_VERSION -- - The TPU VM runtime version to use. For more information see `TPU VM images `_. -- * - SERVICE_ACCOUNT -- - The email address for your service account. You can find it in the IAM -- Cloud Console under *Service Accounts*. 
For example: -- `tpu-service-account@.iam.gserviceaccount.com` -+```{list-table} Parameter descriptions -+:header-rows: 1 -+ -+* - Parameter name -+ - Description -+* - QUEUED_RESOURCE_ID -+ - The user-assigned ID of the queued resource request. -+* - TPU_NAME -+ - The user-assigned name of the TPU which is created when the queued -+ resource request is allocated. -+* - PROJECT_ID -+ - Your Google Cloud project -+* - ZONE -+ - The GCP zone where you want to create your Cloud TPU. The value you use -+ depends on the version of TPUs you are using. For more information, see -+ `TPU regions and zones `_ -+* - ACCELERATOR_TYPE -+ - The TPU version you want to use. Specify the TPU version, for example -+ `v5litepod-4` specifies a v5e TPU with 4 cores. For more information, -+ see `TPU versions `_. -+* - RUNTIME_VERSION -+ - The TPU VM runtime version to use. For more information see `TPU VM images `_. -+* - SERVICE_ACCOUNT -+ - The email address for your service account. You can find it in the IAM -+ Cloud Console under *Service Accounts*. For example: -+ `tpu-service-account@.iam.gserviceaccount.com` - ``` - - Connect to your TPU using SSH: -@@ -103,7 +102,7 @@ Connect to your TPU using SSH: - gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE - ``` - --Install Miniconda -+Install Miniconda: - - ```bash - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -diff --git a/docs/source/getting_started/xpu-installation.md b/docs/source/getting_started/installation/xpu.md -similarity index 98% -rename from docs/source/getting_started/xpu-installation.md -rename to docs/source/getting_started/installation/xpu.md -index 9554ae4b7..be4e3b9bd 100644 ---- a/docs/source/getting_started/xpu-installation.md -+++ b/docs/source/getting_started/installation/xpu.md -@@ -1,6 +1,6 @@ - (installation-xpu)= - --# Installation with XPU -+# Installation for XPUs - - vLLM initially supports basic model inferencing and serving on Intel GPU platform. - -diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md -index 165e5df14..ff216f8af 100644 ---- a/docs/source/getting_started/quickstart.md -+++ b/docs/source/getting_started/quickstart.md -@@ -23,7 +23,7 @@ $ conda activate myenv - $ pip install vllm - ``` - --Please refer to the {ref}`installation documentation ` for more details on installing vLLM. -+Please refer to the [installation documentation](#installation-index) for more details on installing vLLM. - - (offline-batched-inference)= - -@@ -114,7 +114,7 @@ $ "temperature": 0 - $ }' - ``` - --Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` python package: -+Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. 
For example, another way to query the server is via the `openai` Python package: - - ```python - from openai import OpenAI -@@ -151,7 +151,7 @@ $ ] - $ }' - ``` - --Alternatively, you can use the `openai` python package: -+Alternatively, you can use the `openai` Python package: - - ```python - from openai import OpenAI -diff --git a/docs/source/getting_started/debugging.md b/docs/source/getting_started/troubleshooting.md -similarity index 90% -rename from docs/source/getting_started/debugging.md -rename to docs/source/getting_started/troubleshooting.md -index 3b0029f2e..5a0310da0 100644 ---- a/docs/source/getting_started/debugging.md -+++ b/docs/source/getting_started/troubleshooting.md -@@ -1,8 +1,8 @@ --(debugging)= -+(troubleshooting)= - --# Debugging Tips -+# Troubleshooting - --This document outlines some debugging strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. -+This document outlines some troubleshooting strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. - - ```{note} - Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated. -@@ -47,6 +47,7 @@ You might also need to set `export NCCL_SOCKET_IFNAME=` - If vLLM crashes and the error trace captures it somewhere around `self.graph.replay()` in `vllm/worker/model_runner.py`, it is a CUDA error inside CUDAGraph. - To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the {class}`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error. - -+(troubleshooting-incorrect-hardware-driver)= - ## Incorrect hardware/driver - - If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly. -@@ -139,7 +140,7 @@ A multi-node environment is more complicated than a single-node one. If you see - Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes. - ``` - --(debugging-python-multiprocessing)= -+(troubleshooting-python-multiprocessing)= - ## Python multiprocessing - - ### `RuntimeError` Exception -@@ -150,7 +151,7 @@ If you have seen a warning in your logs like this: - WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously - initialized. We must use the `spawn` multiprocessing start method. Setting - VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See -- https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing -+ https://docs.vllm.ai/en/latest/getting_started/troubleshooting.html#python-multiprocessing - for more information. 
- ``` - -@@ -197,4 +198,4 @@ if __name__ == '__main__': - ## Known Issues - - - In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](gh-pr:6759). --- To circumvent a NCCL [bug](https://github.com/NVIDIA/nccl/issues/1234) , all vLLM processes will set an environment variable ``NCCL_CUMEM_ENABLE=0`` to disable NCCL's ``cuMem`` allocator. It does not affect performance but only gives memory benefits. When external processes want to set up a NCCL connection with vLLM's processes, they should also set this environment variable, otherwise, inconsistent environment setup will cause NCCL to hang or crash, as observed in the [RLHF integration](https://github.com/OpenRLHF/OpenRLHF/pull/604) and the [discussion](gh-issue:5723#issuecomment-2554389656) . -+- To circumvent a NCCL [bug](https://github.com/NVIDIA/nccl/issues/1234) , all vLLM processes will set an environment variable `NCCL_CUMEM_ENABLE=0` to disable NCCL's `cuMem` allocator. It does not affect performance but only gives memory benefits. When external processes want to set up a NCCL connection with vLLM's processes, they should also set this environment variable, otherwise, inconsistent environment setup will cause NCCL to hang or crash, as observed in the [RLHF integration](https://github.com/OpenRLHF/OpenRLHF/pull/604) and the [discussion](gh-issue:5723#issuecomment-2554389656) . -diff --git a/docs/source/index.md b/docs/source/index.md -index 34f9c4cae..f39047497 100644 ---- a/docs/source/index.md -+++ b/docs/source/index.md -@@ -50,7 +50,7 @@ For more information, check out the following: - - [vLLM announcing blog post](https://vllm.ai) (intro to PagedAttention) - - [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023) - - [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al. --- {ref}`vLLM Meetups `. 
-+- [vLLM Meetups](#meetups) - - ## Documentation - -@@ -58,18 +58,11 @@ For more information, check out the following: - :caption: Getting Started - :maxdepth: 1 - --getting_started/installation --getting_started/amd-installation --getting_started/openvino-installation --getting_started/cpu-installation --getting_started/gaudi-installation --getting_started/arm-installation --getting_started/neuron-installation --getting_started/tpu-installation --getting_started/xpu-installation -+getting_started/installation/index - getting_started/quickstart --getting_started/debugging - getting_started/examples/examples_index -+getting_started/troubleshooting -+getting_started/faq - ``` - - ```{toctree} -@@ -110,7 +103,6 @@ usage/structured_outputs - usage/spec_decode - usage/compatibility_matrix - usage/performance --usage/faq - usage/engine_args - usage/env_vars - usage/usage_stats -diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md -index 35e0302b8..383299d61 100644 ---- a/docs/source/models/generative_models.md -+++ b/docs/source/models/generative_models.md -@@ -120,7 +120,7 @@ outputs = llm.chat(conversation, chat_template=custom_template) - - ## Online Inference - --Our [OpenAI Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs: -+Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs: - - - [Completions API](#completions-api) is similar to `LLM.generate` but only accepts text. - - [Chat API](#chat-api) is similar to `LLM.chat`, accepting both text and [multi-modal inputs](#multimodal-inputs) for models with a chat template. -diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md -index 76c96c9ed..12ded68eb 100644 ---- a/docs/source/models/pooling_models.md -+++ b/docs/source/models/pooling_models.md -@@ -106,7 +106,7 @@ A code example can be found here: ` -- - :ref:`PP ` -- * - :code:`AquilaForCausalLM` -- - Aquila, Aquila2 -- - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`ArcticForCausalLM` -- - Arctic -- - :code:`Snowflake/snowflake-arctic-base`, :code:`Snowflake/snowflake-arctic-instruct`, etc. -- - -- - ✅︎ -- * - :code:`BaiChuanForCausalLM` -- - Baichuan2, Baichuan -- - :code:`baichuan-inc/Baichuan2-13B-Chat`, :code:`baichuan-inc/Baichuan-7B`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`BloomForCausalLM` -- - BLOOM, BLOOMZ, BLOOMChat -- - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc. -- - -- - ✅︎ -- * - :code:`BartForConditionalGeneration` -- - BART -- - :code:`facebook/bart-base`, :code:`facebook/bart-large-cnn`, etc. -- - -- - -- * - :code:`ChatGLMModel` -- - ChatGLM -- - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`CohereForCausalLM`,:code:`Cohere2ForCausalLM` -- - Command-R -- - :code:`CohereForAI/c4ai-command-r-v01`, :code:`CohereForAI/c4ai-command-r7b-12-2024`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`DbrxForCausalLM` -- - DBRX -- - :code:`databricks/dbrx-base`, :code:`databricks/dbrx-instruct`, etc. -- - -- - ✅︎ -- * - :code:`DeciLMForCausalLM` -- - DeciLM -- - :code:`Deci/DeciLM-7B`, :code:`Deci/DeciLM-7B-instruct`, etc. -- - -- - ✅︎ -- * - :code:`DeepseekForCausalLM` -- - DeepSeek -- - :code:`deepseek-ai/deepseek-llm-67b-base`, :code:`deepseek-ai/deepseek-llm-7b-chat` etc. 
-- - -- - ✅︎ -- * - :code:`DeepseekV2ForCausalLM` -- - DeepSeek-V2 -- - :code:`deepseek-ai/DeepSeek-V2`, :code:`deepseek-ai/DeepSeek-V2-Chat` etc. -- - -- - ✅︎ -- * - :code:`DeepseekV3ForCausalLM` -- - DeepSeek-V3 -- - :code:`deepseek-ai/DeepSeek-V3-Base`, :code:`deepseek-ai/DeepSeek-V3` etc. -- - -- - ✅︎ -- * - :code:`ExaoneForCausalLM` -- - EXAONE-3 -- - :code:`LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`FalconForCausalLM` -- - Falcon -- - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc. -- - -- - ✅︎ -- * - :code:`FalconMambaForCausalLM` -- - FalconMamba -- - :code:`tiiuae/falcon-mamba-7b`, :code:`tiiuae/falcon-mamba-7b-instruct`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`GemmaForCausalLM` -- - Gemma -- - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`Gemma2ForCausalLM` -- - Gemma2 -- - :code:`google/gemma-2-9b`, :code:`google/gemma-2-27b`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`GlmForCausalLM` -- - GLM-4 -- - :code:`THUDM/glm-4-9b-chat-hf`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`GPT2LMHeadModel` -- - GPT-2 -- - :code:`gpt2`, :code:`gpt2-xl`, etc. -- - -- - ✅︎ -- * - :code:`GPTBigCodeForCausalLM` -- - StarCoder, SantaCoder, WizardCoder -- - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`GPTJForCausalLM` -- - GPT-J -- - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc. -- - -- - ✅︎ -- * - :code:`GPTNeoXForCausalLM` -- - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM -- - :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc. -- - -- - ✅︎ -- * - :code:`GraniteForCausalLM` -- - Granite 3.0, Granite 3.1, PowerLM -- - :code:`ibm-granite/granite-3.0-2b-base`, :code:`ibm-granite/granite-3.1-8b-instruct`, :code:`ibm/PowerLM-3b`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`GraniteMoeForCausalLM` -- - Granite 3.0 MoE, PowerMoE -- - :code:`ibm-granite/granite-3.0-1b-a400m-base`, :code:`ibm-granite/granite-3.0-3b-a800m-instruct`, :code:`ibm/PowerMoE-3b`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`GritLM` -- - GritLM -- - :code:`parasail-ai/GritLM-7B-vllm`. -- - ✅︎ -- - ✅︎ -- * - :code:`InternLMForCausalLM` -- - InternLM -- - :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`InternLM2ForCausalLM` -- - InternLM2 -- - :code:`internlm/internlm2-7b`, :code:`internlm/internlm2-chat-7b`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`JAISLMHeadModel` -- - Jais -- - :code:`inceptionai/jais-13b`, :code:`inceptionai/jais-13b-chat`, :code:`inceptionai/jais-30b-v3`, :code:`inceptionai/jais-30b-chat-v3`, etc. -- - -- - ✅︎ -- * - :code:`JambaForCausalLM` -- - Jamba -- - :code:`ai21labs/AI21-Jamba-1.5-Large`, :code:`ai21labs/AI21-Jamba-1.5-Mini`, :code:`ai21labs/Jamba-v0.1`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`LlamaForCausalLM` -- - Llama 3.1, Llama 3, Llama 2, LLaMA, Yi -- - :code:`meta-llama/Meta-Llama-3.1-405B-Instruct`, :code:`meta-llama/Meta-Llama-3.1-70B`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-70b-hf`, :code:`01-ai/Yi-34B`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`MambaForCausalLM` -- - Mamba -- - :code:`state-spaces/mamba-130m-hf`, :code:`state-spaces/mamba-790m-hf`, :code:`state-spaces/mamba-2.8b-hf`, etc. 
-- - -- - ✅︎ -- * - :code:`MiniCPMForCausalLM` -- - MiniCPM -- - :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, :code:`openbmb/MiniCPM-S-1B-sft`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`MiniCPM3ForCausalLM` -- - MiniCPM3 -- - :code:`openbmb/MiniCPM3-4B`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`MistralForCausalLM` -- - Mistral, Mistral-Instruct -- - :code:`mistralai/Mistral-7B-v0.1`, :code:`mistralai/Mistral-7B-Instruct-v0.1`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`MixtralForCausalLM` -- - Mixtral-8x7B, Mixtral-8x7B-Instruct -- - :code:`mistralai/Mixtral-8x7B-v0.1`, :code:`mistralai/Mixtral-8x7B-Instruct-v0.1`, :code:`mistral-community/Mixtral-8x22B-v0.1`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`MPTForCausalLM` -- - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter -- - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc. -- - -- - ✅︎ -- * - :code:`NemotronForCausalLM` -- - Nemotron-3, Nemotron-4, Minitron -- - :code:`nvidia/Minitron-8B-Base`, :code:`mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`OLMoForCausalLM` -- - OLMo -- - :code:`allenai/OLMo-1B-hf`, :code:`allenai/OLMo-7B-hf`, etc. -- - -- - ✅︎ -- * - :code:`OLMo2ForCausalLM` -- - OLMo2 -- - :code:`allenai/OLMo2-7B-1124`, etc. -- - -- - ✅︎ -- * - :code:`OLMoEForCausalLM` -- - OLMoE -- - :code:`allenai/OLMoE-1B-7B-0924`, :code:`allenai/OLMoE-1B-7B-0924-Instruct`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`OPTForCausalLM` -- - OPT, OPT-IML -- - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc. -- - -- - ✅︎ -- * - :code:`OrionForCausalLM` -- - Orion -- - :code:`OrionStarAI/Orion-14B-Base`, :code:`OrionStarAI/Orion-14B-Chat`, etc. -- - -- - ✅︎ -- * - :code:`PhiForCausalLM` -- - Phi -- - :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`Phi3ForCausalLM` -- - Phi-3 -- - :code:`microsoft/Phi-3-mini-4k-instruct`, :code:`microsoft/Phi-3-mini-128k-instruct`, :code:`microsoft/Phi-3-medium-128k-instruct`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`Phi3SmallForCausalLM` -- - Phi-3-Small -- - :code:`microsoft/Phi-3-small-8k-instruct`, :code:`microsoft/Phi-3-small-128k-instruct`, etc. -- - -- - ✅︎ -- * - :code:`PhiMoEForCausalLM` -- - Phi-3.5-MoE -- - :code:`microsoft/Phi-3.5-MoE-instruct`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`PersimmonForCausalLM` -- - Persimmon -- - :code:`adept/persimmon-8b-base`, :code:`adept/persimmon-8b-chat`, etc. -- - -- - ✅︎ -- * - :code:`QWenLMHeadModel` -- - Qwen -- - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`Qwen2ForCausalLM` -- - Qwen2 -- - :code:`Qwen/QwQ-32B-Preview`, :code:`Qwen/Qwen2-7B-Instruct`, :code:`Qwen/Qwen2-7B`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`Qwen2MoeForCausalLM` -- - Qwen2MoE -- - :code:`Qwen/Qwen1.5-MoE-A2.7B`, :code:`Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. -- - -- - ✅︎ -- * - :code:`StableLmForCausalLM` -- - StableLM -- - :code:`stabilityai/stablelm-3b-4e1t`, :code:`stabilityai/stablelm-base-alpha-7b-v2`, etc. -- - -- - ✅︎ -- * - :code:`Starcoder2ForCausalLM` -- - Starcoder2 -- - :code:`bigcode/starcoder2-3b`, :code:`bigcode/starcoder2-7b`, :code:`bigcode/starcoder2-15b`, etc. -- - -- - ✅︎ -- * - :code:`SolarForCausalLM` -- - Solar Pro -- - :code:`upstage/solar-pro-preview-instruct`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`TeleChat2ForCausalLM` -- - TeleChat2 -- - :code:`TeleAI/TeleChat2-3B`, :code:`TeleAI/TeleChat2-7B`, :code:`TeleAI/TeleChat2-35B`, etc. 
-- - ✅︎ -- - ✅︎ -- * - :code:`XverseForCausalLM` -- - XVERSE -- - :code:`xverse/XVERSE-7B-Chat`, :code:`xverse/XVERSE-13B-Chat`, :code:`xverse/XVERSE-65B-Chat`, etc. -- - ✅︎ -- - ✅︎ -+```{list-table} -+:widths: 25 25 50 5 5 -+:header-rows: 1 -+ -+* - Architecture -+ - Models -+ - Example HF Models -+ - [LoRA](#lora-adapter) -+ - [PP](#distributed-serving) -+* - `AquilaForCausalLM` -+ - Aquila, Aquila2 -+ - `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `ArcticForCausalLM` -+ - Arctic -+ - `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. -+ - -+ - ✅︎ -+* - `BaiChuanForCausalLM` -+ - Baichuan2, Baichuan -+ - `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `BloomForCausalLM` -+ - BLOOM, BLOOMZ, BLOOMChat -+ - `bigscience/bloom`, `bigscience/bloomz`, etc. -+ - -+ - ✅︎ -+* - `BartForConditionalGeneration` -+ - BART -+ - `facebook/bart-base`, `facebook/bart-large-cnn`, etc. -+ - -+ - -+* - `ChatGLMModel` -+ - ChatGLM -+ - `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `CohereForCausalLM`, `Cohere2ForCausalLM` -+ - Command-R -+ - `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `DbrxForCausalLM` -+ - DBRX -+ - `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. -+ - -+ - ✅︎ -+* - `DeciLMForCausalLM` -+ - DeciLM -+ - `Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc. -+ - -+ - ✅︎ -+* - `DeepseekForCausalLM` -+ - DeepSeek -+ - `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc. -+ - -+ - ✅︎ -+* - `DeepseekV2ForCausalLM` -+ - DeepSeek-V2 -+ - `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc. -+ - -+ - ✅︎ -+* - `DeepseekV3ForCausalLM` -+ - DeepSeek-V3 -+ - `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc. -+ - -+ - ✅︎ -+* - `ExaoneForCausalLM` -+ - EXAONE-3 -+ - `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `FalconForCausalLM` -+ - Falcon -+ - `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. -+ - -+ - ✅︎ -+* - `FalconMambaForCausalLM` -+ - FalconMamba -+ - `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `GemmaForCausalLM` -+ - Gemma -+ - `google/gemma-2b`, `google/gemma-7b`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `Gemma2ForCausalLM` -+ - Gemma2 -+ - `google/gemma-2-9b`, `google/gemma-2-27b`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `GlmForCausalLM` -+ - GLM-4 -+ - `THUDM/glm-4-9b-chat-hf`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `GPT2LMHeadModel` -+ - GPT-2 -+ - `gpt2`, `gpt2-xl`, etc. -+ - -+ - ✅︎ -+* - `GPTBigCodeForCausalLM` -+ - StarCoder, SantaCoder, WizardCoder -+ - `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `GPTJForCausalLM` -+ - GPT-J -+ - `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. -+ - -+ - ✅︎ -+* - `GPTNeoXForCausalLM` -+ - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM -+ - `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. -+ - -+ - ✅︎ -+* - `GraniteForCausalLM` -+ - Granite 3.0, Granite 3.1, PowerLM -+ - `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `GraniteMoeForCausalLM` -+ - Granite 3.0 MoE, PowerMoE -+ - `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. 
-+ - ✅︎ -+ - ✅︎ -+* - `GritLM` -+ - GritLM -+ - `parasail-ai/GritLM-7B-vllm`. -+ - ✅︎ -+ - ✅︎ -+* - `InternLMForCausalLM` -+ - InternLM -+ - `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `InternLM2ForCausalLM` -+ - InternLM2 -+ - `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `JAISLMHeadModel` -+ - Jais -+ - `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. -+ - -+ - ✅︎ -+* - `JambaForCausalLM` -+ - Jamba -+ - `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `LlamaForCausalLM` -+ - Llama 3.1, Llama 3, Llama 2, LLaMA, Yi -+ - `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `MambaForCausalLM` -+ - Mamba -+ - `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. -+ - -+ - ✅︎ -+* - `MiniCPMForCausalLM` -+ - MiniCPM -+ - `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `MiniCPM3ForCausalLM` -+ - MiniCPM3 -+ - `openbmb/MiniCPM3-4B`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `MistralForCausalLM` -+ - Mistral, Mistral-Instruct -+ - `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `MixtralForCausalLM` -+ - Mixtral-8x7B, Mixtral-8x7B-Instruct -+ - `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `MPTForCausalLM` -+ - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter -+ - `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. -+ - -+ - ✅︎ -+* - `NemotronForCausalLM` -+ - Nemotron-3, Nemotron-4, Minitron -+ - `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `OLMoForCausalLM` -+ - OLMo -+ - `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. -+ - -+ - ✅︎ -+* - `OLMo2ForCausalLM` -+ - OLMo2 -+ - `allenai/OLMo2-7B-1124`, etc. -+ - -+ - ✅︎ -+* - `OLMoEForCausalLM` -+ - OLMoE -+ - `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `OPTForCausalLM` -+ - OPT, OPT-IML -+ - `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. -+ - -+ - ✅︎ -+* - `OrionForCausalLM` -+ - Orion -+ - `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. -+ - -+ - ✅︎ -+* - `PhiForCausalLM` -+ - Phi -+ - `microsoft/phi-1_5`, `microsoft/phi-2`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `Phi3ForCausalLM` -+ - Phi-3 -+ - `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `Phi3SmallForCausalLM` -+ - Phi-3-Small -+ - `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc. -+ - -+ - ✅︎ -+* - `PhiMoEForCausalLM` -+ - Phi-3.5-MoE -+ - `microsoft/Phi-3.5-MoE-instruct`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `PersimmonForCausalLM` -+ - Persimmon -+ - `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. -+ - -+ - ✅︎ -+* - `QWenLMHeadModel` -+ - Qwen -+ - `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `Qwen2ForCausalLM` -+ - Qwen2 -+ - `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `Qwen2MoeForCausalLM` -+ - Qwen2MoE -+ - `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. 
-+ - -+ - ✅︎ -+* - `StableLmForCausalLM` -+ - StableLM -+ - `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. -+ - -+ - ✅︎ -+* - `Starcoder2ForCausalLM` -+ - Starcoder2 -+ - `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. -+ - -+ - ✅︎ -+* - `SolarForCausalLM` -+ - Solar Pro -+ - `upstage/solar-pro-preview-instruct`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `TeleChat2ForCausalLM` -+ - TeleChat2 -+ - `TeleAI/TeleChat2-3B`, `TeleAI/TeleChat2-7B`, `TeleAI/TeleChat2-35B`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `XverseForCausalLM` -+ - XVERSE -+ - `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. -+ - ✅︎ -+ - ✅︎ - ``` - - ```{note} -@@ -374,51 +373,50 @@ you should explicitly specify the task type to ensure that the model is used in - - #### Text Embedding (`--task embed`) - --```{eval-rst} --.. list-table:: -- :widths: 25 25 50 5 5 -- :header-rows: 1 -- -- * - Architecture -- - Models -- - Example HF Models -- - :ref:`LoRA ` -- - :ref:`PP ` -- * - :code:`BertModel` -- - BERT-based -- - :code:`BAAI/bge-base-en-v1.5`, etc. -- - -- - -- * - :code:`Gemma2Model` -- - Gemma2-based -- - :code:`BAAI/bge-multilingual-gemma2`, etc. -- - -- - ✅︎ -- * - :code:`GritLM` -- - GritLM -- - :code:`parasail-ai/GritLM-7B-vllm`. -- - ✅︎ -- - ✅︎ -- * - :code:`LlamaModel`, :code:`LlamaForCausalLM`, :code:`MistralModel`, etc. -- - Llama-based -- - :code:`intfloat/e5-mistral-7b-instruct`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`Qwen2Model`, :code:`Qwen2ForCausalLM` -- - Qwen2-based -- - :code:`ssmits/Qwen2-7B-Instruct-embed-base` (see note), :code:`Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. -- - ✅︎ -- - ✅︎ -- * - :code:`RobertaModel`, :code:`RobertaForMaskedLM` -- - RoBERTa-based -- - :code:`sentence-transformers/all-roberta-large-v1`, :code:`sentence-transformers/all-roberta-large-v1`, etc. -- - -- - -- * - :code:`XLMRobertaModel` -- - XLM-RoBERTa-based -- - :code:`intfloat/multilingual-e5-large`, etc. -- - -- - -+```{list-table} -+:widths: 25 25 50 5 5 -+:header-rows: 1 -+ -+* - Architecture -+ - Models -+ - Example HF Models -+ - [LoRA](#lora-adapter) -+ - [PP](#distributed-serving) -+* - `BertModel` -+ - BERT-based -+ - `BAAI/bge-base-en-v1.5`, etc. -+ - -+ - -+* - `Gemma2Model` -+ - Gemma2-based -+ - `BAAI/bge-multilingual-gemma2`, etc. -+ - -+ - ✅︎ -+* - `GritLM` -+ - GritLM -+ - `parasail-ai/GritLM-7B-vllm`. -+ - ✅︎ -+ - ✅︎ -+* - `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc. -+ - Llama-based -+ - `intfloat/e5-mistral-7b-instruct`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `Qwen2Model`, `Qwen2ForCausalLM` -+ - Qwen2-based -+ - `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. -+ - ✅︎ -+ - ✅︎ -+* - `RobertaModel`, `RobertaForMaskedLM` -+ - RoBERTa-based -+ - `sentence-transformers/all-roberta-large-v1`, `sentence-transformers/all-roberta-large-v1`, etc. -+ - -+ - -+* - `XLMRobertaModel` -+ - XLM-RoBERTa-based -+ - `intfloat/multilingual-e5-large`, etc. -+ - -+ - - ``` - - ```{note} -@@ -435,35 +433,39 @@ despite being described otherwise on its model card. - ``` - - If your model is not in the above list, we will try to automatically convert the model using --:func:`vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings -+{func}`vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings - of the whole prompt are extracted from the normalized hidden state corresponding to the last token. 
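A minimal offline sketch of the embedding path described above, assuming one of the embedding models listed in the table (the model name and prompts here are illustrative only, not prescribed by this patch):

```python
from vllm import LLM

# Run the model through the pooling/embedding path rather than generation.
llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed")

# encode() returns one pooled result per prompt; by default the embedding is
# taken from the normalized hidden state of the last token, as noted above.
outputs = llm.encode(["Hello, my name is", "The capital of France is"])
for output in outputs:
    print(output)
```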
- - #### Reward Modeling (`--task reward`) - --```{eval-rst} --.. list-table:: -- :widths: 25 25 50 5 5 -- :header-rows: 1 -- -- * - Architecture -- - Models -- - Example HF Models -- - :ref:`LoRA ` -- - :ref:`PP ` -- * - :code:`LlamaForCausalLM` -- - Llama-based -- - :code:`peiyi9979/math-shepherd-mistral-7b-prm`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`Qwen2ForRewardModel` -- - Qwen2-based -- - :code:`Qwen/Qwen2.5-Math-RM-72B`, etc. -- - ✅︎ -- - ✅︎ -+```{list-table} -+:widths: 25 25 50 5 5 -+:header-rows: 1 -+ -+* - Architecture -+ - Models -+ - Example HF Models -+ - [LoRA](#lora-adapter) -+ - [PP](#distributed-serving) -+* - `InternLM2ForRewardModel` -+ - InternLM2-based -+ - `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `LlamaForCausalLM` -+ - Llama-based -+ - `peiyi9979/math-shepherd-mistral-7b-prm`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `Qwen2ForRewardModel` -+ - Qwen2-based -+ - `Qwen/Qwen2.5-Math-RM-72B`, etc. -+ - ✅︎ -+ - ✅︎ - ``` - - If your model is not in the above list, we will try to automatically convert the model using --:func:`vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly. -+{func}`vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly. - - ```{important} - For process-supervised reward models such as {code}`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, -@@ -472,58 +474,56 @@ e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 1 - - #### Classification (`--task classify`) - --```{eval-rst} --.. list-table:: -- :widths: 25 25 50 5 5 -- :header-rows: 1 -- -- * - Architecture -- - Models -- - Example HF Models -- - :ref:`LoRA ` -- - :ref:`PP ` -- * - :code:`JambaForSequenceClassification` -- - Jamba -- - :code:`ai21labs/Jamba-tiny-reward-dev`, etc. -- - ✅︎ -- - ✅︎ -- * - :code:`Qwen2ForSequenceClassification` -- - Qwen2-based -- - :code:`jason9693/Qwen2.5-1.5B-apeach`, etc. -- - ✅︎ -- - ✅︎ -+```{list-table} -+:widths: 25 25 50 5 5 -+:header-rows: 1 -+ -+* - Architecture -+ - Models -+ - Example HF Models -+ - [LoRA](#lora-adapter) -+ - [PP](#distributed-serving) -+* - `JambaForSequenceClassification` -+ - Jamba -+ - `ai21labs/Jamba-tiny-reward-dev`, etc. -+ - ✅︎ -+ - ✅︎ -+* - `Qwen2ForSequenceClassification` -+ - Qwen2-based -+ - `jason9693/Qwen2.5-1.5B-apeach`, etc. -+ - ✅︎ -+ - ✅︎ - ``` - - If your model is not in the above list, we will try to automatically convert the model using --:func:`vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. -+{func}`vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. - - #### Sentence Pair Scoring (`--task score`) - --```{eval-rst} --.. list-table:: -- :widths: 25 25 50 5 5 -- :header-rows: 1 -- -- * - Architecture -- - Models -- - Example HF Models -- - :ref:`LoRA ` -- - :ref:`PP ` -- * - :code:`BertForSequenceClassification` -- - BERT-based -- - :code:`cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. -- - -- - -- * - :code:`RobertaForSequenceClassification` -- - RoBERTa-based -- - :code:`cross-encoder/quora-roberta-base`, etc. -- - -- - -- * - :code:`XLMRobertaForSequenceClassification` -- - XLM-RoBERTa-based -- - :code:`BAAI/bge-reranker-v2-m3`, etc. 
-- - -- - -+```{list-table} -+:widths: 25 25 50 5 5 -+:header-rows: 1 -+ -+* - Architecture -+ - Models -+ - Example HF Models -+ - [LoRA](#lora-adapter) -+ - [PP](#distributed-serving) -+* - `BertForSequenceClassification` -+ - BERT-based -+ - `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. -+ - -+ - -+* - `RobertaForSequenceClassification` -+ - RoBERTa-based -+ - `cross-encoder/quora-roberta-base`, etc. -+ - -+ - -+* - `XLMRobertaForSequenceClassification` -+ - XLM-RoBERTa-based -+ - `BAAI/bge-reranker-v2-m3`, etc. -+ - -+ - - ``` - - (supported-mm-models)= -@@ -553,186 +553,182 @@ See [this page](#generative-models) for more information on how to use generativ - - #### Text Generation (`--task generate`) +@@ -478,6 +478,16 @@ See [this page](#generative-models) for more information on how to use generativ + * `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. + * + * ✅︎ ++- * `Qwen3ForCausalLM` ++ * Qwen3 ++ * `Qwen/Qwen3-8B`, etc. ++ * ✅︎ ++ * ✅︎ ++- * `Qwen3MoeForCausalLM` ++ * Qwen3MoE ++ * `Qwen/Qwen3-MoE-15B-A2B`, etc. ++ * ✅︎ ++ * ✅︎ + - * `StableLmForCausalLM` + * StableLM + * `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. +diff --git a/examples/offline_inference/basic/basic.py b/examples/offline_inference/basic/basic.py +index 2ba5ec119..12268d3f0 100644 +--- a/examples/offline_inference/basic/basic.py ++++ b/examples/offline_inference/basic/basic.py +@@ -13,7 +13,13 @@ prompts = [ + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) --```{eval-rst} --.. list-table:: -- :widths: 25 25 15 20 5 5 5 -- :header-rows: 1 -- -- * - Architecture -- - Models -- - Inputs -- - Example HF Models -- - :ref:`LoRA ` -- - :ref:`PP ` -- - V1 -- * - :code:`AriaForConditionalGeneration` -- - Aria -- - T + I -- - :code:`rhymes-ai/Aria` -- - -- - ✅︎ -- - -- * - :code:`Blip2ForConditionalGeneration` -- - BLIP-2 -- - T + I\ :sup:`E` -- - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc. -- - -- - ✅︎ -- - -- * - :code:`ChameleonForConditionalGeneration` -- - Chameleon -- - T + I -- - :code:`facebook/chameleon-7b` etc. -- - -- - ✅︎ -- - -- * - :code:`FuyuForCausalLM` -- - Fuyu -- - T + I -- - :code:`adept/fuyu-8b` etc. -- - -- - ✅︎ -- - -- * - :code:`ChatGLMModel` -- - GLM-4V -- - T + I -- - :code:`THUDM/glm-4v-9b` etc. -- - ✅︎ -- - ✅︎ -- - -- * - :code:`H2OVLChatModel` -- - H2OVL -- - T + I\ :sup:`E+` -- - :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc. -- - -- - ✅︎ -- - -- * - :code:`Idefics3ForConditionalGeneration` -- - Idefics3 -- - T + I -- - :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc. -- - ✅︎ -- - -- - -- * - :code:`InternVLChatModel` -- - InternVL 2.5, Mono-InternVL, InternVL 2.0 -- - T + I\ :sup:`E+` -- - :code:`OpenGVLab/InternVL2_5-4B`, :code:`OpenGVLab/Mono-InternVL-2B`, :code:`OpenGVLab/InternVL2-4B`, etc. -- - -- - ✅︎ -- - ✅︎ -- * - :code:`LlavaForConditionalGeneration` -- - LLaVA-1.5 -- - T + I\ :sup:`E+` -- - :code:`llava-hf/llava-1.5-7b-hf`, :code:`TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. -- - -- - ✅︎ -- - ✅︎ -- * - :code:`LlavaNextForConditionalGeneration` -- - LLaVA-NeXT -- - T + I\ :sup:`E+` -- - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc. -- - -- - ✅︎ -- - -- * - :code:`LlavaNextVideoForConditionalGeneration` -- - LLaVA-NeXT-Video -- - T + V -- - :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. 
-- - -- - ✅︎ -- - -- * - :code:`LlavaOnevisionForConditionalGeneration` -- - LLaVA-Onevision -- - T + I\ :sup:`+` + V\ :sup:`+` -- - :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. -- - -- - ✅︎ -- - -- * - :code:`MiniCPMV` -- - MiniCPM-V -- - T + I\ :sup:`E+` -- - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc. -- - ✅︎ -- - ✅︎ -- - -- * - :code:`MllamaForConditionalGeneration` -- - Llama 3.2 -- - T + I\ :sup:`+` -- - :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc. -- - -- - -- - -- * - :code:`MolmoForCausalLM` -- - Molmo -- - T + I -- - :code:`allenai/Molmo-7B-D-0924`, :code:`allenai/Molmo-72B-0924`, etc. -- - -- - ✅︎ -- - ✅︎ -- * - :code:`NVLM_D_Model` -- - NVLM-D 1.0 -- - T + I\ :sup:`E+` -- - :code:`nvidia/NVLM-D-72B`, etc. -- - -- - ✅︎ -- - ✅︎ -- * - :code:`PaliGemmaForConditionalGeneration` -- - PaliGemma, PaliGemma 2 -- - T + I\ :sup:`E` -- - :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, :code:`google/paligemma2-3b-ft-docci-448`, etc. -- - -- - ✅︎ -- - -- * - :code:`Phi3VForCausalLM` -- - Phi-3-Vision, Phi-3.5-Vision -- - T + I\ :sup:`E+` -- - :code:`microsoft/Phi-3-vision-128k-instruct`, :code:`microsoft/Phi-3.5-vision-instruct` etc. -- - -- - ✅︎ -- - ✅︎ -- * - :code:`PixtralForConditionalGeneration` -- - Pixtral -- - T + I\ :sup:`+` -- - :code:`mistralai/Pixtral-12B-2409`, :code:`mistral-community/pixtral-12b` etc. -- - -- - ✅︎ -- - ✅︎ -- * - :code:`QWenLMHeadModel` -- - Qwen-VL -- - T + I\ :sup:`E+` -- - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc. -- - ✅︎ -- - ✅︎ -- - -- * - :code:`Qwen2AudioForConditionalGeneration` -- - Qwen2-Audio -- - T + A\ :sup:`+` -- - :code:`Qwen/Qwen2-Audio-7B-Instruct` -- - -- - ✅︎ -- - -- * - :code:`Qwen2VLForConditionalGeneration` -- - Qwen2-VL -- - T + I\ :sup:`E+` + V\ :sup:`E+` -- - :code:`Qwen/QVQ-72B-Preview`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc. -- - ✅︎ -- - ✅︎ -- - -- * - :code:`UltravoxModel` -- - Ultravox -- - T + A\ :sup:`E+` -- - :code:`fixie-ai/ultravox-v0_3` -- - -- - ✅︎ -- - -+```{list-table} -+:widths: 25 25 15 20 5 5 5 -+:header-rows: 1 -+ -+* - Architecture -+ - Models -+ - Inputs -+ - Example HF Models -+ - [LoRA](#lora-adapter) -+ - [PP](#distributed-serving) -+ - [V1](gh-issue:8779) -+* - `AriaForConditionalGeneration` -+ - Aria -+ - T + I+ -+ - `rhymes-ai/Aria` -+ - -+ - ✅︎ -+ - ✅︎ -+* - `Blip2ForConditionalGeneration` -+ - BLIP-2 -+ - T + IE -+ - `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. -+ - -+ - ✅︎ -+ - ✅︎ -+* - `ChameleonForConditionalGeneration` -+ - Chameleon -+ - T + I -+ - `facebook/chameleon-7b` etc. -+ - -+ - ✅︎ -+ - ✅︎ -+* - `FuyuForCausalLM` -+ - Fuyu -+ - T + I -+ - `adept/fuyu-8b` etc. -+ - -+ - ✅︎ -+ - ✅︎ -+* - `ChatGLMModel` -+ - GLM-4V -+ - T + I -+ - `THUDM/glm-4v-9b` etc. -+ - ✅︎ -+ - ✅︎ -+ - -+* - `H2OVLChatModel` -+ - H2OVL -+ - T + IE+ -+ - `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. -+ - -+ - ✅︎ -+ - -+* - `Idefics3ForConditionalGeneration` -+ - Idefics3 -+ - T + I -+ - `HuggingFaceM4/Idefics3-8B-Llama3` etc. -+ - ✅︎ -+ - -+ - -+* - `InternVLChatModel` -+ - InternVL 2.5, Mono-InternVL, InternVL 2.0 -+ - T + IE+ -+ - `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. 
-+ - -+ - ✅︎ -+ - ✅︎ -+* - `LlavaForConditionalGeneration` -+ - LLaVA-1.5 -+ - T + IE+ -+ - `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. -+ - -+ - ✅︎ -+ - ✅︎ -+* - `LlavaNextForConditionalGeneration` -+ - LLaVA-NeXT -+ - T + IE+ -+ - `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. -+ - -+ - ✅︎ -+ - ✅︎ -+* - `LlavaNextVideoForConditionalGeneration` -+ - LLaVA-NeXT-Video -+ - T + V -+ - `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. -+ - -+ - ✅︎ -+ - -+* - `LlavaOnevisionForConditionalGeneration` -+ - LLaVA-Onevision -+ - T + I+ + V+ -+ - `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. -+ - -+ - ✅︎ -+ - -+* - `MiniCPMV` -+ - MiniCPM-V -+ - T + IE+ -+ - `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. -+ - ✅︎ -+ - ✅︎ -+ - -+* - `MllamaForConditionalGeneration` -+ - Llama 3.2 -+ - T + I+ -+ - `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. -+ - -+ - -+ - -+* - `MolmoForCausalLM` -+ - Molmo -+ - T + I -+ - `allenai/Molmo-7B-D-0924`, `allenai/Molmo-72B-0924`, etc. -+ - ✅︎ -+ - ✅︎ -+ - ✅︎ -+* - `NVLM_D_Model` -+ - NVLM-D 1.0 -+ - T + IE+ -+ - `nvidia/NVLM-D-72B`, etc. -+ - -+ - ✅︎ -+ - ✅︎ -+* - `PaliGemmaForConditionalGeneration` -+ - PaliGemma, PaliGemma 2 -+ - T + IE -+ - `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. -+ - -+ - ✅︎ -+ - -+* - `Phi3VForCausalLM` -+ - Phi-3-Vision, Phi-3.5-Vision -+ - T + IE+ -+ - `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct` etc. -+ - -+ - ✅︎ -+ - ✅︎ -+* - `PixtralForConditionalGeneration` -+ - Pixtral -+ - T + I+ -+ - `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` etc. -+ - -+ - ✅︎ -+ - ✅︎ -+* - `QWenLMHeadModel` -+ - Qwen-VL -+ - T + IE+ -+ - `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. -+ - ✅︎ -+ - ✅︎ -+ - -+* - `Qwen2AudioForConditionalGeneration` -+ - Qwen2-Audio -+ - T + A+ -+ - `Qwen/Qwen2-Audio-7B-Instruct` -+ - -+ - ✅︎ -+ - -+* - `Qwen2VLForConditionalGeneration` -+ - Qwen2-VL -+ - T + IE+ + VE+ -+ - `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. -+ - ✅︎ -+ - ✅︎ -+ - -+* - `UltravoxModel` -+ - Ultravox -+ - T + AE+ -+ - `fixie-ai/ultravox-v0_3` -+ - -+ - ✅︎ -+ - - ``` + # Create an LLM. +-llm = LLM(model="facebook/opt-125m") ++llm = LLM(model="/llm/models/Llama-2-7b-chat-hf", ++ device="xpu", ++ dtype="float16", ++ enforce_eager=True, ++ tensor_parallel_size=1, ++ gpu_memory_utilization=0.9, ++ max_model_len=2048) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) +diff --git a/setup.py b/setup.py +index b0cc2f481..bd8556b41 100755 +--- a/setup.py ++++ b/setup.py +@@ -26,7 +26,7 @@ def load_module_from_path(module_name, path): + spec.loader.exec_module(module) + return module --```{eval-rst} --:sup:`E` Pre-computed embeddings can be inputted for this modality. - --:sup:`+` Multiple items can be inputted per text prompt for this modality. --``` -+E Pre-computed embeddings can be inputted for this modality. -++ Multiple items can be inputted per text prompt for this modality. 
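A minimal offline sketch of single-image inference with one of the T + I models in the table above (the model name, prompt template, and image path are illustrative assumptions only):

```python
from PIL import Image
from vllm import LLM, SamplingParams

llm = LLM(model="llava-hf/llava-1.5-7b-hf")

# Placeholder image path; any PIL image can be passed as the "image" modality.
image = Image.open("example.jpg")

# LLaVA-1.5-style prompt with an <image> placeholder for the single image item.
prompt = "USER: <image>\nWhat is shown in this image?\nASSISTANT:"

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(temperature=0.2, max_tokens=64),
)
print(outputs[0].outputs[0].text)
```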
- - ````{important} - To enable multiple multi-modal items per text prompt, you have to set {code}`limit_mm_per_prompt` (offline inference) -@@ -755,8 +751,7 @@ vLLM currently only supports adding LoRA to the language backbone of multimodal - ``` - - ```{note} --To use {code}`TIGER-Lab/Mantis-8B-siglip-llama3`, you have to install their GitHub repo ({code}`pip install git+https://github.com/TIGER-AI-Lab/Mantis.git`) --and pass {code}`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. -+To use {code}`TIGER-Lab/Mantis-8B-siglip-llama3`, you have pass {code}`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. - ``` ++os.environ["SETUPTOOLS_SCM_PRETEND_VERSION"] = "0.8.3+ipexllm" + ROOT_DIR = Path(__file__).parent + logger = logging.getLogger(__name__) - ```{note} -@@ -783,38 +778,37 @@ To get the best results, you should use pooling models that are specifically tra +@@ -147,6 +147,7 @@ class cmake_build_ext(build_ext): + cmake_args = [ + '-DCMAKE_BUILD_TYPE={}'.format(cfg), + '-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE), ++ "-DCMAKE_CXX_STANDARD=17", + ] - The following table lists those that are tested in vLLM. + verbose = envs.VERBOSE +@@ -432,7 +433,7 @@ def _no_device() -> bool: + def _is_cuda() -> bool: + has_cuda = torch.version.cuda is not None + return (VLLM_TARGET_DEVICE == "cuda" and has_cuda +- and not (_is_neuron() or _is_tpu() or _is_hpu())) ++ and not (_is_neuron() or _is_tpu() or _is_xpu())) --```{eval-rst} --.. list-table:: -- :widths: 25 25 15 25 5 5 -- :header-rows: 1 -- -- * - Architecture -- - Models -- - Inputs -- - Example HF Models -- - :ref:`LoRA ` -- - :ref:`PP ` -- * - :code:`LlavaNextForConditionalGeneration` -- - LLaVA-NeXT-based -- - T / I -- - :code:`royokong/e5-v` -- - -- - ✅︎ -- * - :code:`Phi3VForCausalLM` -- - Phi-3-Vision-based -- - T + I -- - :code:`TIGER-Lab/VLM2Vec-Full` -- - 🚧 -- - ✅︎ -- * - :code:`Qwen2VLForConditionalGeneration` -- - Qwen2-VL-based -- - T + I -- - :code:`MrLight/dse-qwen2-2b-mrl-v1` -- - -- - ✅︎ -+```{list-table} -+:widths: 25 25 15 25 5 5 -+:header-rows: 1 -+ -+* - Architecture -+ - Models -+ - Inputs -+ - Example HF Models -+ - [LoRA](#lora-adapter) -+ - [PP](#distributed-serving) -+* - `LlavaNextForConditionalGeneration` -+ - LLaVA-NeXT-based -+ - T / I -+ - `royokong/e5-v` -+ - -+ - ✅︎ -+* - `Phi3VForCausalLM` -+ - Phi-3-Vision-based -+ - T + I -+ - `TIGER-Lab/VLM2Vec-Full` -+ - 🚧 -+ - ✅︎ -+* - `Qwen2VLForConditionalGeneration` -+ - Qwen2-VL-based -+ - T + I -+ - `MrLight/dse-qwen2-2b-mrl-v1` -+ - -+ - ✅︎ - ``` --______________________________________________________________________ -+_________________ + def _is_hip() -> bool: +@@ -457,7 +458,11 @@ def _is_xpu() -> bool: - # Model Support Policy -diff --git a/docs/source/quantization/bnb.md b/docs/source/quantization/bnb.md -index 8240eca1c..f7f41726f 100644 ---- a/docs/source/quantization/bnb.md -+++ b/docs/source/quantization/bnb.md -@@ -37,3 +37,10 @@ model_id = "huggyllama/llama-7b" - llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ - quantization="bitsandbytes", load_format="bitsandbytes") - ``` -+## OpenAI Compatible Server -+ -+Append the following to your 4bit model arguments: + def _build_custom_ops() -> bool: +- return _is_cuda() or _is_hip() or _is_cpu() ++ return _is_cuda() or _is_hip() or _is_cpu() or _is_xpu() + -+``` -+--quantization bitsandbytes --load-format bitsandbytes -+``` -diff --git a/docs/source/quantization/supported_hardware.md 
b/docs/source/quantization/supported_hardware.md -index 843ee2162..7330c2f8a 100644 ---- a/docs/source/quantization/supported_hardware.md -+++ b/docs/source/quantization/supported_hardware.md -@@ -4,121 +4,120 @@ - - The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: - --```{eval-rst} --.. list-table:: -- :header-rows: 1 -- :widths: 20 8 8 8 8 8 8 8 8 8 8 -+```{list-table} -+:header-rows: 1 -+:widths: 20 8 8 8 8 8 8 8 8 8 8 - -- * - Implementation -- - Volta -- - Turing -- - Ampere -- - Ada -- - Hopper -- - AMD GPU -- - Intel GPU -- - x86 CPU -- - AWS Inferentia -- - Google TPU -- * - AWQ -- - ✗ -- - ✅︎ -- - ✅︎ -- - ✅︎ -- - ✅︎ -- - ✗ -- - ✅︎ -- - ✅︎ -- - ✗ -- - ✗ -- * - GPTQ -- - ✅︎ -- - ✅︎ -- - ✅︎ -- - ✅︎ -- - ✅︎ -- - ✗ -- - ✅︎ -- - ✅︎ -- - ✗ -- - ✗ -- * - Marlin (GPTQ/AWQ/FP8) -- - ✗ -- - ✗ -- - ✅︎ -- - ✅︎ -- - ✅︎ -- - ✗ -- - ✗ -- - ✗ -- - ✗ -- - ✗ -- * - INT8 (W8A8) -- - ✗ -- - ✅︎ -- - ✅︎ -- - ✅︎ -- - ✅︎ -- - ✗ -- - ✗ -- - ✅︎ -- - ✗ -- - ✗ -- * - FP8 (W8A8) -- - ✗ -- - ✗ -- - ✗ -- - ✅︎ -- - ✅︎ -- - ✅︎ -- - ✗ -- - ✗ -- - ✗ -- - ✗ -- * - AQLM -- - ✅︎ -- - ✅︎ -- - ✅︎ -- - ✅︎ -- - ✅︎ -- - ✗ -- - ✗ -- - ✗ -- - ✗ -- - ✗ -- * - bitsandbytes -- - ✅︎ -- - ✅︎ -- - ✅︎ -- - ✅︎ -- - ✅︎ -- - ✗ -- - ✗ -- - ✗ -- - ✗ -- - ✗ -- * - DeepSpeedFP -- - ✅︎ -- - ✅︎ -- - ✅︎ -- - ✅︎ -- - ✅︎ -- - ✗ -- - ✗ -- - ✗ -- - ✗ -- - ✗ -- * - GGUF -- - ✅︎ -- - ✅︎ -- - ✅︎ -- - ✅︎ -- - ✅︎ -- - ✗ -- - ✗ -- - ✗ -- - ✗ -- - ✗ -+* - Implementation -+ - Volta -+ - Turing -+ - Ampere -+ - Ada -+ - Hopper -+ - AMD GPU -+ - Intel GPU -+ - x86 CPU -+ - AWS Inferentia -+ - Google TPU -+* - AWQ -+ - ✗ -+ - ✅︎ -+ - ✅︎ -+ - ✅︎ -+ - ✅︎ -+ - ✗ -+ - ✅︎ -+ - ✅︎ -+ - ✗ -+ - ✗ -+* - GPTQ -+ - ✅︎ -+ - ✅︎ -+ - ✅︎ -+ - ✅︎ -+ - ✅︎ -+ - ✗ -+ - ✅︎ -+ - ✅︎ -+ - ✗ -+ - ✗ -+* - Marlin (GPTQ/AWQ/FP8) -+ - ✗ -+ - ✗ -+ - ✅︎ -+ - ✅︎ -+ - ✅︎ -+ - ✗ -+ - ✗ -+ - ✗ -+ - ✗ -+ - ✗ -+* - INT8 (W8A8) -+ - ✗ -+ - ✅︎ -+ - ✅︎ -+ - ✅︎ -+ - ✅︎ -+ - ✗ -+ - ✗ -+ - ✅︎ -+ - ✗ -+ - ✗ -+* - FP8 (W8A8) -+ - ✗ -+ - ✗ -+ - ✗ -+ - ✅︎ -+ - ✅︎ -+ - ✅︎ -+ - ✗ -+ - ✗ -+ - ✗ -+ - ✗ -+* - AQLM -+ - ✅︎ -+ - ✅︎ -+ - ✅︎ -+ - ✅︎ -+ - ✅︎ -+ - ✗ -+ - ✗ -+ - ✗ -+ - ✗ -+ - ✗ -+* - bitsandbytes -+ - ✅︎ -+ - ✅︎ -+ - ✅︎ -+ - ✅︎ -+ - ✅︎ -+ - ✗ -+ - ✗ -+ - ✗ -+ - ✗ -+ - ✗ -+* - DeepSpeedFP -+ - ✅︎ -+ - ✅︎ -+ - ✅︎ -+ - ✅︎ -+ - ✅︎ -+ - ✗ -+ - ✗ -+ - ✗ -+ - ✗ -+ - ✗ -+* - GGUF -+ - ✅︎ -+ - ✅︎ -+ - ✅︎ -+ - ✅︎ -+ - ✅︎ -+ - ✗ -+ - ✗ -+ - ✗ -+ - ✗ -+ - ✗ - ``` - - ## Notes: -diff --git a/docs/source/serving/deploying_with_cerebrium.md b/docs/source/serving/deploying_with_cerebrium.md -index 486393623..950064c8c 100644 ---- a/docs/source/serving/deploying_with_cerebrium.md -+++ b/docs/source/serving/deploying_with_cerebrium.md -@@ -33,7 +33,7 @@ docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04" - vllm = "latest" - ``` - --Next, let us add our code to handle inference for the LLM of your choice(`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your main.py\`: -+Next, let us add our code to handle inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your `main.py`: - - ```python - from vllm import LLM, SamplingParams -@@ -55,13 +55,13 @@ def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95): - return {"results": results} - ``` - --Then, run the following code to deploy it to the cloud -+Then, run the following code to deploy it to the cloud: - - ```console - $ cerebrium deploy - ``` - --If successful, 
you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case /run) -+If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case` /run`) - - ```python - curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ -diff --git a/docs/source/serving/deploying_with_dstack.md b/docs/source/serving/deploying_with_dstack.md -index 65ef1c001..381f5f786 100644 ---- a/docs/source/serving/deploying_with_dstack.md -+++ b/docs/source/serving/deploying_with_dstack.md -@@ -25,7 +25,7 @@ $ cd vllm-dstack - $ dstack init - ``` ++ ++def _build_core_ext() -> bool: ++ return not (_is_neuron() or _is_tpu() or _is_xpu()) --Next, to provision a VM instance with LLM of your choice(`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`: -+Next, to provision a VM instance with LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`: - ```yaml - type: service -diff --git a/docs/source/serving/deploying_with_helm.md b/docs/source/serving/deploying_with_helm.md -index 3b2657582..7286a0a88 100644 ---- a/docs/source/serving/deploying_with_helm.md -+++ b/docs/source/serving/deploying_with_helm.md -@@ -43,209 +43,208 @@ chart **including persistent volumes** and deletes the release. + def get_rocm_version(): +@@ -634,6 +639,9 @@ def get_requirements() -> list[str]: - ## Values + ext_modules = [] --```{eval-rst} --.. list-table:: Values -- :widths: 25 25 25 25 -- :header-rows: 1 -- -- * - Key -- - Type -- - Default -- - Description -- * - autoscaling -- - object -- - {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} -- - Autoscaling configuration -- * - autoscaling.enabled -- - bool -- - false -- - Enable autoscaling -- * - autoscaling.maxReplicas -- - int -- - 100 -- - Maximum replicas -- * - autoscaling.minReplicas -- - int -- - 1 -- - Minimum replicas -- * - autoscaling.targetCPUUtilizationPercentage -- - int -- - 80 -- - Target CPU utilization for autoscaling -- * - configs -- - object -- - {} -- - Configmap -- * - containerPort -- - int -- - 8000 -- - Container port -- * - customObjects -- - list -- - [] -- - Custom Objects configuration -- * - deploymentStrategy -- - object -- - {} -- - Deployment strategy configuration -- * - externalConfigs -- - list -- - [] -- - External configuration -- * - extraContainers -- - list -- - [] -- - Additional containers configuration -- * - extraInit -- - object -- - {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} -- - Additional configuration for the init container -- * - extraInit.pvcStorage -- - string -- - "50Gi" -- - Storage size of the s3 -- * - extraInit.s3modelpath -- - string -- - "relative_s3_model_path/opt-125m" -- - Path of the model on the s3 which hosts model weights and config files -- * - extraInit.awsEc2MetadataDisabled -- - boolean -- - true -- - Disables the use of the Amazon EC2 instance metadata service -- * - extraPorts -- - list -- - [] -- - Additional ports configuration -- * - gpuModels -- - list -- - ["TYPE_GPU_USED"] -- - Type of gpu used -- * - image -- - object -- - 
{"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} -- - Image configuration -- * - image.command -- - list -- - ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"] -- - Container launch command -- * - image.repository -- - string -- - "vllm/vllm-openai" -- - Image repository -- * - image.tag -- - string -- - "latest" -- - Image tag -- * - livenessProbe -- - object -- - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10} -- - Liveness probe configuration -- * - livenessProbe.failureThreshold -- - int -- - 3 -- - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive -- * - livenessProbe.httpGet -- - object -- - {"path":"/health","port":8000} -- - Configuration of the Kubelet http request on the server -- * - livenessProbe.httpGet.path -- - string -- - "/health" -- - Path to access on the HTTP server -- * - livenessProbe.httpGet.port -- - int -- - 8000 -- - Name or number of the port to access on the container, on which the server is listening -- * - livenessProbe.initialDelaySeconds -- - int -- - 15 -- - Number of seconds after the container has started before liveness probe is initiated -- * - livenessProbe.periodSeconds -- - int -- - 10 -- - How often (in seconds) to perform the liveness probe -- * - maxUnavailablePodDisruptionBudget -- - string -- - "" -- - Disruption Budget Configuration -- * - readinessProbe -- - object -- - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5} -- - Readiness probe configuration -- * - readinessProbe.failureThreshold -- - int -- - 3 -- - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready -- * - readinessProbe.httpGet -- - object -- - {"path":"/health","port":8000} -- - Configuration of the Kubelet http request on the server -- * - readinessProbe.httpGet.path -- - string -- - "/health" -- - Path to access on the HTTP server -- * - readinessProbe.httpGet.port -- - int -- - 8000 -- - Name or number of the port to access on the container, on which the server is listening -- * - readinessProbe.initialDelaySeconds -- - int -- - 5 -- - Number of seconds after the container has started before readiness probe is initiated -- * - readinessProbe.periodSeconds -- - int -- - 5 -- - How often (in seconds) to perform the readiness probe -- * - replicaCount -- - int -- - 1 -- - Number of replicas -- * - resources -- - object -- - {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}} -- - Resource configuration -- * - resources.limits."nvidia.com/gpu" -- - int -- - 1 -- - Number of gpus used -- * - resources.limits.cpu -- - int -- - 4 -- - Number of CPUs -- * - resources.limits.memory -- - string -- - "16Gi" -- - CPU memory configuration -- * - resources.requests."nvidia.com/gpu" -- - int -- - 1 -- - Number of gpus used -- * - resources.requests.cpu -- - int -- - 4 -- - Number of CPUs -- * - resources.requests.memory -- - string -- - "16Gi" -- - CPU memory configuration -- * - secrets -- - object -- - {} -- - Secrets configuration -- * - serviceName -- - string -- - -- - Service name -- * - servicePort -- - int -- - 80 -- - Service port -- * - labels.environment -- - string -- - test -- - 
Environment name -- * - labels.release -- - string -- - test -- - Release name -+```{list-table} -+:widths: 25 25 25 25 -+:header-rows: 1 -+ -+* - Key -+ - Type -+ - Default -+ - Description -+* - autoscaling -+ - object -+ - {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} -+ - Autoscaling configuration -+* - autoscaling.enabled -+ - bool -+ - false -+ - Enable autoscaling -+* - autoscaling.maxReplicas -+ - int -+ - 100 -+ - Maximum replicas -+* - autoscaling.minReplicas -+ - int -+ - 1 -+ - Minimum replicas -+* - autoscaling.targetCPUUtilizationPercentage -+ - int -+ - 80 -+ - Target CPU utilization for autoscaling -+* - configs -+ - object -+ - {} -+ - Configmap -+* - containerPort -+ - int -+ - 8000 -+ - Container port -+* - customObjects -+ - list -+ - [] -+ - Custom Objects configuration -+* - deploymentStrategy -+ - object -+ - {} -+ - Deployment strategy configuration -+* - externalConfigs -+ - list -+ - [] -+ - External configuration -+* - extraContainers -+ - list -+ - [] -+ - Additional containers configuration -+* - extraInit -+ - object -+ - {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} -+ - Additional configuration for the init container -+* - extraInit.pvcStorage -+ - string -+ - "50Gi" -+ - Storage size of the s3 -+* - extraInit.s3modelpath -+ - string -+ - "relative_s3_model_path/opt-125m" -+ - Path of the model on the s3 which hosts model weights and config files -+* - extraInit.awsEc2MetadataDisabled -+ - boolean -+ - true -+ - Disables the use of the Amazon EC2 instance metadata service -+* - extraPorts -+ - list -+ - [] -+ - Additional ports configuration -+* - gpuModels -+ - list -+ - ["TYPE_GPU_USED"] -+ - Type of gpu used -+* - image -+ - object -+ - {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} -+ - Image configuration -+* - image.command -+ - list -+ - ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"] -+ - Container launch command -+* - image.repository -+ - string -+ - "vllm/vllm-openai" -+ - Image repository -+* - image.tag -+ - string -+ - "latest" -+ - Image tag -+* - livenessProbe -+ - object -+ - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10} -+ - Liveness probe configuration -+* - livenessProbe.failureThreshold -+ - int -+ - 3 -+ - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive -+* - livenessProbe.httpGet -+ - object -+ - {"path":"/health","port":8000} -+ - Configuration of the Kubelet http request on the server -+* - livenessProbe.httpGet.path -+ - string -+ - "/health" -+ - Path to access on the HTTP server -+* - livenessProbe.httpGet.port -+ - int -+ - 8000 -+ - Name or number of the port to access on the container, on which the server is listening -+* - livenessProbe.initialDelaySeconds -+ - int -+ - 15 -+ - Number of seconds after the container has started before liveness probe is initiated -+* - livenessProbe.periodSeconds -+ - int -+ - 10 -+ - How often (in seconds) to perform the liveness probe -+* - maxUnavailablePodDisruptionBudget -+ - string -+ - "" -+ - Disruption Budget Configuration -+* - readinessProbe -+ - object -+ - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5} -+ - Readiness probe 
configuration -+* - readinessProbe.failureThreshold -+ - int -+ - 3 -+ - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready -+* - readinessProbe.httpGet -+ - object -+ - {"path":"/health","port":8000} -+ - Configuration of the Kubelet http request on the server -+* - readinessProbe.httpGet.path -+ - string -+ - "/health" -+ - Path to access on the HTTP server -+* - readinessProbe.httpGet.port -+ - int -+ - 8000 -+ - Name or number of the port to access on the container, on which the server is listening -+* - readinessProbe.initialDelaySeconds -+ - int -+ - 5 -+ - Number of seconds after the container has started before readiness probe is initiated -+* - readinessProbe.periodSeconds -+ - int -+ - 5 -+ - How often (in seconds) to perform the readiness probe -+* - replicaCount -+ - int -+ - 1 -+ - Number of replicas -+* - resources -+ - object -+ - {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}} -+ - Resource configuration -+* - resources.limits."nvidia.com/gpu" -+ - int -+ - 1 -+ - Number of gpus used -+* - resources.limits.cpu -+ - int -+ - 4 -+ - Number of CPUs -+* - resources.limits.memory -+ - string -+ - "16Gi" -+ - CPU memory configuration -+* - resources.requests."nvidia.com/gpu" -+ - int -+ - 1 -+ - Number of gpus used -+* - resources.requests.cpu -+ - int -+ - 4 -+ - Number of CPUs -+* - resources.requests.memory -+ - string -+ - "16Gi" -+ - CPU memory configuration -+* - secrets -+ - object -+ - {} -+ - Secrets configuration -+* - serviceName -+ - string -+ - -+ - Service name -+* - servicePort -+ - int -+ - 80 -+ - Service port -+* - labels.environment -+ - string -+ - test -+ - Environment name -+* - labels.release -+ - string -+ - test -+ - Release name - ``` -diff --git a/docs/source/serving/deploying_with_k8s.md b/docs/source/serving/deploying_with_k8s.md -index d27db826c..5f9b0e4f5 100644 ---- a/docs/source/serving/deploying_with_k8s.md -+++ b/docs/source/serving/deploying_with_k8s.md -@@ -43,11 +43,15 @@ metadata: - name: hf-token-secret - namespace: default - type: Opaque --data: -+stringData: - token: "REPLACE_WITH_TOKEN" - ``` ++if _build_core_ext(): ++ ext_modules.append(CMakeExtension(name="vllm._core_C")) ++ + if _is_cuda() or _is_hip(): + ext_modules.append(CMakeExtension(name="vllm._moe_C")) --Create a deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model: -+Next to create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model. +diff --git a/tests/distributed/test_basic_distributed_correctness_enc_dec.py b/tests/distributed/test_basic_distributed_correctness_enc_dec.py +new file mode 100644 +index 000000000..f00d5ef58 +--- /dev/null ++++ b/tests/distributed/test_basic_distributed_correctness_enc_dec.py +@@ -0,0 +1,102 @@ ++"""For encoder/decoder models only: ++Compare the outputs of HF and distributed vLLM when using greedy sampling. + -+Here are two examples for using NVIDIA GPU and AMD GPU. ++Run: ++```sh ++cd $VLLM_PATH/tests + -+- NVIDIA GPU - - ```yaml - apiVersion: apps/v1 -@@ -119,6 +123,79 @@ spec: - periodSeconds: 5 - ``` - -+- AMD GPU -+ -+You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X. 
-+ -+```yaml -+apiVersion: apps/v1 -+kind: Deployment -+metadata: -+ name: mistral-7b -+ namespace: default -+ labels: -+ app: mistral-7b -+spec: -+ replicas: 1 -+ selector: -+ matchLabels: -+ app: mistral-7b -+ template: -+ metadata: -+ labels: -+ app: mistral-7b -+ spec: -+ volumes: -+ # PVC -+ - name: cache-volume -+ persistentVolumeClaim: -+ claimName: mistral-7b -+ # vLLM needs to access the host's shared memory for tensor parallel inference. -+ - name: shm -+ emptyDir: -+ medium: Memory -+ sizeLimit: "8Gi" -+ hostNetwork: true -+ hostIPC: true -+ containers: -+ - name: mistral-7b -+ image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4 -+ securityContext: -+ seccompProfile: -+ type: Unconfined -+ runAsGroup: 44 -+ capabilities: -+ add: -+ - SYS_PTRACE -+ command: ["/bin/sh", "-c"] -+ args: [ -+ "vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" -+ ] -+ env: -+ - name: HUGGING_FACE_HUB_TOKEN -+ valueFrom: -+ secretKeyRef: -+ name: hf-token-secret -+ key: token -+ ports: -+ - containerPort: 8000 -+ resources: -+ limits: -+ cpu: "10" -+ memory: 20G -+ amd.com/gpu: "1" -+ requests: -+ cpu: "6" -+ memory: 6G -+ amd.com/gpu: "1" -+ volumeMounts: -+ - name: cache-volume -+ mountPath: /root/.cache/huggingface -+ - name: shm -+ mountPath: /dev/shm ++pytest distributed/test_basic_distributed_correctness_enc_dec.py +``` -+You can get the full example with steps and sample yaml files from . -+ - 2. **Create a Kubernetes Service for vLLM** - - Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: -diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md -index c0a4b23f6..6fbc1ea10 100644 ---- a/docs/source/serving/distributed_serving.md -+++ b/docs/source/serving/distributed_serving.md -@@ -8,7 +8,7 @@ Before going into the details of distributed inference and serving, let's first - - - **Single GPU (no distributed inference)**: If your model fits in a single GPU, you probably don't need to use distributed inference. Just use the single GPU to run the inference. - - **Single-Node Multi-GPU (tensor parallel inference)**: If your model is too large to fit in a single GPU, but it can fit in a single node with multiple GPUs, you can use tensor parallelism. The tensor parallel size is the number of GPUs you want to use. For example, if you have 4 GPUs in a single node, you can set the tensor parallel size to 4. --- **Multi-Node Multi-GPU (tensor parallel plus pipeline parallel inference)**: If your model is too large to fit in a single node, you can use tensor parallel together with pipeline parallelism. The tensor parallel size is the number of GPUs you want to use in each node, and the pipeline parallel size is the number of nodes you want to use. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2. -+- **Multi-Node Multi-GPU (tensor parallel plus pipeline parallel inference)**: If your model is too large to fit in a single node, you can use tensor parallel together with pipeline parallelism. The tensor parallel size is the number of GPUs you want to use in each node, and the pipeline parallel size is the number of nodes you want to use. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2. 
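The distributed_serving.md hunk above restates the sizing rule: the tensor parallel size is the number of GPUs per node and the pipeline parallel size is the number of nodes. As a hedged illustration of the single-node case that the same document describes for the `LLM` class (the model path and GPU count below are placeholders, not part of the patch):

```python
from vllm import LLM, SamplingParams

# Single-node tensor parallelism: shard the model across the GPUs on this node.
# tensor_parallel_size should match the number of local GPUs (4 here, for illustration).
llm = LLM(model="/path/to/your/model", tensor_parallel_size=4)

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
outputs = llm.generate(["Distributed inference with vLLM works by"], sampling_params)
print(outputs[0].outputs[0].text)
```

For the multi-node case, the same rule maps onto the `vllm serve` flags shown further down in this hunk (`--tensor-parallel-size` per node, `--pipeline-parallel-size` across nodes).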
- - In short, you should increase the number of GPUs and the number of nodes until you have enough GPU memory to hold the model. The tensor parallel size should be the number of GPUs in each node, and the pipeline parallel size should be the number of nodes. - -@@ -22,7 +22,7 @@ There is one edge case: if the model fits in a single node with multiple GPUs, b - - vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray. - --Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured {code}`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the {code}`LLM` class {code}`distributed-executor-backend` argument or {code}`--distributed-executor-backend` API server argument. Set it to {code}`mp` for multiprocessing or {code}`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case. -+Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured {code}`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the {code}`LLM` class {code}`distributed_executor_backend` argument or {code}`--distributed-executor-backend` API server argument. Set it to {code}`mp` for multiprocessing or {code}`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case. - - To run multi-GPU inference with the {code}`LLM` class, set the {code}`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs: - -@@ -77,7 +77,7 @@ Then you get a ray cluster of containers. Note that you need to keep the shells - - Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` to check the status of the Ray cluster. You should see the right number of nodes and GPUs. - --After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2: -+After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2: - - ```console - $ vllm serve /path/to/the/model/in/the/container \ -@@ -85,7 +85,7 @@ $ --tensor-parallel-size 8 \ - $ --pipeline-parallel-size 2 - ``` - --You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. 
For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 16: -+You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 16: - - ```console - $ vllm serve /path/to/the/model/in/the/container \ -@@ -95,7 +95,7 @@ $ --tensor-parallel-size 16 - To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient. - - ```{warning} --After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](../getting_started/debugging.md) for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See for more information. -+After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](#troubleshooting-incorrect-hardware-driver) for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See for more information. - ``` - - ```{warning} -diff --git a/docs/source/serving/runai_model_streamer.md b/docs/source/serving/runai_model_streamer.md -index 1b5756a95..d4269050f 100644 ---- a/docs/source/serving/runai_model_streamer.md -+++ b/docs/source/serving/runai_model_streamer.md -@@ -41,7 +41,7 @@ For reading from S3, it will be the number of client instances the host is openi - $ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}' - ``` - --You can controls the size of the CPU Memory buffer to which tensors are read from the file, and limit this size. -+You can control the size of the CPU Memory buffer to which tensors are read from the file, and limit this size. 
- You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit). - - ```console -diff --git a/docs/source/usage/performance.md b/docs/source/usage/performance.md -index f028e2862..2cd3801bf 100644 ---- a/docs/source/usage/performance.md -+++ b/docs/source/usage/performance.md -@@ -32,8 +32,8 @@ You can enable the feature by specifying `--enable-chunked-prefill` in the comma - ```python - llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True) - # Set max_num_batched_tokens to tune performance. --# NOTE: 512 is the default max_num_batched_tokens for chunked prefill. --# llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=512) -+# NOTE: 2048 is the default max_num_batched_tokens for chunked prefill. -+# llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=2048) - ``` - - By default, vLLM scheduler prioritizes prefills and doesn't batch prefill and decode to the same batch. -@@ -49,13 +49,12 @@ This policy has two benefits: - - It improves ITL and generation decode because decode requests are prioritized. - - It helps achieve better GPU utilization by locating compute-bound (prefill) and memory-bound (decode) requests to the same batch. - --You can tune the performance by changing `max_num_batched_tokens`. --By default, it is set to 512, which has the best ITL on A100 in the initial benchmark (llama 70B and mixtral 8x22B). -+You can tune the performance by changing `max_num_batched_tokens`. By default, it is set to 2048. - Smaller `max_num_batched_tokens` achieves better ITL because there are fewer prefills interrupting decodes. - Higher `max_num_batched_tokens` achieves better TTFT as you can put more prefill to the batch. - - - If `max_num_batched_tokens` is the same as `max_model_len`, that's almost the equivalent to the default scheduling policy (except that it still prioritizes decodes). --- Note that the default value (512) of `max_num_batched_tokens` is optimized for ITL, and it may have lower throughput than the default scheduler. -+- Note that the default value (2048) of `max_num_batched_tokens` is optimized for ITL, and it may have lower throughput than the default scheduler. - - We recommend you set `max_num_batched_tokens > 2048` for throughput. - -diff --git a/docs/source/usage/spec_decode.md b/docs/source/usage/spec_decode.md -index 8302da81b..8c52c97a4 100644 ---- a/docs/source/usage/spec_decode.md -+++ b/docs/source/usage/spec_decode.md -@@ -182,7 +182,7 @@ speculative decoding, breaking down the guarantees into three key areas: - 3. **vLLM Logprob Stability** - \- vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the - same request across runs. For more details, see the FAQ section -- titled *Can the output of a prompt vary across runs in vLLM?* in the {ref}`FAQs `. -+ titled *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq). - - **Conclusion** - -@@ -195,7 +195,7 @@ can occur due to following factors: - - **Mitigation Strategies** - --For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the {ref}`FAQs `. -+For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq). 
- - ## Resources for vLLM contributors - -diff --git a/docs/source/usage/structured_outputs.md b/docs/source/usage/structured_outputs.md -index 3f5d9ffc2..26c09bb0d 100644 ---- a/docs/source/usage/structured_outputs.md -+++ b/docs/source/usage/structured_outputs.md -@@ -2,7 +2,7 @@ - - # Structured Outputs - --vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines) or [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer) as backends for the guided decoding. -+vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines), [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer), or [xgrammar](https://github.com/mlc-ai/xgrammar) as backends for the guided decoding. - This document shows you some examples of the different options that are available to generate structured outputs. - - ## Online Inference (OpenAI API) -@@ -18,7 +18,7 @@ The following parameters are supported, which must be added as extra parameters: - - `guided_whitespace_pattern`: used to override the default whitespace pattern for guided json decoding. - - `guided_decoding_backend`: used to select the guided decoding backend to use. - --You can see the complete list of supported parameters on the [OpenAI Compatible Server](../serving/openai_compatible_server.md) page. -+You can see the complete list of supported parameters on the [OpenAI-Compatible Server](#openai-compatible-server)page. - - Now let´s see an example for each of the cases, starting with the `guided_choice`, as it´s the easiest one: - -diff --git a/docs/source/usage/tool_calling.md b/docs/source/usage/tool_calling.md -index 34b26647a..062f2021e 100644 ---- a/docs/source/usage/tool_calling.md -+++ b/docs/source/usage/tool_calling.md -@@ -10,7 +10,7 @@ Start the server with tool calling enabled. This example uses Meta's Llama 3.1 8 - vllm serve meta-llama/Llama-3.1-8B-Instruct \ - --enable-auto-tool-choice \ - --tool-call-parser llama3_json \ -- --chat-template examples/tool_chat_template_llama3_json.jinja -+ --chat-template examples/tool_chat_template_llama3.1_json.jinja - ``` - - Next, make a request to the model that should result in it using the available tools: -diff --git a/examples/offline_inference.py b/examples/offline_inference.py -index 23cc6e853..5db5d91df 100644 ---- a/examples/offline_inference.py -+++ b/examples/offline_inference.py -@@ -11,7 +11,13 @@ prompts = [ - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - # Create an LLM. --llm = LLM(model="facebook/opt-125m") -+llm = LLM(model="/llm/models/Llama-2-7b-chat-hf", -+ device="xpu", -+ dtype="float16", -+ enforce_eager=True, -+ tensor_parallel_size=1, -+ gpu_memory_utilization=0.9, -+ max_model_len=2048) - # Generate texts from the prompts. The output is a list of RequestOutput objects - # that contain the prompt, generated text, and other information. 
- outputs = llm.generate(prompts, sampling_params) -diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py -index d5a718626..b51bfae45 100644 ---- a/examples/offline_inference_vision_language.py -+++ b/examples/offline_inference_vision_language.py -@@ -24,10 +24,13 @@ def run_aria(question: str, modality: str): - assert modality == "image" - model_name = "rhymes-ai/Aria" - -+ # NOTE: Need L40 (or equivalent) to avoid OOM - llm = LLM(model=model_name, - tokenizer_mode="slow", -- trust_remote_code=True, - dtype="bfloat16", -+ max_model_len=4096, -+ max_num_seqs=2, -+ trust_remote_code=True, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) - - prompt = (f"<|im_start|>user\n<|img|>\n{question}" -@@ -57,6 +60,7 @@ def run_chameleon(question: str, modality: str): - prompt = f"{question}" - llm = LLM(model="facebook/chameleon-7b", - max_model_len=4096, -+ max_num_seqs=2, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) - stop_token_ids = None - return llm, prompt, stop_token_ids -@@ -257,7 +261,7 @@ def run_minicpmv(question: str, modality: str): - # 2.5 - # model_name = "openbmb/MiniCPM-Llama3-V-2_5" - -- #2.6 -+ # 2.6 - model_name = "openbmb/MiniCPM-V-2_6" - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) -@@ -308,7 +312,20 @@ def run_mllama(question: str, modality: str): - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, - ) - -- prompt = f"<|image|><|begin_of_text|>{question}" -+ tokenizer = AutoTokenizer.from_pretrained(model_name) -+ messages = [{ -+ "role": -+ "user", -+ "content": [{ -+ "type": "image" -+ }, { -+ "type": "text", -+ "text": f"{question}" -+ }] -+ }] -+ prompt = tokenizer.apply_chat_template(messages, -+ add_generation_prompt=True, -+ tokenize=False) - stop_token_ids = None - return llm, prompt, stop_token_ids - -@@ -417,9 +434,11 @@ def run_pixtral_hf(question: str, modality: str): - - model_name = "mistral-community/pixtral-12b" - -+ # NOTE: Need L40 (or equivalent) to avoid OOM - llm = LLM( - model=model_name, - max_model_len=8192, -+ max_num_seqs=2, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, - ) - -diff --git a/examples/offline_inference_whisper.py b/examples/offline_inference_whisper.py -new file mode 100644 -index 000000000..087ad4376 ---- /dev/null -+++ b/examples/offline_inference_whisper.py -@@ -0,0 +1,59 @@ -+import time -+ -+from vllm import LLM, SamplingParams -+from vllm.assets.audio import AudioAsset -+ -+# Create a Whisper encoder/decoder model instance -+llm = LLM( -+ model="openai/whisper-large-v3", -+ max_model_len=448, -+ max_num_seqs=400, -+ limit_mm_per_prompt={"audio": 1}, -+ kv_cache_dtype="fp8", -+) -+ -+prompts = [ -+ { -+ "prompt": "<|startoftranscript|>", -+ "multi_modal_data": { -+ "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate, -+ }, -+ }, -+ { # Test explicit encoder/decoder prompt -+ "encoder_prompt": { -+ "prompt": "", -+ "multi_modal_data": { -+ "audio": AudioAsset("winning_call").audio_and_sample_rate, -+ }, -+ }, -+ "decoder_prompt": "<|startoftranscript|>", -+ } -+] * 1024 -+ -+# Create a sampling params object. -+sampling_params = SamplingParams( -+ temperature=0, -+ top_p=1.0, -+ max_tokens=200, -+) -+ -+start = time.time() -+ -+# Generate output tokens from the prompts. The output is a list of -+# RequestOutput objects that contain the prompt, generated -+# text, and other information. -+outputs = llm.generate(prompts, sampling_params) -+ -+# Print the outputs. 
-+for output in outputs: -+ prompt = output.prompt -+ encoder_prompt = output.encoder_prompt -+ generated_text = output.outputs[0].text -+ print(f"Encoder prompt: {encoder_prompt!r}, " -+ f"Decoder prompt: {prompt!r}, " -+ f"Generated text: {generated_text!r}") -+ -+duration = time.time() - start -+ -+print("Duration:", duration) -+print("RPS:", len(prompts) / duration) -diff --git a/examples/sagemaker-entrypoint.sh b/examples/sagemaker-entrypoint.sh -new file mode 100644 -index 000000000..75a99ffc1 ---- /dev/null -+++ b/examples/sagemaker-entrypoint.sh -@@ -0,0 +1,24 @@ -+#!/bin/bash -+ -+# Define the prefix for environment variables to look for -+PREFIX="SM_VLLM_" -+ARG_PREFIX="--" -+ -+# Initialize an array for storing the arguments -+# port 8080 required by sagemaker, https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-code-container-response -+ARGS=(--port 8080) -+ -+# Loop through all environment variables -+while IFS='=' read -r key value; do -+ # Remove the prefix from the key, convert to lowercase, and replace underscores with dashes -+ arg_name=$(echo "${key#"${PREFIX}"}" | tr '[:upper:]' '[:lower:]' | tr '_' '-') -+ -+ # Add the argument name and value to the ARGS array -+ ARGS+=("${ARG_PREFIX}${arg_name}") -+ if [ -n "$value" ]; then -+ ARGS+=("$value") -+ fi -+done < <(env | grep "^${PREFIX}") -+ -+# Pass the collected arguments to the main entrypoint -+exec python3 -m vllm.entrypoints.openai.api_server "${ARGS[@]}" -\ No newline at end of file -diff --git a/requirements-common.txt b/requirements-common.txt -index 6c390bcfd..b22364c5a 100644 ---- a/requirements-common.txt -+++ b/requirements-common.txt -@@ -21,7 +21,7 @@ tiktoken >= 0.6.0 # Required for DBRX tokenizer - lm-format-enforcer >= 0.10.9, < 0.11 - outlines == 0.1.11 # Requires pytorch - lark == 1.2.2 --xgrammar >= 0.1.6; platform_machine == "x86_64" -+xgrammar <= 0.1.17; platform_machine == "x86_64" - typing_extensions >= 4.10 - filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 - partial-json-parser # used for parsing partial JSON outputs -diff --git a/requirements-neuron.txt b/requirements-neuron.txt -index 148fdbe0d..5e08d101f 100644 ---- a/requirements-neuron.txt -+++ b/requirements-neuron.txt -@@ -2,6 +2,6 @@ - -r requirements-common.txt - - # Dependencies for Neuron devices --transformers-neuronx >= 0.12.0 --torch-neuronx >= 2.1.2 -+transformers-neuronx >= 0.13.0 -+torch-neuronx >= 2.5.0 - neuronx-cc -diff --git a/requirements-tpu.txt b/requirements-tpu.txt -index b8f0b1546..8ab18b377 100644 ---- a/requirements-tpu.txt -+++ b/requirements-tpu.txt -@@ -18,6 +18,8 @@ ray[default] - --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html - torch==2.6.0.dev20241126+cpu - torchvision==0.20.0.dev20241126+cpu --torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp310-cp310-linux_x86_64.whl -+torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -+torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -+torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" - jaxlib==0.4.36.dev20241122 - jax==0.4.36.dev20241122 -diff 
--git a/requirements-xpu.txt b/requirements-xpu.txt -index 42c6c321d..bcbbe6a6c 100644 ---- a/requirements-xpu.txt -+++ b/requirements-xpu.txt -@@ -8,9 +8,10 @@ packaging - setuptools-scm>=8 - wheel - jinja2 -+datasets - --torch @ https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp310-cp310-linux_x86_64.whl --intel-extension-for-pytorch @ https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/intel_extension_for_pytorch-2.5.10%2Bgit9d489a8-cp310-cp310-linux_x86_64.whl --oneccl_bind_pt @ https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp310-cp310-linux_x86_64.whl -+# torch @ https://download.pytorch.org/whl/nightly/xpu/torch-2.6.0.dev20241231%2Bxpu-cp310-cp310-linux_x86_64.whl -+# intel-extension-for-pytorch @ https://ubit-artifactory-ba.intel.com/artifactory/aipc_releases-ba-local/gpu/new/validation/IPEX/weekly/PVC/2025/ww01_Stock_test/py310/intel_extension_for_pytorch-2.6.10+gite4ef6fe-cp310-cp310-linux_x86_64.whl -+# oneccl_bind_pt @ https://ubit-artifactory-ba.intel.com/artifactory/aipc_releases-ba-local/gpu/new/validation/IPEX/weekly/PVC/2025/ww01_Stock_test/py310/oneccl_bind_pt-2.5.0+xpu-cp310-cp310-linux_x86_64.whl - --triton-xpu == 3.0.0b1 -+# pytorch_triton_xpu @ https://download.pytorch.org/whl/nightly/pytorch_triton_xpu-3.2.0%2Bgite98b6fcb-cp310-cp310-linux_x86_64.whl -diff --git a/setup.py b/setup.py -index 61d2d710a..8e0ea02e4 100644 ---- a/setup.py -+++ b/setup.py -@@ -1,3 +1,4 @@ -+import ctypes - import importlib.util - import logging - import os -@@ -13,7 +14,7 @@ from packaging.version import Version, parse - from setuptools import Extension, find_packages, setup - from setuptools.command.build_ext import build_ext - from setuptools_scm import get_version --from torch.utils.cpp_extension import CUDA_HOME -+from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME - - - def load_module_from_path(module_name, path): -@@ -24,6 +25,7 @@ def load_module_from_path(module_name, path): - return module - - -+os.environ["SETUPTOOLS_SCM_PRETEND_VERSION"] = "0.6.6+ipexllm" - ROOT_DIR = os.path.dirname(__file__) - logger = logging.getLogger(__name__) - -@@ -121,6 +123,7 @@ class cmake_build_ext(build_ext): - cmake_args = [ - '-DCMAKE_BUILD_TYPE={}'.format(cfg), - '-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE), -+ "-DCMAKE_CXX_STANDARD=17", - ] - - verbose = envs.VERBOSE -@@ -342,7 +345,7 @@ def _no_device() -> bool: - def _is_cuda() -> bool: - has_cuda = torch.version.cuda is not None - return (VLLM_TARGET_DEVICE == "cuda" and has_cuda -- and not (_is_neuron() or _is_tpu() or _is_hpu())) -+ and not (_is_neuron() or _is_tpu() or _is_xpu())) - - - def _is_hip() -> bool: -@@ -376,28 +379,38 @@ def _is_xpu() -> bool: - - - def _build_custom_ops() -> bool: -- return _is_cuda() or _is_hip() or _is_cpu() -+ return _is_cuda() or _is_hip() or _is_cpu() or _is_xpu() - - --def get_hipcc_rocm_version(): -- # Run the hipcc --version command -- result = subprocess.run(['hipcc', '--version'], -- stdout=subprocess.PIPE, -- stderr=subprocess.STDOUT, -- text=True) -+def _build_core_ext() -> bool: -+ return not (_is_neuron() or _is_tpu() or _is_openvino() or _is_xpu()) - -- # Check if the command was executed successfully -- if result.returncode != 0: -- print("Error running 'hipcc --version'") -- return None - -- # Extract the version using a regular expression -- match = re.search(r'HIP version: (\S+)', result.stdout) -- if match: -- # Return the version string -- 
return match.group(1) -- else: -- print("Could not find HIP version in the output") -+def get_rocm_version(): -+ # Get the Rocm version from the ROCM_HOME/bin/librocm-core.so -+ # see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21 -+ try: -+ librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so" -+ if not librocm_core_file.is_file(): -+ return None -+ librocm_core = ctypes.CDLL(librocm_core_file) -+ VerErrors = ctypes.c_uint32 -+ get_rocm_core_version = librocm_core.getROCmVersion -+ get_rocm_core_version.restype = VerErrors -+ get_rocm_core_version.argtypes = [ -+ ctypes.POINTER(ctypes.c_uint32), -+ ctypes.POINTER(ctypes.c_uint32), -+ ctypes.POINTER(ctypes.c_uint32), -+ ] -+ major = ctypes.c_uint32() -+ minor = ctypes.c_uint32() -+ patch = ctypes.c_uint32() -+ -+ if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor), -+ ctypes.byref(patch)) == 0): -+ return "%d.%d.%d" % (major.value, minor.value, patch.value) -+ return None -+ except Exception: - return None - - -@@ -479,11 +492,10 @@ def get_vllm_version() -> str: - if "sdist" not in sys.argv: - version += f"{sep}cu{cuda_version_str}" - elif _is_hip(): -- # Get the HIP version -- hipcc_version = get_hipcc_rocm_version() -- if hipcc_version != MAIN_CUDA_VERSION: -- rocm_version_str = hipcc_version.replace(".", "")[:3] -- version += f"{sep}rocm{rocm_version_str}" -+ # Get the Rocm Version -+ rocm_version = get_rocm_version() or torch.version.hip -+ if rocm_version and rocm_version != MAIN_CUDA_VERSION: -+ version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}" - elif _is_neuron(): - # Get the Neuron version - neuron_version = str(get_neuronxcc_version()) -@@ -573,6 +585,9 @@ def get_requirements() -> List[str]: - - ext_modules = [] - -+if _build_core_ext(): -+ ext_modules.append(CMakeExtension(name="vllm._core_C")) -+ - if _is_cuda() or _is_hip(): - ext_modules.append(CMakeExtension(name="vllm._moe_C")) - -diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py -index 07c10a3a1..d4ede4d23 100644 ---- a/tests/compile/piecewise/test_toy_llama.py -+++ b/tests/compile/piecewise/test_toy_llama.py -@@ -7,7 +7,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are - initialized randomly with a fixed seed. 
- """ - from dataclasses import dataclass --from typing import Optional, Tuple -+from typing import Any, List, Optional, Tuple - - import torch - from torch import nn -@@ -54,6 +54,16 @@ class LlamaConfig: - tractable_init: bool = False - random_seed: int = 0 - -+ def compute_hash(self) -> str: -+ factors: List[Any] = [] -+ for k, v in self.__dict__.items(): -+ if k == "random_seed": -+ continue -+ factors.append((k, v)) -+ factors.sort() -+ import hashlib -+ return hashlib.md5(str(factors).encode()).hexdigest() -+ - def __post_init__(self): - assert self.mlp_size >= self.hidden_size - -@@ -263,7 +273,8 @@ def run_model(llama_config, - compilation_config = CompilationConfig( - level=CompilationLevel.NO_COMPILATION, ) - -- vllm_config = VllmConfig(compilation_config=compilation_config) -+ vllm_config = VllmConfig(compilation_config=compilation_config, -+ additional_config=llama_config) - with set_current_vllm_config(vllm_config): - model = LlamaModel(config=llama_config, - vllm_config=vllm_config, -diff --git a/tests/conftest.py b/tests/conftest.py -index 4e9392213..917151ddc 100644 ---- a/tests/conftest.py -+++ b/tests/conftest.py -@@ -31,7 +31,6 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, - to_enc_dec_tuple_list, zip_enc_dec_prompts) - from vllm.logger import init_logger - from vllm.outputs import RequestOutput --from vllm.platforms import current_platform - from vllm.sampling_params import BeamSearchParams - from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless, - identity) -@@ -41,6 +40,7 @@ logger = init_logger(__name__) - _TEST_DIR = os.path.dirname(__file__) - _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] - _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")] -+_SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt") - - _M = TypeVar("_M") - _PromptMultiModalInput = Union[List[_M], List[List[_M]]] -@@ -178,6 +178,12 @@ def example_prompts() -> List[str]: - return prompts - - -+@pytest.fixture -+def example_system_message() -> str: -+ with open(_SYS_MSG) as f: -+ return f.read() -+ -+ - class DecoderPromptType(Enum): - """For encoder/decoder models only.""" - CUSTOM = 1 -@@ -242,6 +248,7 @@ _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict) - class HfRunner: - - def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: -+ from vllm.platforms import current_platform - if x is None or isinstance(x, (bool, )): - return x - -diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py -index 8f6de84e5..33bf7e6bd 100644 ---- a/tests/core/test_scheduler.py -+++ b/tests/core/test_scheduler.py -@@ -11,6 +11,7 @@ from vllm.core.interfaces import AllocStatus - from vllm.core.scheduler import Scheduler, SchedulingBudget - from vllm.lora.request import LoRARequest - from vllm.sequence import SequenceGroup -+from vllm.sequence import Sequence, SequenceStatus - - from .utils import (append_new_token, append_new_token_seq, - append_new_token_seq_group, create_dummy_prompt, -@@ -971,3 +972,77 @@ def test_no_multiple_partial_prefills_with_chunked_prefill_and_prefix_caching( - ), "A partial prefix of C (4 tokens) should be prefilled, with the " - "remaining tokens fit into 3 token budget (4-1 from the seqA). It will " - "then be rounded down to 2 tokens on block size, thus 6 tokens in total." 
-+ -+def test_continuous_batching(): -+ block_size = 4 -+ max_num_seqs = 3 -+ num_seq_groups = 4 -+ max_model_len = 16 -+ scheduler_config = SchedulerConfig( -+ "generate", -+ max_num_batched_tokens=64, -+ max_num_seqs=max_num_seqs, -+ max_model_len=max_model_len, -+ ) -+ cache_config = CacheConfig(block_size, 1.0, 1, "auto") -+ cache_config.num_cpu_blocks = 64 -+ # Can contain up to 64 * 4 = 256 tokens -+ cache_config.num_gpu_blocks = 64 -+ -+ scheduler = Scheduler(scheduler_config, cache_config, None) -+ all_seq_groups: List[SequenceGroup] = [] -+ # Add two sequences that requires 16 token outputs -+ for i in range(num_seq_groups - 2): -+ # TODO: may need ignore_eos -+ _, seq_group = create_dummy_prompt(str(i), -+ prompt_length=block_size, -+ block_size=block_size, -+ min_tokens=16, -+ max_tokens=16) -+ all_seq_groups.append(seq_group) -+ scheduler.add_seq_group(seq_group) -+ -+ # Add one sequences that require 2 token outputs -+ _, seq_group = create_dummy_prompt(str(2), -+ prompt_length=block_size, -+ min_tokens=2, -+ max_tokens=2) -+ all_seq_groups.append(seq_group) -+ scheduler.add_seq_group(seq_group) -+ -+ # Add the last sequence, which also requires 16 tokens -+ _, seq_group = create_dummy_prompt(str(3), -+ prompt_length=block_size, -+ block_size=block_size, -+ min_tokens=16, -+ max_tokens=16) -+ all_seq_groups.append(seq_group) -+ scheduler.add_seq_group(seq_group) -+ -+ # This should generate the first token for the first three requests -+ seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) -+ assert len(get_sequence_groups(out)) == 3 -+ # Generate one token -+ append_new_token(out, 1) -+ # Generate one token -+ seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) -+ append_new_token(out, 1) -+ -+ # Now the request "2" is finished -+ all_seq_groups[2].seqs[0].status = SequenceStatus.FINISHED_LENGTH_CAPPED -+ del scheduler.running[2] -+ -+ # Now the next scheduling will schedule the last seq, which prepare for its prefill -+ seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) -+ target = set(get_sequence_groups(out)) -+ expected = set([all_seq_groups[3]]) -+ assert target == expected -+ append_new_token(out, 1) -+ -+ # The next schedule will schedule 0, 1, 3 requests -+ # Which proves that we do not need to wait for all other sequences finished to schedule the next -+ # sequence... -+ seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) -+ target = set(get_sequence_groups(out)) -+ expected = set([all_seq_groups[1], all_seq_groups[0], all_seq_groups[3]]) -+ assert target == expected -\ No newline at end of file -diff --git a/tests/distributed/test_basic_distributed_correctness_enc_dec.py b/tests/distributed/test_basic_distributed_correctness_enc_dec.py -new file mode 100644 -index 000000000..f00d5ef58 ---- /dev/null -+++ b/tests/distributed/test_basic_distributed_correctness_enc_dec.py -@@ -0,0 +1,102 @@ -+"""For encoder/decoder models only: -+Compare the outputs of HF and distributed vLLM when using greedy sampling. 
-+ -+Run: -+```sh -+cd $VLLM_PATH/tests -+ -+pytest distributed/test_basic_distributed_correctness_enc_dec.py -+``` -+""" ++""" + +import pytest +from transformers import AutoModelForSeq2SeqLM @@ -13293,2360 +7696,843 @@ index 000000000..f00d5ef58 + name_0="hf", + name_1="vllm", + ) -diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py -index 86ca1948e..4072616fd 100644 ---- a/tests/distributed/test_custom_all_reduce.py -+++ b/tests/distributed/test_custom_all_reduce.py -@@ -50,7 +50,7 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port): - - for sz in test_sizes: - for dtype in [torch.float32, torch.float16, torch.bfloat16]: -- with graph_capture() as graph_capture_context: -+ with graph_capture(device=device) as graph_capture_context: - # use integers so result matches NCCL exactly - inp1 = torch.randint(1, - 16, (sz, ), -diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py -index 3e9b0e10a..a8571a115 100644 ---- a/tests/distributed/test_pynccl.py -+++ b/tests/distributed/test_pynccl.py -@@ -59,8 +59,7 @@ def worker_fn(): - device=get_world_group().device) - tensor = torch.ones(16, 1024, 1024, - dtype=torch.float32).cuda(pynccl_comm.rank) -- with pynccl_comm.change_state(enable=True): -- tensor = pynccl_comm.all_reduce(tensor) -+ tensor = pynccl_comm.all_reduce(tensor) - torch.cuda.synchronize() - assert torch.all(tensor == pynccl_comm.world_size).cpu().item() - -@@ -81,17 +80,16 @@ def multiple_allreduce_worker_fn(): - group = groups[0] if torch.distributed.get_rank() in [0, 1] else groups[1] - pynccl_comm = PyNcclCommunicator(group=group, device=device) - tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device) -- with pynccl_comm.change_state(enable=True): -- # two groups can communicate independently -- if torch.distributed.get_rank() in [0, 1]: -- tensor = pynccl_comm.all_reduce(tensor) -- tensor = pynccl_comm.all_reduce(tensor) -- torch.cuda.synchronize() -- assert torch.all(tensor == 4).cpu().item() -- else: -- tensor = pynccl_comm.all_reduce(tensor) -- torch.cuda.synchronize() -- assert torch.all(tensor == 2).cpu().item() -+ # two groups can communicate independently -+ if torch.distributed.get_rank() in [0, 1]: -+ tensor = pynccl_comm.all_reduce(tensor) -+ tensor = pynccl_comm.all_reduce(tensor) -+ torch.cuda.synchronize() -+ assert torch.all(tensor == 4).cpu().item() -+ else: -+ tensor = pynccl_comm.all_reduce(tensor) -+ torch.cuda.synchronize() -+ assert torch.all(tensor == 2).cpu().item() - - - @pytest.mark.skipif(torch.cuda.device_count() < 4, -@@ -107,7 +105,7 @@ def multiple_allreduce_with_vllm_worker_fn(): - device = torch.device(f"cuda:{torch.distributed.get_rank()}") - ensure_model_parallel_initialized(2, 2) - tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device) -- with graph_capture(): -+ with graph_capture(device=device): - # two tp groups can communicate independently - if torch.distributed.get_rank() in [0, 1]: - tensor = tensor_model_parallel_all_reduce(tensor) -@@ -137,9 +135,7 @@ def worker_fn_with_cudagraph(): - # run something in the default stream to initialize torch engine - a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}') - torch.cuda.synchronize() -- with torch.cuda.graph( -- graph, stream=pynccl_comm.stream), pynccl_comm.change_state( -- enable=True): -+ with torch.cuda.graph(graph): - a_out = pynccl_comm.all_reduce(a) - torch.cuda.synchronize() - graph.replay() -@@ -168,8 +164,7 @@ def all_gather_worker_fn(): - for r in 
range(world_size) - ]).to(device) - -- with pynccl_comm.change_state(enable=True): -- pynccl_comm.all_gather(result, tensor) -+ pynccl_comm.all_gather(result, tensor) - torch.cuda.synchronize() - torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) - -@@ -206,8 +201,7 @@ def reduce_scatter_worker_fn(): - expected = sum(tensor[rank * scattered_size:(rank + 1) * scattered_size] - for tensor in all_tensors).to(device) - -- with pynccl_comm.change_state(enable=True): -- pynccl_comm.reduce_scatter(result, tensor) -+ pynccl_comm.reduce_scatter(result, tensor) - torch.cuda.synchronize() - torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) - -@@ -234,15 +228,13 @@ def send_recv_worker_fn(): - else: - tensor = torch.empty(16, 1024, 1024, - dtype=torch.float32).cuda(pynccl_comm.rank) -- with pynccl_comm.change_state(enable=True): -- if pynccl_comm.rank == 0: -- pynccl_comm.send(tensor, -- dst=(pynccl_comm.rank + 1) % -- pynccl_comm.world_size) -- else: -- pynccl_comm.recv(tensor, -- src=(pynccl_comm.rank - 1) % -- pynccl_comm.world_size) -+ -+ if pynccl_comm.rank == 0: -+ pynccl_comm.send(tensor, -+ dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size) -+ else: -+ pynccl_comm.recv(tensor, -+ src=(pynccl_comm.rank - 1) % pynccl_comm.world_size) - torch.cuda.synchronize() - assert torch.all(tensor == 1).cpu().item() - -@@ -273,15 +265,12 @@ def multiple_send_recv_worker_fn(): - 1024, - dtype=torch.float32, - device=device) -- with pynccl_comm.change_state(enable=True): -- if torch.distributed.get_rank() in [0, 1]: -- pynccl_comm.send(tensor, -- dst=(pynccl_comm.rank + 1) % -- pynccl_comm.world_size) -- else: -- pynccl_comm.recv(tensor, -- src=(pynccl_comm.rank - 1) % -- pynccl_comm.world_size) -+ if torch.distributed.get_rank() in [0, 1]: -+ pynccl_comm.send(tensor, -+ dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size) -+ else: -+ pynccl_comm.recv(tensor, -+ src=(pynccl_comm.rank - 1) % pynccl_comm.world_size) - torch.cuda.synchronize() - if torch.distributed.get_rank() in [0, 2]: - assert torch.all(tensor == 1).cpu().item() -diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py -index 45e6980a9..e49562ad6 100644 ---- a/tests/entrypoints/openai/test_cli_args.py -+++ b/tests/entrypoints/openai/test_cli_args.py -@@ -4,7 +4,7 @@ import pytest - - from vllm.entrypoints.openai.cli_args import (make_arg_parser, - validate_parsed_serve_args) --from vllm.entrypoints.openai.serving_engine import LoRAModulePath -+from vllm.entrypoints.openai.serving_models import LoRAModulePath - from vllm.utils import FlexibleArgumentParser - - from ...utils import VLLM_PATH -diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py -index c81cfdbbe..183d900c4 100644 ---- a/tests/entrypoints/openai/test_completion.py -+++ b/tests/entrypoints/openai/test_completion.py -@@ -28,6 +28,8 @@ PA_NAME = "swapnilbp/llama_tweet_ptune" - # need to change to match the prompt adapter - PA_NUM_VIRTUAL_TOKENS = 8 - -+GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"] -+ - - @pytest.fixture(scope="module") - def zephyr_lora_files(): -@@ -635,8 +637,7 @@ async def test_allowed_token_ids(client: openai.AsyncOpenAI): - - - @pytest.mark.asyncio --@pytest.mark.parametrize("guided_decoding_backend", -- ["outlines", "lm-format-enforcer"]) -+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) - async def test_guided_json_completion(client: openai.AsyncOpenAI, - 
guided_decoding_backend: str, - sample_json_schema): -@@ -658,8 +659,7 @@ async def test_guided_json_completion(client: openai.AsyncOpenAI, - - - @pytest.mark.asyncio --@pytest.mark.parametrize("guided_decoding_backend", -- ["outlines", "lm-format-enforcer"]) -+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) - async def test_guided_regex_completion(client: openai.AsyncOpenAI, - guided_decoding_backend: str, - sample_regex): -@@ -680,8 +680,7 @@ async def test_guided_regex_completion(client: openai.AsyncOpenAI, - - - @pytest.mark.asyncio --@pytest.mark.parametrize("guided_decoding_backend", -- ["outlines", "lm-format-enforcer"]) -+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) - async def test_guided_choice_completion(client: openai.AsyncOpenAI, - guided_decoding_backend: str, - sample_guided_choice): -@@ -761,8 +760,7 @@ async def test_echo_logprob_completion(client: openai.AsyncOpenAI, - - - @pytest.mark.asyncio --@pytest.mark.parametrize("guided_decoding_backend", -- ["outlines", "lm-format-enforcer"]) -+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) - async def test_guided_decoding_type_error(client: openai.AsyncOpenAI, - guided_decoding_backend: str, - sample_json_schema, sample_regex): -diff --git a/tests/entrypoints/openai/test_lora_lineage.py b/tests/entrypoints/openai/test_lora_lineage.py -index ab39684c2..ce4f85c13 100644 ---- a/tests/entrypoints/openai/test_lora_lineage.py -+++ b/tests/entrypoints/openai/test_lora_lineage.py -@@ -55,7 +55,10 @@ def server_with_lora_modules_json(zephyr_lora_files): - "64", - ] +diff --git a/tests/models/decoder_only/language/test_granite.py b/tests/models/decoder_only/language/test_granite.py +index 119b79d64..ff8ba7b76 100644 +--- a/tests/models/decoder_only/language/test_granite.py ++++ b/tests/models/decoder_only/language/test_granite.py +@@ -31,9 +31,11 @@ def test_models( + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, num_logprobs) -- with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: -+ # Enable the /v1/load_lora_adapter endpoint -+ envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"} +- with vllm_runner(model, dtype=dtype) as vllm_model: ++ with vllm_runner(model, dtype=dtype, ++ tokenizer_mode="mistral") as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + -+ with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server: - yield remote_server - - -@@ -67,8 +70,8 @@ async def client_for_lora_lineage(server_with_lora_modules_json): - - - @pytest.mark.asyncio --async def test_check_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI, -- zephyr_lora_files): -+async def test_static_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI, -+ zephyr_lora_files): - models = await client_for_lora_lineage.models.list() - models = models.data - served_model = models[0] -@@ -81,3 +84,26 @@ async def test_check_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI, - assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models) - assert lora_models[0].id == "zephyr-lora" - assert lora_models[1].id == "zephyr-lora2" -+ -+ -+@pytest.mark.asyncio -+async def test_dynamic_lora_lineage( -+ client_for_lora_lineage: openai.AsyncOpenAI, zephyr_lora_files): -+ -+ response = await client_for_lora_lineage.post("load_lora_adapter", -+ cast_to=str, -+ body={ -+ "lora_name": -+ "zephyr-lora-3", -+ "lora_path": -+ zephyr_lora_files -+ 
}) -+ # Ensure adapter loads before querying /models -+ assert "success" in response -+ -+ models = await client_for_lora_lineage.models.list() -+ models = models.data -+ dynamic_lora_model = models[-1] -+ assert dynamic_lora_model.root == zephyr_lora_files -+ assert dynamic_lora_model.parent == MODEL_NAME -+ assert dynamic_lora_model.id == "zephyr-lora-3" -diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py -index 51b255bb2..97248f115 100644 ---- a/tests/entrypoints/openai/test_serving_chat.py -+++ b/tests/entrypoints/openai/test_serving_chat.py -@@ -8,7 +8,8 @@ from vllm.config import MultiModalConfig - from vllm.engine.multiprocessing.client import MQLLMEngineClient - from vllm.entrypoints.openai.protocol import ChatCompletionRequest - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat --from vllm.entrypoints.openai.serving_engine import BaseModelPath -+from vllm.entrypoints.openai.serving_models import (BaseModelPath, -+ OpenAIServingModels) - from vllm.transformers_utils.tokenizer import get_tokenizer - - MODEL_NAME = "openai-community/gpt2" -@@ -33,6 +34,7 @@ class MockModelConfig: - hf_config = MockHFConfig() - logits_processor_pattern = None - diff_sampling_param: Optional[dict] = None -+ allowed_local_media_path: str = "" - - def get_diff_sampling_param(self): - return self.diff_sampling_param or {} -@@ -49,14 +51,13 @@ async def _async_serving_chat_init(): - engine = MockEngine() - model_config = await engine.get_model_config() - -+ models = OpenAIServingModels(model_config, BASE_MODEL_PATHS) - serving_completion = OpenAIServingChat(engine, - model_config, -- BASE_MODEL_PATHS, -+ models, - response_role="assistant", - chat_template=CHAT_TEMPLATE, - chat_template_content_format="auto", -- lora_modules=None, -- prompt_adapters=None, - request_logger=None) - return serving_completion - -@@ -71,14 +72,14 @@ def test_serving_chat_should_set_correct_max_tokens(): - mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) - mock_engine.errored = False - -+ models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS, -+ model_config=MockModelConfig()) - serving_chat = OpenAIServingChat(mock_engine, - MockModelConfig(), -- BASE_MODEL_PATHS, -+ models, - response_role="assistant", - chat_template=CHAT_TEMPLATE, - chat_template_content_format="auto", -- lora_modules=None, -- prompt_adapters=None, - request_logger=None) - req = ChatCompletionRequest( - model=MODEL_NAME, -@@ -114,14 +115,14 @@ def test_serving_chat_could_load_correct_generation_config(): - mock_engine.errored = False - - # Initialize the serving chat -+ models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS, -+ model_config=mock_model_config) - serving_chat = OpenAIServingChat(mock_engine, - mock_model_config, -- BASE_MODEL_PATHS, -+ models, - response_role="assistant", - chat_template=CHAT_TEMPLATE, - chat_template_content_format="auto", -- lora_modules=None, -- prompt_adapters=None, - request_logger=None) - req = ChatCompletionRequest( - model=MODEL_NAME, -diff --git a/tests/entrypoints/openai/test_serving_engine.py b/tests/entrypoints/openai/test_serving_models.py -similarity index 61% -rename from tests/entrypoints/openai/test_serving_engine.py -rename to tests/entrypoints/openai/test_serving_models.py -index 096ab6fa0..96897dc73 100644 ---- a/tests/entrypoints/openai/test_serving_engine.py -+++ b/tests/entrypoints/openai/test_serving_models.py -@@ -4,11 +4,11 @@ from unittest.mock import MagicMock - import pytest - - from 
vllm.config import ModelConfig --from vllm.engine.protocol import EngineClient - from vllm.entrypoints.openai.protocol import (ErrorResponse, - LoadLoraAdapterRequest, - UnloadLoraAdapterRequest) --from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing -+from vllm.entrypoints.openai.serving_models import (BaseModelPath, -+ OpenAIServingModels) - from vllm.lora.request import LoRARequest - - MODEL_NAME = "meta-llama/Llama-2-7b" -@@ -19,47 +19,45 @@ LORA_UNLOADING_SUCCESS_MESSAGE = ( - "Success: LoRA adapter '{lora_name}' removed successfully.") - - --async def _async_serving_engine_init(): -- mock_engine_client = MagicMock(spec=EngineClient) -+async def _async_serving_models_init() -> OpenAIServingModels: - mock_model_config = MagicMock(spec=ModelConfig) - # Set the max_model_len attribute to avoid missing attribute - mock_model_config.max_model_len = 2048 - -- serving_engine = OpenAIServing(mock_engine_client, -- mock_model_config, -- BASE_MODEL_PATHS, -- lora_modules=None, -- prompt_adapters=None, -- request_logger=None) -- return serving_engine -+ serving_models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS, -+ model_config=mock_model_config, -+ lora_modules=None, -+ prompt_adapters=None) -+ -+ return serving_models - - - @pytest.mark.asyncio - async def test_serving_model_name(): -- serving_engine = await _async_serving_engine_init() -- assert serving_engine._get_model_name(None) == MODEL_NAME -+ serving_models = await _async_serving_models_init() -+ assert serving_models.model_name(None) == MODEL_NAME - request = LoRARequest(lora_name="adapter", - lora_path="/path/to/adapter2", - lora_int_id=1) -- assert serving_engine._get_model_name(request) == request.lora_name -+ assert serving_models.model_name(request) == request.lora_name - - - @pytest.mark.asyncio - async def test_load_lora_adapter_success(): -- serving_engine = await _async_serving_engine_init() -+ serving_models = await _async_serving_models_init() - request = LoadLoraAdapterRequest(lora_name="adapter", - lora_path="/path/to/adapter2") -- response = await serving_engine.load_lora_adapter(request) -+ response = await serving_models.load_lora_adapter(request) - assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter') -- assert len(serving_engine.lora_requests) == 1 -- assert serving_engine.lora_requests[0].lora_name == "adapter" -+ assert len(serving_models.lora_requests) == 1 -+ assert serving_models.lora_requests[0].lora_name == "adapter" - - - @pytest.mark.asyncio - async def test_load_lora_adapter_missing_fields(): -- serving_engine = await _async_serving_engine_init() -+ serving_models = await _async_serving_models_init() - request = LoadLoraAdapterRequest(lora_name="", lora_path="") -- response = await serving_engine.load_lora_adapter(request) -+ response = await serving_models.load_lora_adapter(request) - assert isinstance(response, ErrorResponse) - assert response.type == "InvalidUserInput" - assert response.code == HTTPStatus.BAD_REQUEST -@@ -67,43 +65,43 @@ async def test_load_lora_adapter_missing_fields(): - - @pytest.mark.asyncio - async def test_load_lora_adapter_duplicate(): -- serving_engine = await _async_serving_engine_init() -+ serving_models = await _async_serving_models_init() - request = LoadLoraAdapterRequest(lora_name="adapter1", - lora_path="/path/to/adapter1") -- response = await serving_engine.load_lora_adapter(request) -+ response = await serving_models.load_lora_adapter(request) - assert response == LORA_LOADING_SUCCESS_MESSAGE.format( - 
lora_name='adapter1') -- assert len(serving_engine.lora_requests) == 1 -+ assert len(serving_models.lora_requests) == 1 - - request = LoadLoraAdapterRequest(lora_name="adapter1", - lora_path="/path/to/adapter1") -- response = await serving_engine.load_lora_adapter(request) -+ response = await serving_models.load_lora_adapter(request) - assert isinstance(response, ErrorResponse) - assert response.type == "InvalidUserInput" - assert response.code == HTTPStatus.BAD_REQUEST -- assert len(serving_engine.lora_requests) == 1 -+ assert len(serving_models.lora_requests) == 1 - - - @pytest.mark.asyncio - async def test_unload_lora_adapter_success(): -- serving_engine = await _async_serving_engine_init() -+ serving_models = await _async_serving_models_init() - request = LoadLoraAdapterRequest(lora_name="adapter1", - lora_path="/path/to/adapter1") -- response = await serving_engine.load_lora_adapter(request) -- assert len(serving_engine.lora_requests) == 1 -+ response = await serving_models.load_lora_adapter(request) -+ assert len(serving_models.lora_requests) == 1 - - request = UnloadLoraAdapterRequest(lora_name="adapter1") -- response = await serving_engine.unload_lora_adapter(request) -+ response = await serving_models.unload_lora_adapter(request) - assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format( - lora_name='adapter1') -- assert len(serving_engine.lora_requests) == 0 -+ assert len(serving_models.lora_requests) == 0 - - - @pytest.mark.asyncio - async def test_unload_lora_adapter_missing_fields(): -- serving_engine = await _async_serving_engine_init() -+ serving_models = await _async_serving_models_init() - request = UnloadLoraAdapterRequest(lora_name="", lora_int_id=None) -- response = await serving_engine.unload_lora_adapter(request) -+ response = await serving_models.unload_lora_adapter(request) - assert isinstance(response, ErrorResponse) - assert response.type == "InvalidUserInput" - assert response.code == HTTPStatus.BAD_REQUEST -@@ -111,9 +109,9 @@ async def test_unload_lora_adapter_missing_fields(): - - @pytest.mark.asyncio - async def test_unload_lora_adapter_not_found(): -- serving_engine = await _async_serving_engine_init() -+ serving_models = await _async_serving_models_init() - request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter") -- response = await serving_engine.unload_lora_adapter(request) -+ response = await serving_models.unload_lora_adapter(request) - assert isinstance(response, ErrorResponse) - assert response.type == "InvalidUserInput" - assert response.code == HTTPStatus.BAD_REQUEST -diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py -index 3731b2dcd..c851539c6 100644 ---- a/tests/entrypoints/openai/test_vision_embedding.py -+++ b/tests/entrypoints/openai/test_vision_embedding.py -@@ -91,5 +91,5 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str, - assert len(embeddings.data) == 1 - assert len(embeddings.data[0].embedding) == 3072 - assert embeddings.usage.completion_tokens == 0 -- assert embeddings.usage.prompt_tokens == 765 -- assert embeddings.usage.total_tokens == 765 -+ assert embeddings.usage.prompt_tokens == 764 -+ assert embeddings.usage.total_tokens == 764 -diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py -index 996e60bfe..d63b96352 100644 ---- a/tests/entrypoints/test_chat_utils.py -+++ b/tests/entrypoints/test_chat_utils.py -@@ -2,7 +2,6 @@ import warnings - from typing import Optional - - import pytest --from PIL 
import Image - - from vllm.assets.image import ImageAsset - from vllm.config import ModelConfig -@@ -91,10 +90,7 @@ def _assert_mm_data_is_image_input( - image_data = mm_data.get("image") - assert image_data is not None - -- if image_count == 1: -- assert isinstance(image_data, Image.Image) -- else: -- assert isinstance(image_data, list) and len(image_data) == image_count -+ assert isinstance(image_data, list) and len(image_data) == image_count - - - def test_parse_chat_messages_single_image( -diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py -index d37f95d48..916cc2efa 100644 ---- a/tests/kernels/test_attention_selector.py -+++ b/tests/kernels/test_attention_selector.py -@@ -5,7 +5,10 @@ import torch - - from tests.kernels.utils import override_backend_env_variable - from vllm.attention.selector import which_attn_to_use --from vllm.platforms import cpu, cuda, openvino, rocm -+from vllm.platforms.cpu import CpuPlatform -+from vllm.platforms.cuda import CudaPlatform -+from vllm.platforms.openvino import OpenVinoPlatform -+from vllm.platforms.rocm import RocmPlatform - from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL - - -@@ -20,26 +23,23 @@ def test_env(name: str, device: str, monkeypatch): - override_backend_env_variable(monkeypatch, name) - - if device == "cpu": -- with patch("vllm.attention.selector.current_platform", -- cpu.CpuPlatform()): -+ with patch("vllm.attention.selector.current_platform", CpuPlatform()): - backend = which_attn_to_use(16, torch.float16, torch.float16, 16, - False) - assert backend.name == "TORCH_SDPA" - elif device == "hip": -- with patch("vllm.attention.selector.current_platform", -- rocm.RocmPlatform()): -+ with patch("vllm.attention.selector.current_platform", RocmPlatform()): - backend = which_attn_to_use(16, torch.float16, torch.float16, 16, - False) - assert backend.name == "ROCM_FLASH" - elif device == "openvino": - with patch("vllm.attention.selector.current_platform", -- openvino.OpenVinoPlatform()): -+ OpenVinoPlatform()): - backend = which_attn_to_use(16, torch.float16, torch.float16, 16, - False) - assert backend.name == "OPENVINO" - else: -- with patch("vllm.attention.selector.current_platform", -- cuda.CudaPlatform()): -+ with patch("vllm.attention.selector.current_platform", CudaPlatform()): - backend = which_attn_to_use(16, torch.float16, torch.float16, 16, - False) - assert backend.name == name -diff --git a/tests/kernels/test_cascade_flash_attn.py b/tests/kernels/test_cascade_flash_attn.py + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, +diff --git a/tests/models/registry.py b/tests/models/registry.py +index d508c5f44..e07f88311 100644 +--- a/tests/models/registry.py ++++ b/tests/models/registry.py +@@ -202,6 +202,16 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { + "Qwen2ForCausalLM": _HfExamplesInfo("Qwen/Qwen2-7B-Instruct", + extras={"2.5": "Qwen/Qwen2.5-7B-Instruct"}), # noqa: E501 + "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"), ++ "Qwen3ForCausalLM": _HfExamplesInfo( ++ "Qwen/Qwen3-8B", ++ is_available_online=False, ++ min_transformers_version="4.51" ++ ), ++ "Qwen3MoeForCausalLM": _HfExamplesInfo( ++ "Qwen/Qwen3-MoE-15B-A2B", ++ is_available_online=False, ++ min_transformers_version="4.51" ++ ), + "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b", + is_available_online=False), + "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b", # noqa: E501 +diff --git a/tests/models/test_bart.py 
b/tests/models/test_bart.py new file mode 100644 -index 000000000..45ec6df4e +index 000000000..660b61d1a --- /dev/null -+++ b/tests/kernels/test_cascade_flash_attn.py -@@ -0,0 +1,182 @@ -+from typing import List, Optional, Tuple -+ -+import pytest -+import torch ++++ b/tests/models/test_bart.py +@@ -0,0 +1,170 @@ ++"""Compare the outputs of HF and vLLM for BART models using greedy sampling. + -+from vllm.platforms import current_platform -+from vllm.v1.attention.backends.flash_attn import (cascade_attention, -+ merge_attn_states) -+from vllm.vllm_flash_attn import flash_attn_varlen_func ++Run `pytest tests/models/test_bart.py`. ++""" ++from typing import List, Optional, Tuple + -+NUM_HEADS = [(4, 4), (8, 2), (16, 2)] -+HEAD_SIZES = [128, 192, 256] -+BLOCK_SIZES = [16] -+DTYPES = [torch.float16, torch.bfloat16] ++from vllm.utils import is_cpu + ++if not is_cpu(): ++ # CPU backend is not currently supported with encoder/decoder models ++ # skip test definitions entirely to avoid importing GPU kernel libs ++ # (xFormers, etc.) + -+@pytest.mark.parametrize("num_tokens", [1, 39, 16912]) -+@pytest.mark.parametrize("num_heads", NUM_HEADS) -+@pytest.mark.parametrize("head_size", HEAD_SIZES) -+@pytest.mark.parametrize("dtype", DTYPES) -+@torch.inference_mode() -+def test_merge_kernel( -+ num_tokens: int, -+ num_heads: Tuple[int, int], -+ head_size: int, -+ dtype: torch.dtype, -+): -+ torch.set_default_device("cuda") -+ current_platform.seed_everything(0) -+ num_query_heads = num_heads[0] -+ num_kv_heads = num_heads[1] -+ assert num_query_heads % num_kv_heads == 0 -+ -+ # Prepare inputs. -+ prefix_output = torch.randn(num_tokens, -+ num_query_heads, -+ head_size, -+ dtype=dtype) -+ suffix_output = torch.randn(num_tokens, -+ num_query_heads, -+ head_size, -+ dtype=dtype) -+ prefix_lse = torch.randn(num_query_heads, num_tokens, dtype=torch.float32) -+ suffix_lse = torch.randn(num_query_heads, num_tokens, dtype=torch.float32) -+ -+ # Run the kernel. -+ output = torch.empty(num_tokens, num_query_heads, head_size, dtype=dtype) -+ merge_attn_states(output, prefix_output, prefix_lse, suffix_output, -+ suffix_lse) -+ -+ # Reference implementation. -+ max_lse = torch.maximum(prefix_lse, suffix_lse) -+ p_lse = torch.exp(prefix_lse - max_lse) -+ s_lse = torch.exp(suffix_lse - max_lse) -+ p_scale = p_lse / (p_lse + s_lse) -+ s_scale = s_lse / (p_lse + s_lse) -+ p_scale = p_scale.transpose(0, 1).unsqueeze(2) -+ s_scale = s_scale.transpose(0, 1).unsqueeze(2) -+ ref_output = p_scale * prefix_output + s_scale * suffix_output -+ ref_output = ref_output.to(dtype) -+ -+ # Compare the results. -+ torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2) -+ -+ -+CASES = [ -+ # Case 1. A general case. -+ ([(129, 871), (18, 280), (37, 988), (1023, 2304), (1, 257)], 256), -+ # Case 2. Flash-decoding case. 
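-+ # Every query has length 1 and all sequences share a 512-token prefix.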
-+ ([(1, 1023), (1, 879), (1, 778), (1, 1777)] * 100, 512), -+] ++ import pytest ++ from transformers import AutoModelForSeq2SeqLM + ++ from vllm.sequence import SampleLogprobs + -+@pytest.mark.parametrize("seq_lens_and_common_prefix", CASES) -+@pytest.mark.parametrize("num_heads", NUM_HEADS) -+@pytest.mark.parametrize("head_size", HEAD_SIZES) -+@pytest.mark.parametrize("dtype", DTYPES) -+@pytest.mark.parametrize("block_size", BLOCK_SIZES) -+@pytest.mark.parametrize("soft_cap", [None, 50]) -+@pytest.mark.parametrize("num_blocks", [2048]) -+@torch.inference_mode() -+def test_cascade( -+ seq_lens_and_common_prefix: Tuple[List[Tuple[int, int]], int], -+ num_heads: Tuple[int, int], -+ head_size: int, -+ dtype: torch.dtype, -+ block_size: int, -+ soft_cap: Optional[float], -+ num_blocks: int, -+) -> None: -+ torch.set_default_device("cuda") -+ current_platform.seed_everything(0) -+ -+ window_size = (-1, -1) -+ scale = head_size**-0.5 -+ num_query_heads = num_heads[0] -+ num_kv_heads = num_heads[1] -+ assert num_query_heads % num_kv_heads == 0 -+ key_cache = torch.randn(num_blocks, -+ block_size, -+ num_kv_heads, -+ head_size, -+ dtype=dtype) -+ value_cache = torch.randn_like(key_cache) -+ -+ seq_lens, common_prefix_len = seq_lens_and_common_prefix -+ num_seqs = len(seq_lens) -+ query_lens = [x[0] for x in seq_lens] -+ kv_lens = [x[1] for x in seq_lens] -+ max_query_len = max(query_lens) -+ max_kv_len = max(kv_lens) -+ -+ total_num_query_tokens = sum(query_lens) -+ query = torch.randn(total_num_query_tokens, -+ num_query_heads, -+ head_size, -+ dtype=dtype) -+ cu_query_lens = torch.tensor([0] + query_lens, -+ dtype=torch.int32).cumsum(dim=0, -+ dtype=torch.int32) -+ cu_kv_lens = torch.tensor([0] + kv_lens, -+ dtype=torch.int32).cumsum(dim=0, -+ dtype=torch.int32) -+ max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size -+ block_tables = torch.randint(0, -+ num_blocks, -+ (num_seqs, max_num_blocks_per_seq), -+ dtype=torch.int32) -+ -+ assert common_prefix_len > 0 -+ assert common_prefix_len % block_size == 0 -+ num_common_kv_blocks = common_prefix_len // block_size -+ # Make sure the first `num_common_kv_blocks` blocks are the same. -+ block_tables[:, :num_common_kv_blocks] = \ -+ block_tables[0, :num_common_kv_blocks] -+ -+ # Run the regular attention. -+ ref_output = flash_attn_varlen_func( -+ q=query, -+ k=key_cache, -+ v=value_cache, -+ cu_seqlens_q=cu_query_lens, -+ cu_seqlens_k=cu_kv_lens, -+ max_seqlen_q=max_query_len, -+ max_seqlen_k=max_kv_len, -+ softmax_scale=scale, -+ causal=True, -+ window_size=window_size, -+ block_table=block_tables, -+ softcap=soft_cap if soft_cap is not None else 0, -+ ) ++ from ..conftest import DecoderPromptType ++ from .utils import check_logprobs_close + -+ # Run cascade attention. 
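-+ # The shared prefix is attended once for the whole batch, per-sequence
-+ # suffixes are attended separately, and the partial outputs are merged.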
-+ assert all(common_prefix_len < kv_len for kv_len in kv_lens) -+ cu_prefix_query_lens = torch.tensor([0, total_num_query_tokens], -+ dtype=torch.int32) -+ cu_prefix_kv_lens = torch.tensor([0, common_prefix_len], dtype=torch.int32) -+ cu_suffix_kv_lens = ( -+ cu_kv_lens - -+ torch.arange(num_seqs + 1, dtype=torch.int32) * common_prefix_len) -+ output = torch.empty_like(query) -+ cascade_attention( -+ output=output, -+ query=query, -+ key_cache=key_cache, -+ value_cache=value_cache, -+ cu_query_lens=cu_query_lens, -+ max_query_len=max_query_len, -+ cu_prefix_query_lens=cu_prefix_query_lens, -+ cu_prefix_kv_lens=cu_prefix_kv_lens, -+ cu_suffix_kv_lens=cu_suffix_kv_lens, -+ max_kv_len=max_kv_len, -+ softmax_scale=scale, -+ alibi_slopes=None, -+ sliding_window=window_size, -+ logits_soft_cap=soft_cap if soft_cap is not None else 0, -+ block_table=block_tables, -+ common_prefix_len=common_prefix_len, -+ ) ++ MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"] + -+ # Compare the results. -+ torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2) -diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py -index 8b247fb9b..57ebaa424 100644 ---- a/tests/lora/conftest.py -+++ b/tests/lora/conftest.py -@@ -4,6 +4,7 @@ from typing import Dict, List, TypedDict - from unittest.mock import MagicMock, patch - - import pytest -+import safetensors - import torch - import torch.nn as nn - from huggingface_hub import snapshot_download -@@ -169,6 +170,29 @@ def mixtral_lora_files_all_target_modules(): - return snapshot_download(repo_id="dyang415/mixtral-lora-v0") - - -+@pytest.fixture(scope="session") -+def jamba_lora_files(): -+ # some of the adapters have unnecessary weights for serving, -+ # hence we remove them -+ def remove_unnecessary_weights(path): -+ lora_path = f"{adapter_path}/adapter_model.safetensors" -+ tensors = safetensors.torch.load_file(lora_path) -+ nonlora_keys = [] -+ for k in list(tensors.keys()): -+ if "lora" not in k: -+ nonlora_keys.append(k) -+ for k in nonlora_keys: -+ del tensors[k] -+ safetensors.torch.save_file(tensors, lora_path) -+ -+ adapter_path = snapshot_download( -+ repo_id= -+ "hf-100/Jamba-1.5-mini-Spellbound-StoryWriter-0.1-6583896-ckpt53-lora") -+ -+ remove_unnecessary_weights(adapter_path) -+ return adapter_path -+ -+ - @pytest.fixture(scope="session") - def gemma_lora_files(): - return snapshot_download(repo_id="wskwon/gemma-7b-test-lora") -diff --git a/tests/lora/test_jamba.py b/tests/lora/test_jamba.py ++ def vllm_to_hf_output( ++ vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], ++ decoder_prompt_type: DecoderPromptType, ++ ): ++ """Sanitize vllm output to be comparable with hf output.""" ++ output_ids, output_str, out_logprobs = vllm_output ++ ++ hf_output_str = output_str + "" ++ if decoder_prompt_type == DecoderPromptType.NONE: ++ hf_output_str = "" + hf_output_str ++ ++ return output_ids, hf_output_str, out_logprobs ++ ++ @pytest.mark.parametrize("model", MODELS) ++ @pytest.mark.parametrize("dtype", ["float", "bfloat16"]) ++ @pytest.mark.parametrize("max_tokens", [64]) ++ @pytest.mark.parametrize("num_logprobs", [5]) ++ @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) ++ def test_models( ++ hf_runner, ++ vllm_runner, ++ example_encoder_decoder_prompts, ++ model: str, ++ dtype: str, ++ max_tokens: int, ++ num_logprobs: int, ++ decoder_prompt_type: DecoderPromptType, ++ ) -> None: ++ ''' ++ Test the vLLM BART model for a variety of encoder/decoder input prompts, ++ by validating it against HuggingFace 
(HF) BART.
++
++ Arguments:
++
++ * hf_runner: HuggingFace (HF) test model runner
++ * vllm_runner: vLLM test model runner
++ * example_encoder_decoder_prompts: test fixture which provides a
++ dictionary of dummy prompts
++ * model: the HF ID of the specific BART variant under test
++ * dtype: the tensor datatype to employ
++ * max_tokens
++ * num_logprobs
++ * decoder_prompt_type: key into the example_encoder_decoder_prompts
++ dictionary; selects specific encoder/decoder
++ prompt scenarios to test
++
++ A note on using HF BART as a baseline for validating vLLM BART,
++ specifically when the decoder prompt is None.
++
++ The HF GenerationMixin's default behavior is to force the first
++ decoded token to be <BOS> if the prompt does not already contain
++ <BOS> (this is accomplished using a logit
++ processor setting.)
++
++ So when we use HF BART as our baseline for comparison, note that
++ when the user provides a request with a None decoder prompt
++ (i.e. a singleton encoder prompt, or else an explicit encoder/
++ decoder prompt with the decoder sub-prompt set to None), HF and
++ vLLM handle this in different ways:
++
++ * HF will (1) tokenize the None prompt as an empty token-list,
++ (2) append <decoder-start-token> to the beginning, yielding
++ [<decoder-start-token>], (3) pass this token list to the model, and
++ then (4) after computing logits during prefill, override the model
++ logits & force <BOS> to be the first generated token.
++
++ * vLLM will (1) tokenize the None prompt as [], (2) append decoder-
++ start-token to the beginning, yielding [<decoder-start-token>],
++ (3) pass these tokens to the model & proceed with generation.
++
++ The net effect is that compared to vLLM, the list of HF *decoded* tokens
++ will contain one more initial <BOS> than the vLLM generated tokens,
++ because vLLM's <BOS> token is injected into the prompt rather than into
++ the generated output. This is in spite of the fact that overall, the
++ complete sequences (prompt + decoded tokens) produced by vLLM will match
++ HF.
++
++ So when we use HF decoded token output to validate vLLM's decoded token
++ output, the testing process must account for the difference in decoded
++ token sequences between vLLM and HF specifically in the
++ decoder-prompt-is-None case.
++
++ One option is to disable the logit processor feature that forces the
++ <BOS> token to be decoded (forced_bos_token_id = None), eliminating
++ the problem entirely. However this is not "normal" BART usage.
++
++ The other option is - only in the decoder-prompt-is-None case - to
++ discard the first decoded token from the HF output before comparing it
++ to vLLM.
++
++ To that end, when testing the scenario where the decoder prompt is None
++ (and only in that one scenario), this test skips the first HF decoded
++ token during the process of validating the vLLM decoded output.
++ '''
++
++ test_case_prompts = example_encoder_decoder_prompts[
++ decoder_prompt_type]
++
++ # Configuration settings for HF baseline
++ hf_kwargs = {
++ "top_k": None,
++ "num_beams": 1,
++ "repetition_penalty": 1.0,
++ "top_p": 1.0,
++ "length_penalty": 1.0,
++ "early_stopping": False,
++ "no_repeat_ngram_size": None,
++ "min_length": 0
++ }
++
++ with hf_runner(model, dtype=dtype,
++ auto_cls=AutoModelForSeq2SeqLM) as hf_model:
++ hf_outputs = (
++ hf_model.generate_encoder_decoder_greedy_logprobs_limit(
++ test_case_prompts,
++ max_tokens,
++ num_logprobs,
++ **hf_kwargs,
++ ))
++
++ # Note: currently encoder/decoder models are only compatible with
++ # enforce_eager=True. 
Normally this is not a problem because ++ # for encoder/decoder models vLLM will ++ # default to enforce_eager=True if enforce_eager ++ # is left unspecified. However, the ++ # VllmRunner test fixture (which wraps around the LLM class) defaults to ++ # enforce_eager=False (a behavior which a number of already-exisitng ++ # decoder-only unit tests expect), so when testing an encoder/decoder ++ # model we must explicitly specify enforce_eager=True in the VllmRunner ++ # constructor. ++ with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: ++ vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( ++ test_case_prompts, max_tokens, num_logprobs) ++ ++ hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE ++ else 0) ++ ++ check_logprobs_close( ++ outputs_0_lst=hf_outputs, ++ outputs_1_lst=[ ++ vllm_to_hf_output(vllm_output, decoder_prompt_type) ++ for vllm_output in vllm_outputs ++ ], ++ name_0="hf", ++ name_1="vllm", ++ num_outputs_0_skip_tokens=hf_skip_tokens, ++ ) +diff --git a/tests/models/test_gguf.py b/tests/models/test_gguf.py new file mode 100644 -index 000000000..6aa33926c +index 000000000..196cd88e0 --- /dev/null -+++ b/tests/lora/test_jamba.py -@@ -0,0 +1,54 @@ -+from typing import List ++++ b/tests/models/test_gguf.py +@@ -0,0 +1,90 @@ ++""" ++Tests gguf models against unquantized models generations ++Note: To pass the test, quantization higher than Q4 should be used ++""" + -+import pytest -+import torch ++import os + -+import vllm -+from vllm.lora.request import LoRARequest ++import pytest ++from huggingface_hub import hf_hub_download ++from transformers import AutoTokenizer + -+MODEL_PATH = "ai21labs/AI21-Jamba-1.5-Mini" ++from tests.quantization.utils import is_quant_method_supported + -+MAX_TOKENS = 40 ++from .utils import check_logprobs_close + ++os.environ["TOKENIZERS_PARALLELISM"] = "true" + -+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, -+ prompts: List[str]) -> List[str]: ++MAX_MODEL_LEN = 1024 + -+ sampling_params = vllm.SamplingParams(temperature=0, max_tokens=MAX_TOKENS) -+ outputs = llm.generate( -+ prompts, -+ sampling_params, -+ lora_request=LoRARequest(str(lora_id), lora_id, lora_path) -+ if lora_id else None) -+ # Print the outputs. 
-+ generated_texts: List[str] = [] -+ for output in outputs: -+ prompt = output.prompt -+ generated_text = output.outputs[0].text.strip() -+ generated_texts.append(generated_text) -+ print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") -+ return generated_texts ++# FIXME: Move this to confest ++MODELS = [ ++ ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", ++ hf_hub_download("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", ++ filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")), ++ ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", ++ hf_hub_download("duyntnet/TinyLlama-1.1B-Chat-v1.0-imatrix-GGUF", ++ filename="TinyLlama-1.1B-Chat-v1.0-IQ4_XS.gguf")), ++ ("Qwen/Qwen2-1.5B-Instruct", ++ hf_hub_download("Qwen/Qwen2-1.5B-Instruct-GGUF", ++ filename="qwen2-1_5b-instruct-q4_k_m.gguf")), ++ ("Qwen/Qwen2-1.5B-Instruct", ++ hf_hub_download("legraphista/Qwen2-1.5B-Instruct-IMat-GGUF", ++ filename="Qwen2-1.5B-Instruct.IQ4_XS.gguf")), ++] + + -+@pytest.mark.parametrize("tp_size", [4]) -+def test_jamba_lora(jamba_lora_files, tp_size): -+ """Original test, the LoRA model has the common target modules, not all""" -+ if torch.cuda.device_count() < tp_size: ++@pytest.mark.skipif(not is_quant_method_supported("gguf"), ++ reason="gguf is not supported on this GPU type.") ++@pytest.mark.parametrize("model", MODELS) ++@pytest.mark.parametrize("dtype", ["half"]) ++@pytest.mark.parametrize("max_tokens", [32]) ++@pytest.mark.parametrize("num_logprobs", [5]) ++@pytest.mark.parametrize("tp_size", [1, 2]) ++def test_models( ++ num_gpus_available, ++ vllm_runner, ++ example_prompts, ++ model, ++ dtype: str, ++ max_tokens: int, ++ num_logprobs: int, ++ tp_size: int, ++) -> None: ++ if num_gpus_available < tp_size: + pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") + -+ prompts = ["Write a story about a sheep and a goat."] ++ original_model, gguf_model = model + -+ llm = vllm.LLM( -+ MODEL_PATH, -+ enable_lora=True, -+ max_num_seqs=16, -+ max_loras=4, -+ distributed_executor_backend="ray", -+ tensor_parallel_size=tp_size, -+ ) ++ tokenizer = AutoTokenizer.from_pretrained(original_model) ++ messages = [[{ ++ 'role': 'user', ++ 'content': prompt ++ }] for prompt in example_prompts] ++ example_prompts = tokenizer.apply_chat_template(messages, ++ tokenize=False, ++ add_generation_prompt=True) + -+ expected_jamba_output = [ -+ """Once upon a time, in a lush green meadow, there lived a sheep named Clara and a goat named Billy. Clara was a gentle creature, always nibbling on the soft grass and humming""" # noqa: E501 -+ ] -+ assert do_sample(llm, jamba_lora_files, lora_id=1, -+ prompts=prompts) == expected_jamba_output -diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py -index 0b76f4667..a099f36b0 100644 ---- a/tests/lora/test_lora_manager.py -+++ b/tests/lora/test_lora_manager.py -@@ -1,4 +1,5 @@ - import json -+import math - import os - from typing import Dict, List - -@@ -50,6 +51,18 @@ def test_peft_helper(sql_lora_files): - "embed_tokens", - "lm_head", - ] -+ scaling = peft_helper.lora_alpha / peft_helper.r -+ assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3 -+ -+ # test RSLoRA -+ config = dict(r=8, -+ lora_alpha=16, -+ target_modules=["gate_proj"], -+ use_rslora=True) -+ peft_helper = PEFTHelper.from_dict(config) -+ -+ scaling = peft_helper.lora_alpha / math.sqrt(peft_helper.r) -+ assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3 - - expected_error = "vLLM only supports modules_to_save being None." 
- with pytest.raises(ValueError, match=expected_error): -@@ -60,13 +73,6 @@ def test_peft_helper(sql_lora_files): - modules_to_save=["lm_head"], - ) - PEFTHelper.from_dict(config) -- expected_error = "vLLM does not yet support RSLoRA." -- with pytest.raises(ValueError, match=expected_error): -- config = dict(r=8, -- lora_alpha=16, -- target_modules=["gate_proj"], -- use_rslora=True) -- PEFTHelper.from_dict(config) - - expected_error = "vLLM does not yet support DoRA." - with pytest.raises(ValueError, match=expected_error): -diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py -index c9f48402b..ebdd129db 100644 ---- a/tests/lora/test_qwen2vl.py -+++ b/tests/lora/test_qwen2vl.py -@@ -7,7 +7,7 @@ from vllm.assets.image import ImageAsset - from vllm.lora.request import LoRARequest - from vllm.platforms import current_platform - --MODEL_PATH = "Qwen/Qwen2-VL-7B-Instruct" -+MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct" - - PROMPT_TEMPLATE = ( - "<|im_start|>system\nYou are a helpful assistant.<|im_end|>" -@@ -49,10 +49,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: - # Print the outputs. - generated_texts: List[str] = [] - for output in outputs: -- prompt = output.prompt - generated_text = output.outputs[0].text.strip() - generated_texts.append(generated_text) -- print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") -+ print(f"Generated text: {generated_text!r}") - return generated_texts - - -diff --git a/tests/models/decoder_only/language/test_granite.py b/tests/models/decoder_only/language/test_granite.py -index 5e93842f4..855653264 100644 ---- a/tests/models/decoder_only/language/test_granite.py -+++ b/tests/models/decoder_only/language/test_granite.py -@@ -30,9 +30,11 @@ def test_models( - hf_outputs = hf_model.generate_greedy_logprobs_limit( - example_prompts, max_tokens, num_logprobs) - -- with vllm_runner(model, dtype=dtype) as vllm_model: -+ with vllm_runner(model, dtype=dtype, -+ tokenizer_mode="mistral") as vllm_model: - vllm_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) -+ - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, -diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py -deleted file mode 100644 -index 51c008510..000000000 ---- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py -+++ /dev/null -@@ -1,70 +0,0 @@ --import pytest -- --from vllm.inputs import InputContext -- --from ....utils import build_model_context -- -- --@pytest.fixture() --def get_max_llava_next_image_tokens(): -- from vllm.model_executor.models.llava_next import ( -- get_max_llava_next_image_tokens) -- return get_max_llava_next_image_tokens -- -- --@pytest.fixture() --def dummy_data_for_llava_next(): -- from vllm.model_executor.models.llava_next import dummy_data_for_llava_next -- return dummy_data_for_llava_next -- -- --@pytest.mark.parametrize("gridpoints,expected_max_tokens", [ -- ([[336, 336]], 1176), -- ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], 2928), --]) --def test_get_max_llava_next_image_tokens(gridpoints, expected_max_tokens, -- get_max_llava_next_image_tokens): -- ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf") -- -- # Update the config image_grid_pinpoints -- # and calculate the resulting max tokens -- ctx.model_config.hf_config.image_grid_pinpoints = gridpoints -- -- 
actual_max_tokens = get_max_llava_next_image_tokens( -- InputContext(ctx.model_config)) -- -- assert expected_max_tokens == actual_max_tokens -- -- --@pytest.mark.parametrize( -- "gridpoints,expected_size", -- [ -- # One point; it has to be the largest -- ([[336, 336]], (336, 336)), -- # Default for most llava next models; the 2x2 tile is the largest -- ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], -- (672, 672)), -- # If two rectangular gridpoints are the same, the more vertical -- # one has the higher feature count due to newline features -- ([[336, 672], [672, 336]], (672, 336)) -- ]) --def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next, -- gridpoints, expected_size): -- ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf") -- -- # Update the config image_grid_pinpoints -- ctx.model_config.hf_config.image_grid_pinpoints = gridpoints -- seq_len = 5000 # bigger than the max feature size for any image -- -- dummy_data = dummy_data_for_llava_next( -- ctx, -- seq_len=seq_len, -- mm_counts={"image": 1}, -- ) -- seq_data = dummy_data.seq_data -- mm_data = dummy_data.multi_modal_data -- -- # The dummy data dims should match the gridpoint with the biggest feat size -- assert mm_data["image"].height == expected_size[0] -- assert mm_data["image"].width == expected_size[1] -- assert len(seq_data.get_token_ids()) >= seq_len -diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py -deleted file mode 100644 -index f95cee277..000000000 ---- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py -+++ /dev/null -@@ -1,98 +0,0 @@ --"""Tests for phi3v's multimodal preprocessing kwargs.""" --from typing import Optional -- --import pytest --from transformers import AutoTokenizer -- --from vllm.inputs import InputContext, InputProcessingContext --from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID -- --from .....conftest import _ImageAssets --from ....utils import build_model_context -- --models = ["microsoft/Phi-3.5-vision-instruct"] -- -- --# Wrap lazy imports to avoid initializing CUDA during test collection --@pytest.fixture() --def processor_for_phi3v(): -- from vllm.model_executor.models.phi3v import Phi3VMultiModalProcessor -- return Phi3VMultiModalProcessor -- -- --@pytest.fixture() --def get_max_phi3v_image_tokens(): -- from vllm.model_executor.models.phi3v import get_max_phi3v_image_tokens -- return get_max_phi3v_image_tokens -- -- --@pytest.mark.parametrize("model", models) --@pytest.mark.parametrize("num_crops,expected_max_tokens", [ -- (4, 781), -- (16, 2653), --]) --def test_max_tokens_override(get_max_phi3v_image_tokens, model: str, -- num_crops: int, expected_max_tokens: int): -- """Ensure get_max_phi3v_image_tokens handles num_crops properly.""" -- # NOTE: mm_processor_kwargs on the context in this test is unused, since -- # this is testing the mapper directly. In practice, the processor kwargs -- # are wrapped in a closure when calling the max tokens func. We explicitly -- # do NOT use the mm_processor_kwargs in the model context here to ensure -- # that the max image tokens implementation is referencing a mix of the -- # kwargs to the function and the original mm_processor_kwargs in case -- # values are somehow updated and end up in a bad state. 
-- ctx = build_model_context( -- model_name=model, -- tokenizer_name=model, -- trust_remote_code=True, -- mm_processor_kwargs=None, -- ) -- -- actual_max_tokens = get_max_phi3v_image_tokens( -- InputContext(ctx.model_config), -- num_crops=num_crops, -- ) -- -- assert expected_max_tokens == actual_max_tokens -- -- --@pytest.mark.parametrize("model", models) --@pytest.mark.parametrize( -- "num_crops,expected_toks_per_img", -- [ -- (4, 757), -- (16, 1921), -- # the default num_crops of phi-3.5-vision is 4 -- (None, 757), -- ]) --@pytest.mark.parametrize("num_imgs", [1, 2]) --def test_processor_override(processor_for_phi3v, image_assets: _ImageAssets, -- model: str, num_crops: Optional[int], -- expected_toks_per_img: int, num_imgs: int): -- """Ensure input_processor_for_phi3v handles num_crops properly.""" -- # Same as the previous test - don't initialize mm_processor_kwargs -- # in this test and assume that the kwargs will be correctly expanded by -- # the partial when calling the custom input processor. -- ctx = build_model_context( -- model_name=model, -- tokenizer_name=model, -- trust_remote_code=True, -- ) -- tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) -- ctx = InputProcessingContext(ctx.model_config, tokenizer) -- # Build the image str / prompt based on the number of images we pass -- img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)]) -- prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n" -- images = [image_assets[0].pil_image] * num_imgs -- -- mm_data = {"image": images} -- mm_processor_kwargs = {} -- if num_crops is not None: -- mm_processor_kwargs = {"num_crops": num_crops} -- -- processor = processor_for_phi3v(ctx) -- processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) -- -- # Ensure we have the right number of placeholders per num_crops size -- img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID) -- assert img_tok_count == expected_toks_per_img * num_imgs -diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py -deleted file mode 100644 -index cd8954ffc..000000000 ---- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py -+++ /dev/null -@@ -1,105 +0,0 @@ --from typing import Any, Dict, Tuple -- --import pytest --from transformers import AutoTokenizer -- --from vllm.inputs import InputContext, InputProcessingContext -- --from .....conftest import _ImageAssets --from ....utils import build_model_context -- --MODEL = "Qwen/Qwen2-VL-2B-Instruct" --MIN_PIXELS = "min_pixels" --MAX_PIXELS = "max_pixels" -- -- --# Fixtures lazy import to avoid initializing CUDA during test collection --# NOTE: Qwen2VL supports multiple input modalities, so it registers multiple --# input mappers. 
--@pytest.fixture() --def processor_for_qwen2_vl(): -- from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalProcessor -- return Qwen2VLMultiModalProcessor -- -- --@pytest.fixture() --def get_max_qwen2_vl_image_tokens(): -- from vllm.model_executor.models.qwen2_vl import ( -- get_max_qwen2_vl_image_tokens) -- return get_max_qwen2_vl_image_tokens -- -- --@pytest.mark.parametrize("mm_processor_kwargs,expected_max_tokens", [ -- ({}, 1225), -- ({ -- MIN_PIXELS: 64**2, -- MAX_PIXELS: 512**2 -- }, 324), --]) --@pytest.mark.parametrize("model", [MODEL]) --def test_qwen2_vl_max_image_tokens( -- get_max_qwen2_vl_image_tokens, -- model: str, -- mm_processor_kwargs: Dict[str, Any], -- expected_max_tokens: int, --): -- """Ensure that the max token calc handles min/max pixels properly.""" -- ctx = build_model_context( -- model_name=model, -- tokenizer_name=model, -- mm_processor_kwargs=None, -- ) -- -- actual_max_tokens = get_max_qwen2_vl_image_tokens( -- InputContext(ctx.model_config), **mm_processor_kwargs) -- assert actual_max_tokens == expected_max_tokens -- -- --@pytest.mark.parametrize( -- "mm_processor_kwargs, expected_toks_per_img, expected_pixels_shape", [ -- ({}, 1426, (5704, 1176)), -- ({ -- MIN_PIXELS: 64**2, -- MAX_PIXELS: 512**2 -- }, 330, (1320, 1176)), -- ]) --@pytest.mark.parametrize("model", [MODEL]) --@pytest.mark.parametrize("num_imgs", [1, 2]) --def test_processor_override( -- processor_for_qwen2_vl, -- image_assets: _ImageAssets, -- model: str, -- mm_processor_kwargs: Dict[str, Any], -- expected_toks_per_img: int, -- expected_pixels_shape: Tuple[int, int], -- num_imgs: int, --): -- """Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly.""" -- # Same as the previous test - don't initialize mm_processor_kwargs -- # in this test and assume that the kwargs will be correctly expanded by -- # the partial when calling the custom input processor. 
-- ctx = build_model_context( -- model_name=model, -- tokenizer_name=model, -- mm_processor_kwargs=None, -- ) -- tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) -- ctx = InputProcessingContext(ctx.model_config, tokenizer) -- # Build the image str / prompt based on the number of images we pass -- prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs -- images = [image_assets[0].pil_image] * num_imgs -- -- mm_data = {"image": images} -- -- processor = processor_for_qwen2_vl(ctx) -- processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) -- -- # Ensure we have the right number of placeholders per num_crops size -- hf_processor = processor._get_hf_processor(**mm_processor_kwargs) -- image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token) -- img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) -- pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape -- -- assert img_tok_count == expected_toks_per_img * num_imgs -- assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs -- assert pixel_shape[1] == expected_pixels_shape[1] -diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py b/tests/models/decoder_only/vision_language/processing/__init__.py -similarity index 100% -rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py -rename to tests/models/decoder_only/vision_language/processing/__init__.py -diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py b/tests/models/decoder_only/vision_language/processing/test_idefics3.py -similarity index 100% -rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py -rename to tests/models/decoder_only/vision_language/processing/test_idefics3.py -diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_internvl.py b/tests/models/decoder_only/vision_language/processing/test_internvl.py -similarity index 100% -rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/test_internvl.py -rename to tests/models/decoder_only/vision_language/processing/test_internvl.py -diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_next.py b/tests/models/decoder_only/vision_language/processing/test_llava_next.py -new file mode 100644 -index 000000000..6c8d30071 ---- /dev/null -+++ b/tests/models/decoder_only/vision_language/processing/test_llava_next.py -@@ -0,0 +1,57 @@ -+import pytest -+from PIL import Image -+from transformers import AutoTokenizer ++ # Run unquantized model. ++ with vllm_runner(model_name=original_model, ++ dtype=dtype, ++ max_model_len=MAX_MODEL_LEN, ++ tensor_parallel_size=tp_size) as original_model: + -+from vllm.inputs import InputProcessingContext ++ original_outputs = original_model.generate_greedy_logprobs( ++ example_prompts[:-1], max_tokens, num_logprobs) + -+from ....utils import build_model_context ++ # Run gguf model. 
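+ # (same prompts, max_tokens and num_logprobs as the unquantized run above)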
++ with vllm_runner(model_name=gguf_model, ++ dtype=dtype, ++ max_model_len=MAX_MODEL_LEN, ++ tensor_parallel_size=tp_size) as gguf_model: ++ gguf_outputs = gguf_model.generate_greedy_logprobs( ++ example_prompts[:-1], max_tokens, num_logprobs) + ++ check_logprobs_close( ++ outputs_0_lst=original_outputs, ++ outputs_1_lst=gguf_outputs, ++ name_0="original", ++ name_1="gguf", ++ ) +diff --git a/tests/models/test_granite.py b/tests/models/test_granite.py +new file mode 100644 +index 000000000..2435b5dc3 +--- /dev/null ++++ b/tests/models/test_granite.py +@@ -0,0 +1,49 @@ ++"""Compare the outputs of HF and vLLM for Granite models using greedy sampling. + -+# Fixtures lazy import to avoid initializing CUDA during test collection -+@pytest.fixture() -+def processor_for_llava_next(): -+ from vllm.model_executor.models.llava_next import ( -+ LlavaNextMultiModalProcessor) -+ return LlavaNextMultiModalProcessor ++Run `pytest tests/models/test_granite.py`. ++""" ++import importlib.metadata + ++import pytest + -+@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) -+@pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488), -+ (488, 183), (198, 176), (176, 198)]) -+@pytest.mark.parametrize("num_imgs", [1, 2]) -+def test_processor_prompt_replacements( -+ processor_for_llava_next, -+ model_id: str, -+ image_size: tuple[int, int], -+ num_imgs: int, -+): -+ """ -+ Ensure LlavaNextMultiModalProcessor handles prompt replacement properly. -+ """ -+ ctx = build_model_context( -+ model_name=model_id, -+ tokenizer_name=model_id, -+ mm_processor_kwargs=None, -+ limit_mm_per_prompt={"image": num_imgs}, -+ ) -+ tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) -+ ctx = InputProcessingContext(ctx.model_config, tokenizer) ++from .utils import check_logprobs_close + -+ # Build the image str / prompt based on the number of images we pass -+ prompt = "" * num_imgs -+ mm_data = {"image": [Image.new("RGB", size=image_size)] * num_imgs} ++TRANSFORMERS_VERSION = tuple( ++ map(int, ++ importlib.metadata.version("transformers").split("."))) + -+ # The processor will throw an error if there is a mismatch -+ # in the prompt replacements -+ processor = processor_for_llava_next(ctx) -+ processed_inputs = processor.apply(prompt, mm_data, {}) ++MODELS = [ ++ "ibm/PowerLM-3b", ++] + -+ image_placeholders = processed_inputs["mm_placeholders"]["image"] -+ assert len(image_placeholders) == num_imgs + -+ first_placeholder = image_placeholders[0] ++# GraniteForCausalLM will be in transformers >= 4.45 ++@pytest.mark.skipif(TRANSFORMERS_VERSION < (4, 45), ++ reason="granite model test requires transformers >= 4.45") ++@pytest.mark.parametrize("model", MODELS) ++@pytest.mark.parametrize("dtype", ["bfloat16"]) ++@pytest.mark.parametrize("max_tokens", [64]) ++@pytest.mark.parametrize("num_logprobs", [5]) ++def test_models( ++ hf_runner, ++ vllm_runner, ++ example_prompts, ++ model: str, ++ dtype: str, ++ max_tokens: int, ++ num_logprobs: int, ++) -> None: ++ # TODO(sang): Sliding window should be tested separately. 
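+ # Greedy logprobs from the HF baseline and vLLM are compared below
+ # via check_logprobs_close.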
++ with hf_runner(model, dtype=dtype) as hf_model: ++ hf_outputs = hf_model.generate_greedy_logprobs_limit( ++ example_prompts, max_tokens, num_logprobs) + -+ # NOTE: There is a BOS token -+ assert first_placeholder["offset"] == 1 -+ assert first_placeholder["length"] == ( -+ len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs -diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py ++ with vllm_runner(model, dtype=dtype) as vllm_model: ++ vllm_outputs = vllm_model.generate_greedy_logprobs( ++ example_prompts, max_tokens, num_logprobs) ++ check_logprobs_close( ++ outputs_0_lst=hf_outputs, ++ outputs_1_lst=vllm_outputs, ++ name_0="hf", ++ name_1="vllm", ++ ) +diff --git a/tests/models/test_intern_vit.py b/tests/models/test_intern_vit.py new file mode 100644 -index 000000000..71adde656 +index 000000000..816f846f6 --- /dev/null -+++ b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py -@@ -0,0 +1,59 @@ -+import pytest -+from PIL import Image -+from transformers import AutoTokenizer ++++ b/tests/models/test_intern_vit.py +@@ -0,0 +1,79 @@ ++from typing import Optional + -+from vllm.inputs import InputProcessingContext ++import pytest ++import torch ++import torch.nn as nn ++from huggingface_hub import snapshot_download ++from transformers import AutoConfig, AutoModel, CLIPImageProcessor + -+from ....utils import build_model_context ++from ..conftest import _ImageAssets, cleanup + ++pytestmark = pytest.mark.vlm + -+# Fixtures lazy import to avoid initializing CUDA during test collection -+@pytest.fixture() -+def processor_for_llava_onevision(): -+ from vllm.model_executor.models.llava_onevision import ( -+ LlavaOnevisionMultiModalProcessor) -+ return LlavaOnevisionMultiModalProcessor ++# we use snapshot_download to prevent conflicts between ++# dynamic_module and trust_remote_code for hf_runner ++DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"] ++models = [ ++ snapshot_download("OpenGVLab/InternViT-300M-448px", ++ allow_patterns=DOWNLOAD_PATTERN), ++ snapshot_download("OpenGVLab/InternViT-6B-448px-V1-5", ++ allow_patterns=DOWNLOAD_PATTERN), ++] + + -+@pytest.mark.parametrize("model_id", -+ ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]) -+@pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488), -+ (488, 183), (198, 176), (176, 198)]) -+@pytest.mark.parametrize("num_imgs", [1, 2]) -+def test_processor_prompt_replacements( -+ processor_for_llava_onevision, -+ model_id: str, -+ image_size: tuple[int, int], -+ num_imgs: int, ++def run_intern_vit_test( ++ image_assets: _ImageAssets, ++ model: str, ++ *, ++ dtype: str, ++ distributed_executor_backend: Optional[str] = None, +): -+ """ -+ Ensure LlavaOnevisionMultiModalProcessor handles prompt replacement -+ properly. 
-+ """ -+ ctx = build_model_context( -+ model_name=model_id, -+ tokenizer_name=model_id, -+ mm_processor_kwargs=None, -+ limit_mm_per_prompt={"image": num_imgs}, -+ ) -+ tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) -+ ctx = InputProcessingContext(ctx.model_config, tokenizer) -+ -+ # Build the image str / prompt based on the number of images we pass -+ prompt = "" * num_imgs -+ mm_data = {"image": [Image.new("RGB", size=image_size)] * num_imgs} -+ -+ # The processor will throw an error if there is a mismatch -+ # in the prompt replacements -+ processor = processor_for_llava_onevision(ctx) -+ processed_inputs = processor.apply(prompt, mm_data, {}) -+ -+ image_placeholders = processed_inputs["mm_placeholders"]["image"] -+ assert len(image_placeholders) == num_imgs ++ img_processor = CLIPImageProcessor.from_pretrained(model) ++ images = [asset.pil_image for asset in image_assets] ++ pixel_values = [ ++ img_processor(images, return_tensors='pt').pixel_values.to(dtype) ++ for images in images ++ ] + -+ first_placeholder = image_placeholders[0] ++ config = AutoConfig.from_pretrained(model, trust_remote_code=True) ++ if not getattr(config, "norm_type", None): ++ config.norm_type = "rms_norm" + -+ # NOTE: There is a BOS token -+ assert first_placeholder["offset"] == 0 -+ assert first_placeholder["length"] == len( -+ processed_inputs["prompt_token_ids"]) // num_imgs -diff --git a/tests/models/decoder_only/vision_language/processing/test_phi3v.py b/tests/models/decoder_only/vision_language/processing/test_phi3v.py -new file mode 100644 -index 000000000..249045b3c ---- /dev/null -+++ b/tests/models/decoder_only/vision_language/processing/test_phi3v.py -@@ -0,0 +1,59 @@ -+"""Tests for phi3v's multimodal preprocessing kwargs.""" -+import pytest -+from transformers import AutoTokenizer ++ hf_model = AutoModel.from_pretrained(model, ++ torch_dtype=dtype, ++ trust_remote_code=True).to("cuda") ++ hf_outputs_per_image = [ ++ hf_model(pixel_value.to("cuda")).last_hidden_state ++ for pixel_value in pixel_values ++ ] + -+from vllm.inputs import InputProcessingContext -+from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID ++ from vllm.model_executor.models.intern_vit import InternVisionModel ++ vllm_model = InternVisionModel(config) ++ vllm_model.load_weights(hf_model.state_dict().items()) + -+from .....conftest import _ImageAssets -+from ....utils import build_model_context ++ del hf_model ++ cleanup() + ++ vllm_model = vllm_model.to("cuda", dtype) ++ vllm_outputs_per_image = [ ++ vllm_model(pixel_values=pixel_value.to("cuda")) ++ for pixel_value in pixel_values ++ ] ++ del vllm_model ++ cleanup() + -+# Wrap lazy imports to avoid initializing CUDA during test collection -+@pytest.fixture() -+def processor_for_phi3v(): -+ from vllm.model_executor.models.phi3v import Phi3VMultiModalProcessor -+ return Phi3VMultiModalProcessor ++ cos_similar = nn.CosineSimilarity(dim=-1) ++ for vllm_output, hf_output in zip(vllm_outputs_per_image, ++ hf_outputs_per_image): ++ assert cos_similar(vllm_output, hf_output).mean() > 0.99 + + -+@pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"]) -+# yapf: disable -+@pytest.mark.parametrize( -+ ("mm_processor_kwargs", "expected_toks_per_img"), -+ [ -+ ({"num_crops": 4}, 757), -+ ({"num_crops": 16}, 1921), -+ # the default num_crops of phi-3.5-vision is 4 -+ ({}, 757), -+ ]) -+# yapf: enable -+@pytest.mark.parametrize("num_imgs", [1, 2]) -+def test_processor_override( -+ processor_for_phi3v, -+ image_assets: _ImageAssets, -+ 
model_id: str, -+ mm_processor_kwargs: dict[str, int], -+ expected_toks_per_img: int, -+ num_imgs: int, -+): -+ """Ensure input_processor_for_phi3v handles num_crops properly.""" -+ ctx = build_model_context( -+ model_name=model_id, -+ tokenizer_name=model_id, -+ trust_remote_code=True, -+ limit_mm_per_prompt={"image": num_imgs}, ++@pytest.mark.parametrize("model", models) ++@pytest.mark.parametrize("dtype", [torch.half]) ++@torch.inference_mode() ++def test_models(dist_init, image_assets, model, dtype: str) -> None: ++ run_intern_vit_test( ++ image_assets, ++ model, ++ dtype=dtype, + ) -+ tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) -+ ctx = InputProcessingContext(ctx.model_config, tokenizer) -+ -+ # Build the image str / prompt based on the number of images we pass -+ img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)]) -+ prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n" -+ mm_data = {"image": [image_assets[0].pil_image] * num_imgs} -+ -+ processor = processor_for_phi3v(ctx) -+ processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) -+ -+ # Ensure we have the right number of placeholders per num_crops size -+ img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID) -+ assert img_tok_count == expected_toks_per_img * num_imgs -diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py b/tests/models/decoder_only/vision_language/processing/test_qwen.py -similarity index 100% -rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py -rename to tests/models/decoder_only/vision_language/processing/test_qwen.py -diff --git a/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py +diff --git a/tests/models/test_llava_image_embeds.py b/tests/models/test_llava_image_embeds.py new file mode 100644 -index 000000000..b9ac887ed +index 000000000..cc444fe32 --- /dev/null -+++ b/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py -@@ -0,0 +1,60 @@ ++++ b/tests/models/test_llava_image_embeds.py +@@ -0,0 +1,160 @@ ++from typing import List, Optional, Tuple, Type ++ +import pytest -+from transformers import AutoTokenizer ++from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer + -+from vllm.inputs import InputProcessingContext ++from vllm.sequence import SampleLogprobs + -+from .....conftest import _ImageAssets -+from ....utils import build_model_context ++from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets ++from .utils import check_logprobs_close + ++pytestmark = pytest.mark.vlm + -+# Fixtures lazy import to avoid initializing CUDA during test collection -+@pytest.fixture() -+def processor_for_qwen2_vl(): -+ from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalProcessor -+ return Qwen2VLMultiModalProcessor ++HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ ++ "stop_sign": ++ "USER: \nWhat's the content of the image?\nASSISTANT:", ++ "cherry_blossom": ++ "USER: \nWhat is the season?\nASSISTANT:", ++}) + ++models = [ ++ "llava-hf/llava-1.5-7b-hf", ++] + -+@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) -+# yapf: disable -+@pytest.mark.parametrize( -+ ("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"), [ -+ ({}, 1426, (5704, 1176)), -+ ({"min_pixels": 64**2, "max_pixels": 512**2}, 330, (1320, 1176)), -+ ]) -+# yapf: enable -+@pytest.mark.parametrize("num_imgs", [1, 
2]) -+def test_processor_override( -+ processor_for_qwen2_vl, -+ image_assets: _ImageAssets, -+ model_id: str, -+ mm_processor_kwargs: dict[str, object], -+ expected_toks_per_img: int, -+ expected_pixels_shape: tuple[int, int], -+ num_imgs: int, -+): -+ """Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly.""" -+ ctx = build_model_context( -+ model_name=model_id, -+ tokenizer_name=model_id, -+ mm_processor_kwargs=None, -+ limit_mm_per_prompt={"image": num_imgs}, -+ ) -+ tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) -+ ctx = InputProcessingContext(ctx.model_config, tokenizer) -+ -+ # Build the image str / prompt based on the number of images we pass -+ prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs -+ mm_data = {"image": [image_assets[0].pil_image] * num_imgs} -+ -+ processor = processor_for_qwen2_vl(ctx) -+ processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) -+ -+ # Ensure we have the right number of placeholders per num_crops size -+ hf_processor = processor._get_hf_processor(**mm_processor_kwargs) -+ image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token) -+ img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) -+ pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape -+ -+ assert img_tok_count == expected_toks_per_img * num_imgs -+ assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs -+ assert pixel_shape[1] == expected_pixels_shape[1] -diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py -index 3101d1d2e..dc0b683c1 100644 ---- a/tests/models/decoder_only/vision_language/test_models.py -+++ b/tests/models/decoder_only/vision_language/test_models.py -@@ -140,10 +140,7 @@ VLM_TEST_SETTINGS = { - "aria": VLMTestInfo( - models=["rhymes-ai/Aria"], - tokenizer_mode="slow", -- test_type=( -- VLMTestType.IMAGE, -- VLMTestType.MULTI_IMAGE, -- ), -+ test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - dtype="bfloat16", - prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501 - img_idx_to_prompt=lambda idx: "<|img|>\n", -@@ -179,6 +176,7 @@ VLM_TEST_SETTINGS = { - test_type=VLMTestType.IMAGE, - prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", - max_model_len=4096, -+ max_num_seqs=2, - auto_cls=AutoModelForVision2Seq, - postprocess_inputs=model_utils.cast_dtype_post_processor( - "pixel_values" -@@ -212,7 +210,7 @@ VLM_TEST_SETTINGS = { - dtype="bfloat16", - get_stop_token_ids=lambda tok: [151329, 151336, 151338], - patch_hf_runner=model_utils.glm_patch_hf_runner, -- marks=[large_gpu_mark(min_gb=48)], -+ marks=[large_gpu_mark(min_gb=32)], - ), - "h2ovl": VLMTestInfo( - models = [ -@@ -261,6 +259,7 @@ VLM_TEST_SETTINGS = { - dtype="bfloat16", - use_tokenizer_eos=True, - patch_hf_runner=model_utils.internvl_patch_hf_runner, -+ marks=[large_gpu_mark(min_gb=32)], - ), - "llava_next": VLMTestInfo( - models=["llava-hf/llava-v1.6-mistral-7b-hf"], -@@ -275,10 +274,8 @@ VLM_TEST_SETTINGS = { - ), - limit_mm_per_prompt={"image": 4}, - )], -- # Llava-next tests fixed sizes & the default size factors -- image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))], - ), -- "llava_one_vision": VLMTestInfo( -+ "llava_onevision": VLMTestInfo( - models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"], - test_type=VLMTestType.CUSTOM_INPUTS, - prompt_formatter=lambda vid_prompt: 
f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 -@@ -289,8 +286,6 @@ VLM_TEST_SETTINGS = { - ), - auto_cls=AutoModelForVision2Seq, - vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, -- # Llava-one-vision tests fixed sizes & the default size factors -- image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))], - custom_test_opts=[CustomTestOptions( - inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs( - formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 -@@ -307,7 +302,6 @@ VLM_TEST_SETTINGS = { - max_model_len=4096, - auto_cls=AutoModelForVision2Seq, - vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output, -- image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))], - ), - "mantis": VLMTestInfo( - models=["TIGER-Lab/Mantis-8B-siglip-llama3"], -@@ -432,7 +426,7 @@ VLM_TEST_SETTINGS = { - ) for inp in custom_inputs.different_patch_input_cases_internvl() - ], - ), -- "llava_one_vision-multiple-images": VLMTestInfo( -+ "llava_onevision-multiple-images": VLMTestInfo( - models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"], - test_type=VLMTestType.CUSTOM_INPUTS, - max_model_len=16384, -diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py -index 51fe7d2ad..16e256e04 100644 ---- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py -+++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py -@@ -427,130 +427,3 @@ def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model, - mm_limit=1, - tensor_parallel_size=1, - ) -- -- --def run_chunked_prefill_test( -- vllm_runner: Type[VllmRunner], -- inputs: List[Tuple[List[str], PromptImageInput, PromptVideoInput]], -- model: str, -- *, -- dtype: str, -- max_tokens: int, -- num_logprobs: int, -- mm_limit: int, -- tensor_parallel_size: int, -- distributed_executor_backend: Optional[str] = None, --): -- """Compare inference result between -- chunked prefill disabled and chunked prefill enabled -- """ -- -- # NOTE: -- # max_model_len should be greater than image_feature_size -- with vllm_runner(model, -- task="generate", -- max_model_len=4000, -- max_num_seqs=4, -- dtype=dtype, -- limit_mm_per_prompt={ -- "image": mm_limit, -- "video": mm_limit -- }, -- tensor_parallel_size=tensor_parallel_size, -- distributed_executor_backend=distributed_executor_backend -- ) as vllm_model: -- -- outputs_per_case = [ -- vllm_model.generate_greedy_logprobs(prompts, -- max_tokens, -- num_logprobs=num_logprobs, -- images=images or None, -- videos=videos or None) -- for prompts, images, videos in inputs -- ] -- -- with vllm_runner( -- model, -- task="generate", -- max_model_len=4000, -- max_num_seqs=4, -- dtype=dtype, -- limit_mm_per_prompt={ -- "image": mm_limit, -- "video": mm_limit -- }, -- tensor_parallel_size=tensor_parallel_size, -- distributed_executor_backend=distributed_executor_backend, -- enable_chunked_prefill=True, -- # should be small enough to ensure prefilling is chunked -- max_num_batched_tokens=32, -- mm_processor_kwargs={ -- "max_pixels": 16 * 28 * 28, -- }) as vllm_model_chunked: -- outputs_per_case_chunked = [ -- vllm_model_chunked.generate_greedy_logprobs( -- prompts, -- max_tokens, -- num_logprobs=num_logprobs, -- images=images or None, -- videos=videos or None) for prompts, images, videos in inputs -- ] -- -- for outputs, \ -- outputs_chunked \ -- in zip(outputs_per_case, -- outputs_per_case_chunked): 
-- check_logprobs_close( -- outputs_0_lst=outputs, -- outputs_1_lst=outputs_chunked, -- name_0="non_chunked", -- name_1="chunked", -- ) -- -- --@pytest.mark.core_model --@pytest.mark.parametrize("model", models) --@pytest.mark.parametrize("dtype", [target_dtype]) --@pytest.mark.parametrize("max_tokens", [1]) --@pytest.mark.parametrize("num_logprobs", [10]) --def test_qwen2_vl_mrope_chunked_prefill(vllm_runner, example_prompts, -- model: str, dtype: str, -- max_tokens: int, -- num_logprobs: int) -> None: -- """ -- Test Qwen2-VL's chunked prefill with M-RoPE -- """ -- prompts = [ -- qwen2_vl_chat_template(IMAGE_PLACEHOLDER, prompt) -- for prompt in example_prompts[:1] -- ] -- -- # 1. Qwen2-VL's M-RoPE works only when there are some multi-modal inputs, -- # so an image is included in the inputs -- # 2. however, Qwen2-VL currently won't work properly -- # when chunked prefill is enabled and there are some multi-modal inputs, -- # here use a hacky way: provide a **zero-length** image to make it happy -- # -- # and finally we achieved: -- # (1) chunked_prefill enabled; (2) M-RoPE works; to continue our tests -- zero_len_image = { -- "image_embeds": torch.empty((0, MODEL_HIDDEN_SIZE)), -- "image_grid_thw": torch.tensor([[0, 0, 0]]) -- } -- images = [zero_len_image] * len(prompts) -- -- inputs_per_case: List[Tuple[List[str], PromptImageInput, -- PromptVideoInput]] = [ -- (prompts, images, []), -- ] -- -- run_chunked_prefill_test( -- vllm_runner, -- inputs_per_case, -- model, -- dtype=dtype, -- max_tokens=max_tokens, -- num_logprobs=num_logprobs, -- mm_limit=1, -- tensor_parallel_size=1, -- ) -diff --git a/tests/models/encoder_decoder/audio_language/__init__.py b/tests/models/encoder_decoder/audio_language/__init__.py -new file mode 100644 -index 000000000..e69de29bb -diff --git a/tests/models/encoder_decoder/audio_language/test_whisper.py b/tests/models/encoder_decoder/audio_language/test_whisper.py -new file mode 100644 -index 000000000..eb238c533 ---- /dev/null -+++ b/tests/models/encoder_decoder/audio_language/test_whisper.py -@@ -0,0 +1,136 @@ -+"""Compare the outputs of HF and vLLM for Whisper models using greedy sampling. + -+Run `pytest tests/models/encoder_decoder/audio/test_whisper.py`. 
-+""" -+from typing import Optional ++def vllm_to_hf_output(vllm_output: Tuple[List[int], str, ++ Optional[SampleLogprobs]], ++ model: str): ++ """Sanitize vllm output to be comparable with hf output.""" ++ output_ids, output_str, out_logprobs = vllm_output + -+import pytest ++ config = AutoConfig.from_pretrained(model) ++ image_token_id = config.image_token_index + -+from vllm import LLM, SamplingParams -+from vllm.assets.audio import AudioAsset ++ tokenizer = AutoTokenizer.from_pretrained(model) ++ eos_token_id = tokenizer.eos_token_id + -+from ....utils import fork_new_process_for_each_test, multi_gpu_test ++ hf_output_ids = [ ++ token_id for idx, token_id in enumerate(output_ids) ++ if token_id != image_token_id or output_ids[idx - 1] != image_token_id ++ ] + -+PROMPTS = [ -+ { -+ "prompt": -+ "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>", -+ "multi_modal_data": { -+ "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate, -+ }, -+ }, -+ { # Test explicit encoder/decoder prompt -+ "encoder_prompt": { -+ "prompt": "", -+ "multi_modal_data": { -+ "audio": AudioAsset("winning_call").audio_and_sample_rate, -+ }, -+ }, -+ "decoder_prompt": -+ "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>", -+ } -+] ++ assert output_str[0] == " " ++ hf_output_str = output_str[1:] ++ if hf_output_ids[-1] == eos_token_id: ++ hf_output_str = hf_output_str + tokenizer.decode(eos_token_id) + -+EXPECTED = { -+ "openai/whisper-tiny": [ -+ " He has birth words I spoke in the original corner of that. And a" -+ " little piece of black coat poetry. Mary had a little sandwich," -+ " sweet, with white and snow. And everyone had it very went the last" -+ " would sure to go.", -+ " >> And the old one, fit John the way to Edgar Martinez. >> One more" -+ " to line down the field line for our base camp. Here comes joy. Here" -+ " is June and the third base. They're going to wave him in. The throw" -+ " to the plate will be late. The Mariners are going to play for the" -+ " American League Championship. I don't believe it. It just continues" -+ " by all five." -+ ], -+ "openai/whisper-small": [ -+ " The first words I spoke in the original pornograph. A little piece" -+ " of practical poetry. Mary had a little lamb, its fleece was quite a" -+ " slow, and everywhere that Mary went the lamb was sure to go.", -+ " And the old one pitch on the way to Edgar Martinez one month. Here" -+ " comes joy. Here is Junior to third base. They're gonna wave him" -+ " in. The throw to the plate will be late. The Mariners are going to" -+ " play for the American League Championship. I don't believe it. It" -+ " just continues. My, oh my." -+ ], -+ "openai/whisper-medium": [ -+ " The first words I spoke in the original phonograph, a little piece" -+ " of practical poetry. Mary had a little lamb, its fleece was quite as" -+ " slow, and everywhere that Mary went the lamb was sure to go.", -+ " And the 0-1 pitch on the way to Edgar Martinez swung on the line" -+ " down the left field line for Obeyshev. Here comes Joy. Here is" -+ " Jorgen at third base. They're going to wave him in. The throw to the" -+ " plate will be late. The Mariners are going to play for the American" -+ " League Championship. I don't believe it. It just continues. My, oh" -+ " my." -+ ], -+ "openai/whisper-large-v3": [ -+ " The first words I spoke in the original phonograph, a little piece" -+ " of practical poetry. 
Mary had a little lamb, its feet were quite as" -+ " slow, and everywhere that Mary went, the lamb was sure to go.", -+ " And the 0-1 pitch on the way to Edgar Martinez. Swung on the line." -+ " Now the left field line for a base hit. Here comes Joy. Here is" -+ " Junior to third base. They're going to wave him in. The throw to the" -+ " plate will be late. The Mariners are going to play for the American" -+ " League Championship. I don't believe it. It just continues. My, oh," -+ " my." -+ ], -+ "openai/whisper-large-v3-turbo": [ -+ " The first words I spoke in the original phonograph, a little piece" -+ " of practical poetry. Mary had a little lamb, its streets were quite" -+ " as slow, and everywhere that Mary went the lamb was sure to go.", -+ " And the 0-1 pitch on the way to Edgar Martinez. Swung on the line" -+ " down the left field line for a base hit. Here comes Joy. Here is" -+ " Junior to third base. They're going to wave him in. The throw to the" -+ " plate will be late. The Mariners are going to play for the American" -+ " League Championship. I don't believe it. It just continues. My, oh," -+ " my." -+ ] -+} ++ return hf_output_ids, hf_output_str, out_logprobs + + +def run_test( ++ hf_runner: Type[HfRunner], ++ vllm_runner: Type[VllmRunner], ++ image_assets: _ImageAssets, + model: str, + *, ++ size_factors: List[float], ++ dtype: str, ++ max_tokens: int, ++ num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, -+) -> None: -+ prompt_list = PROMPTS * 10 -+ expected_list = EXPECTED[model] * 10 ++): ++ """Inference result should be the same between hf and vllm. + -+ llm = LLM( -+ model=model, -+ tensor_parallel_size=tensor_parallel_size, -+ distributed_executor_backend=distributed_executor_backend, -+ ) ++ All the image fixtures for the test is under tests/images. ++ For huggingface runner, we provide the PIL images as input. ++ For vllm runner, we provide MultiModalDataDict objects ++ and corresponding vision language config as input. ++ Note, the text input is also adjusted to abide by vllm contract. ++ The text output is sanitized to be able to compare with hf. ++ """ + -+ sampling_params = SamplingParams( -+ temperature=0, -+ top_p=1.0, -+ max_tokens=200, -+ ) ++ # vLLM to load from image embeddings ++ vllm_images = [asset.image_embeds for asset in image_assets] + -+ outputs = llm.generate(prompt_list, sampling_params) ++ # transformers to load from PIL images ++ hf_images = [asset.pil_image for asset in image_assets] + -+ for output, expected in zip(outputs, expected_list): -+ print(output.outputs[0].text) -+ assert output.outputs[0].text == expected ++ vllm_inputs_per_image = [( ++ [prompt for _ in size_factors], ++ [image for _ in size_factors], ++ ) for image, prompt in zip(vllm_images, HF_IMAGE_PROMPTS)] + ++ hf_inputs_per_image = [( ++ [prompt for _ in size_factors], ++ [image for _ in size_factors], ++ ) for image, prompt in zip(hf_images, HF_IMAGE_PROMPTS)] + -+@fork_new_process_for_each_test -+@pytest.mark.core_model ++ # NOTE: take care of the order. run vLLM first, and then run HF. ++ # vLLM needs a fresh new process without cuda initialization. ++ # if we run HF first, the cuda initialization will be done and it ++ # will hurt multiprocessing backend with fork method (the default method). 
++ ++ # max_model_len should be greater than image_feature_size ++ with vllm_runner(model, ++ dtype=dtype, ++ tensor_parallel_size=tensor_parallel_size, ++ distributed_executor_backend=distributed_executor_backend, ++ enforce_eager=True) as vllm_model: ++ vllm_outputs_per_image = [ ++ vllm_model.generate_greedy_logprobs(prompts, ++ max_tokens, ++ num_logprobs=num_logprobs, ++ images=images) ++ for prompts, images in vllm_inputs_per_image ++ ] ++ ++ with hf_runner(model, dtype=dtype, ++ auto_cls=AutoModelForVision2Seq) as hf_model: ++ hf_outputs_per_image = [ ++ hf_model.generate_greedy_logprobs_limit(prompts, ++ max_tokens, ++ num_logprobs=num_logprobs, ++ images=images) ++ for prompts, images in hf_inputs_per_image ++ ] ++ ++ for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, ++ vllm_outputs_per_image): ++ # TODO: Check whether using original CLIPVisionModel can improve ++ # consistency against HF ++ check_logprobs_close( ++ outputs_0_lst=hf_outputs, ++ outputs_1_lst=[ ++ vllm_to_hf_output(vllm_output, model) ++ for vllm_output in vllm_outputs ++ ], ++ name_0="hf", ++ name_1="vllm", ++ ) ++ ++ ++@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( -+ "model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"]) -+def test_models(model) -> None: -+ run_test(model, tensor_parallel_size=1) -+ -+ -+@multi_gpu_test(num_gpus=2) -+@pytest.mark.core_model -+@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"]) -+@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) -+def test_models_distributed(model, distributed_executor_backend) -> None: -+ run_test(model, -+ tensor_parallel_size=2, -+ distributed_executor_backend=distributed_executor_backend) -diff --git a/tests/models/registry.py b/tests/models/registry.py -index f5a37420a..dcb8bfa0f 100644 ---- a/tests/models/registry.py -+++ b/tests/models/registry.py -@@ -140,6 +140,8 @@ _EMBEDDING_EXAMPLE_MODELS = { - "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"), - "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"), - "GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"), -+ "InternLM2ForRewardModel": _HfExamplesInfo("internlm/internlm2-1_8b-reward", -+ trust_remote_code=True), - "JambaForSequenceClassification": _HfExamplesInfo("ai21labs/Jamba-tiny-reward-dev"), # noqa: E501 - "LlamaModel": _HfExamplesInfo("llama", is_available_online=False), - "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"), -@@ -202,6 +204,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { - "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3"), - # [Encoder-decoder] - "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501 -+ "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501 - } - - _SPECULATIVE_DECODING_EXAMPLE_MODELS = { -diff --git a/tests/models/test_bart.py b/tests/models/test_bart.py ++ "size_factors", ++ [ ++ # No image ++ [], ++ # Single-scale ++ [1.0], ++ # Single-scale, batched ++ [1.0, 1.0, 1.0], ++ ], ++) ++@pytest.mark.parametrize("dtype", ["half"]) ++@pytest.mark.parametrize("max_tokens", [128]) ++@pytest.mark.parametrize("num_logprobs", [5]) ++def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, ++ dtype: str, max_tokens: int, num_logprobs: int) -> None: ++ run_test( ++ hf_runner, ++ vllm_runner, ++ image_assets, ++ model, ++ size_factors=size_factors, ++ dtype=dtype, ++ max_tokens=max_tokens, ++ num_logprobs=num_logprobs, ++ tensor_parallel_size=1, ++ ) +diff 
--git a/tests/models/test_phimoe.py b/tests/models/test_phimoe.py new file mode 100644 -index 000000000..660b61d1a +index 000000000..2fb2eecc9 --- /dev/null -+++ b/tests/models/test_bart.py -@@ -0,0 +1,170 @@ -+"""Compare the outputs of HF and vLLM for BART models using greedy sampling. ++++ b/tests/models/test_phimoe.py +@@ -0,0 +1,111 @@ ++"""Compare the outputs of HF and vLLM for moe models using greedy sampling. + -+Run `pytest tests/models/test_bart.py`. ++Run `pytest tests/models/test_phimoe.py`. +""" -+from typing import List, Optional, Tuple ++import pytest ++import torch + +from vllm.utils import is_cpu + -+if not is_cpu(): -+ # CPU backend is not currently supported with encoder/decoder models -+ # skip test definitions entirely to avoid importing GPU kernel libs -+ # (xFormers, etc.) ++from .utils import check_logprobs_close + -+ import pytest -+ from transformers import AutoModelForSeq2SeqLM ++MODELS = [ ++ "microsoft/Phi-3.5-MoE-instruct", ++] + -+ from vllm.sequence import SampleLogprobs + -+ from ..conftest import DecoderPromptType -+ from .utils import check_logprobs_close ++def test_phimoe_routing_function(): ++ from vllm.model_executor.models.phimoe import phimoe_routing_function ++ test_case = { ++ 0: { ++ "hidden_states": ++ torch.tensor([1, 2, 3, 4, 5, 6, 7, 8], ++ dtype=torch.float32, ++ requires_grad=False).view(4, 2), ++ "gating_output": ++ torch.tensor([0.1, 0.2, 0.3, 0.4], ++ dtype=torch.float32, ++ requires_grad=False), ++ "topk": ++ 2, ++ "renormalize": ++ False, ++ }, ++ 1: { ++ "hidden_states": ++ torch.tensor([1, 2, 3, 4, 5, 6, 7, 8], ++ dtype=torch.float32, ++ requires_grad=False).view(4, 2), ++ "gating_output": ++ torch.tensor([0.4, 0.2, 0.3, 0.4], ++ dtype=torch.float32, ++ requires_grad=False), ++ "topk": ++ 2, ++ "renormalize": ++ False, ++ } ++ } + -+ MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"] ++ ground_truth = { ++ 0: { ++ "topk_weights": ++ torch.tensor([1., 1.], dtype=torch.float32, requires_grad=False), ++ "topk_ids": ++ torch.tensor([3, 2], dtype=torch.long, requires_grad=False), ++ }, ++ 1: { ++ "topk_weights": ++ torch.tensor([0.5, 1.], dtype=torch.float32, requires_grad=False), ++ "topk_ids": ++ torch.tensor([0, 3], dtype=torch.long, requires_grad=False), ++ } ++ } + -+ def vllm_to_hf_output( -+ vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], -+ decoder_prompt_type: DecoderPromptType, -+ ): -+ """Sanitize vllm output to be comparable with hf output.""" -+ output_ids, output_str, out_logprobs = vllm_output ++ for test_id in test_case: ++ topk_weights, topk_ids = phimoe_routing_function(**test_case[test_id]) ++ assert torch.allclose(topk_weights, ++ ground_truth[test_id]["topk_weights"]) ++ assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"]) + -+ hf_output_str = output_str + "" -+ if decoder_prompt_type == DecoderPromptType.NONE: -+ hf_output_str = "" + hf_output_str + -+ return output_ids, hf_output_str, out_logprobs ++def get_gpu_memory(): ++ try: ++ props = torch.cuda.get_device_properties(torch.cuda.current_device()) ++ gpu_memory = props.total_memory / (1024**3) ++ return gpu_memory ++ except Exception: ++ return 0 + -+ @pytest.mark.parametrize("model", MODELS) -+ @pytest.mark.parametrize("dtype", ["float", "bfloat16"]) -+ @pytest.mark.parametrize("max_tokens", [64]) -+ @pytest.mark.parametrize("num_logprobs", [5]) -+ @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) -+ def test_models( -+ hf_runner, -+ vllm_runner, -+ example_encoder_decoder_prompts, -+ model: str, -+ dtype: 
str,
-+ max_tokens: int,
-+ num_logprobs: int,
-+ decoder_prompt_type: DecoderPromptType,
-+ ) -> None:
-+ '''
-+ Test the vLLM BART model for a variety of encoder/decoder input prompts,
-+ by validating it against HuggingFace (HF) BART.
-+
-+ Arguments:
-+
-+ * hf_runner: HuggingFace (HF) test model runner
-+ * vllm_runner: vLLM test model runner
-+ * example_encoder_decoder_prompts: test fixture which provides a
-+ dictionary of dummy prompts
-+ * model: the HF ID of the specific BART variant under test
-+ * dtype: the tensor datatype to employ
-+ * max_tokens
-+ * num_logprobs
-+ * decoder_prompt_type: key into the example_encoder_decoder_prompts
-+ dictionary; selects specific encoder/decoder
-+ prompt scenarios to test
-+
-+ A note on using HF BART as a baseline for validating vLLM BART,
-+ specifically when the decoder prompt is None.
-+
-+ The HF GenerationMixin's default behavior is to force the first
-+ decoded token to be <BOS> if the prompt does not already contain
-+ <BOS> (this is accomplished using a logit
-+ processor setting.)
-+
-+ So when we use HF BART as our baseline for comparison, note that
-+ when the user provides a request with a None decoder prompt
-+ (i.e. a singleton encoder prompt, or else an explicit encoder/
-+ decoder prompt with the decoder sub-prompt set to None), HF and
-+ vLLM handle this in different ways:
-+
-+ * HF will (1) tokenize the None prompt as an empty token-list,
-+ (2) append <decoder-start-token> to the beginning, yielding
-+ [<decoder-start-token>], (3) pass this token list to the model, and
-+ then (4) after computing logits during prefill, override the model
-+ logits & force <BOS> to be the first generated token.
-+
-+ * vLLM will (1) tokenize the None prompt as [<BOS>], (2) append decoder-
-+ start-token to the beginning, yielding [<decoder-start-token><BOS>],
-+ (3) pass these tokens to the model & proceed with generation.
-+
-+ The net effect is that compared to vLLM, the list of HF *decoded* tokens
-+ will contain one more initial <BOS> than the vLLM generated tokens,
-+ because vLLM's <BOS> token is injected into the prompt rather than into
-+ the generated output. This is in spite of the fact that overall, the
-+ complete sequences (prompt + decoded tokens) produced by vLLM will match
-+ HF.
-+
-+ So when we use HF decoded token output to validate vLLM's decoded token
-+ output, the testing process must account for the difference in decoded
-+ token sequences between vLLM and HF specifically in the
-+ decoder-prompt-is-None case.
-+
-+ One option is to disable the logit processor feature that forces the
-+ <BOS> token to be decoded (forced_bos_token_id = None), eliminating
-+ the problem entirely. However this is not "normal" BART usage.
-+
-+ The other option is - only in the decoder-prompt-is-None case - to
-+ discard the first decoded token from the HF output before comparing it
-+ to vLLM.
-+
-+ To that end, when testing the scenario where the decoder prompt is None
-+ (and only in that one scenario), this test skips the first HF decoded
-+ token during the process of validating the vLLM decoded output.
-+ ''' -+ -+ test_case_prompts = example_encoder_decoder_prompts[ -+ decoder_prompt_type] -+ -+ # Configuration settings for HF baseline -+ hf_kwargs = { -+ "top_k": None, -+ "num_beams": 1, -+ "repetition_penalty": 1.0, -+ "top_p": 1.0, -+ "length_penalty": 1.0, -+ "early_stopping": False, -+ "no_repeat_ngram_size": None, -+ "min_length": 0 -+ } -+ -+ with hf_runner(model, dtype=dtype, -+ auto_cls=AutoModelForSeq2SeqLM) as hf_model: -+ hf_outputs = ( -+ hf_model.generate_encoder_decoder_greedy_logprobs_limit( -+ test_case_prompts, -+ max_tokens, -+ num_logprobs, -+ **hf_kwargs, -+ )) -+ -+ # Note: currently encoder/decoder models are only compatible with -+ # enforce_eager=True. Normally this is not a problem because -+ # for encoder/decoder models vLLM will -+ # default to enforce_eager=True if enforce_eager -+ # is left unspecified. However, the -+ # VllmRunner test fixture (which wraps around the LLM class) defaults to -+ # enforce_eager=False (a behavior which a number of already-exisitng -+ # decoder-only unit tests expect), so when testing an encoder/decoder -+ # model we must explicitly specify enforce_eager=True in the VllmRunner -+ # constructor. -+ with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: -+ vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( -+ test_case_prompts, max_tokens, num_logprobs) -+ -+ hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE -+ else 0) -+ -+ check_logprobs_close( -+ outputs_0_lst=hf_outputs, -+ outputs_1_lst=[ -+ vllm_to_hf_output(vllm_output, decoder_prompt_type) -+ for vllm_output in vllm_outputs -+ ], -+ name_0="hf", -+ name_1="vllm", -+ num_outputs_0_skip_tokens=hf_skip_tokens, -+ ) -diff --git a/tests/models/test_gguf.py b/tests/models/test_gguf.py -new file mode 100644 -index 000000000..196cd88e0 ---- /dev/null -+++ b/tests/models/test_gguf.py -@@ -0,0 +1,90 @@ -+""" -+Tests gguf models against unquantized models generations -+Note: To pass the test, quantization higher than Q4 should be used -+""" -+ -+import os -+ -+import pytest -+from huggingface_hub import hf_hub_download -+from transformers import AutoTokenizer -+ -+from tests.quantization.utils import is_quant_method_supported -+ -+from .utils import check_logprobs_close -+ -+os.environ["TOKENIZERS_PARALLELISM"] = "true" -+ -+MAX_MODEL_LEN = 1024 -+ -+# FIXME: Move this to confest -+MODELS = [ -+ ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", -+ hf_hub_download("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", -+ filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")), -+ ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", -+ hf_hub_download("duyntnet/TinyLlama-1.1B-Chat-v1.0-imatrix-GGUF", -+ filename="TinyLlama-1.1B-Chat-v1.0-IQ4_XS.gguf")), -+ ("Qwen/Qwen2-1.5B-Instruct", -+ hf_hub_download("Qwen/Qwen2-1.5B-Instruct-GGUF", -+ filename="qwen2-1_5b-instruct-q4_k_m.gguf")), -+ ("Qwen/Qwen2-1.5B-Instruct", -+ hf_hub_download("legraphista/Qwen2-1.5B-Instruct-IMat-GGUF", -+ filename="Qwen2-1.5B-Instruct.IQ4_XS.gguf")), -+] + -+ -+@pytest.mark.skipif(not is_quant_method_supported("gguf"), -+ reason="gguf is not supported on this GPU type.") ++@pytest.mark.skipif(condition=is_cpu(), ++ reason="This test takes a lot time to run on CPU, " ++ "and vllm CI's disk space is not enough for this model.") ++@pytest.mark.skipif(condition=get_gpu_memory() < 100, ++ reason="Skip this test if GPU memory is insufficient.") +@pytest.mark.parametrize("model", MODELS) -+@pytest.mark.parametrize("dtype", ["half"]) -+@pytest.mark.parametrize("max_tokens", [32]) 
++@pytest.mark.parametrize("dtype", ["bfloat16"]) ++@pytest.mark.parametrize("max_tokens", [64]) +@pytest.mark.parametrize("num_logprobs", [5]) -+@pytest.mark.parametrize("tp_size", [1, 2]) +def test_models( -+ num_gpus_available, ++ hf_runner, + vllm_runner, + example_prompts, -+ model, ++ model: str, + dtype: str, + max_tokens: int, + num_logprobs: int, -+ tp_size: int, +) -> None: -+ if num_gpus_available < tp_size: -+ pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") -+ -+ original_model, gguf_model = model -+ -+ tokenizer = AutoTokenizer.from_pretrained(original_model) -+ messages = [[{ -+ 'role': 'user', -+ 'content': prompt -+ }] for prompt in example_prompts] -+ example_prompts = tokenizer.apply_chat_template(messages, -+ tokenize=False, -+ add_generation_prompt=True) -+ -+ # Run unquantized model. -+ with vllm_runner(model_name=original_model, -+ dtype=dtype, -+ max_model_len=MAX_MODEL_LEN, -+ tensor_parallel_size=tp_size) as original_model: -+ -+ original_outputs = original_model.generate_greedy_logprobs( -+ example_prompts[:-1], max_tokens, num_logprobs) -+ -+ # Run gguf model. -+ with vllm_runner(model_name=gguf_model, -+ dtype=dtype, -+ max_model_len=MAX_MODEL_LEN, -+ tensor_parallel_size=tp_size) as gguf_model: -+ gguf_outputs = gguf_model.generate_greedy_logprobs( -+ example_prompts[:-1], max_tokens, num_logprobs) ++ with hf_runner(model, dtype=dtype) as hf_model: ++ hf_outputs = hf_model.generate_greedy_logprobs_limit( ++ example_prompts, max_tokens, num_logprobs) + ++ with vllm_runner(model, dtype=dtype) as vllm_model: ++ vllm_outputs = vllm_model.generate_greedy_logprobs( ++ example_prompts, max_tokens, num_logprobs) + check_logprobs_close( -+ outputs_0_lst=original_outputs, -+ outputs_1_lst=gguf_outputs, -+ name_0="original", -+ name_1="gguf", ++ outputs_0_lst=hf_outputs, ++ outputs_1_lst=vllm_outputs, ++ name_0="hf", ++ name_1="vllm", + ) -diff --git a/tests/models/test_granite.py b/tests/models/test_granite.py +diff --git a/tests/models/test_qwen.py b/tests/models/test_qwen.py new file mode 100644 -index 000000000..2435b5dc3 +index 000000000..0f974fcc1 --- /dev/null -+++ b/tests/models/test_granite.py -@@ -0,0 +1,49 @@ -+"""Compare the outputs of HF and vLLM for Granite models using greedy sampling. -+ -+Run `pytest tests/models/test_granite.py`. -+""" -+import importlib.metadata ++++ b/tests/models/test_qwen.py +@@ -0,0 +1,48 @@ ++from typing import Type + +import pytest + ++from ..conftest import HfRunner, VllmRunner +from .utils import check_logprobs_close + -+TRANSFORMERS_VERSION = tuple( -+ map(int, -+ importlib.metadata.version("transformers").split("."))) -+ -+MODELS = [ -+ "ibm/PowerLM-3b", -+] ++models = ["qwen/qwen-vl"] + + -+# GraniteForCausalLM will be in transformers >= 4.45 -+@pytest.mark.skipif(TRANSFORMERS_VERSION < (4, 45), -+ reason="granite model test requires transformers >= 4.45") -+@pytest.mark.parametrize("model", MODELS) -+@pytest.mark.parametrize("dtype", ["bfloat16"]) -+@pytest.mark.parametrize("max_tokens", [64]) ++@pytest.mark.parametrize("dtype", ["half"]) ++@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [5]) -+def test_models( -+ hf_runner, -+ vllm_runner, ++@pytest.mark.parametrize("model", models) ++def test_text_only_qwen_model( ++ hf_runner: Type[HfRunner], ++ vllm_runner: Type[VllmRunner], + example_prompts, + model: str, ++ *, + dtype: str, + max_tokens: int, + num_logprobs: int, -+) -> None: -+ # TODO(sang): Sliding window should be tested separately. 
++): ++ # This test checks language inputs only, since the visual component ++ # for qwen-vl is still unsupported in VLLM. In the near-future, the ++ # implementation and this test will be extended to consider ++ # visual inputs as well. + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy_logprobs_limit( -+ example_prompts, max_tokens, num_logprobs) ++ example_prompts, ++ max_tokens, ++ num_logprobs=num_logprobs, ++ ) + + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( -+ example_prompts, max_tokens, num_logprobs) ++ example_prompts, ++ max_tokens, ++ num_logprobs=num_logprobs, ++ ) ++ + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) -diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py -index a4eea7f03..3b728f274 100644 ---- a/tests/models/test_initialization.py -+++ b/tests/models/test_initialization.py -@@ -1,7 +1,6 @@ - from unittest.mock import patch - - import pytest --import transformers - from transformers import PretrainedConfig - - from vllm import LLM -@@ -12,9 +11,6 @@ from .registry import HF_EXAMPLE_MODELS - @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs()) - def test_can_initialize(model_arch): - model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) -- if (model_arch == "Cohere2ForCausalLM" -- and transformers.__version__ < "4.48.0"): -- pytest.skip(reason="Model introduced in HF >= 4.48.0") - if not model_info.is_available_online: - pytest.skip("Model is not available online") - -diff --git a/tests/models/test_intern_vit.py b/tests/models/test_intern_vit.py +diff --git a/tests/models/test_ultravox.py b/tests/models/test_ultravox.py new file mode 100644 -index 000000000..816f846f6 +index 000000000..e98db9b65 --- /dev/null -+++ b/tests/models/test_intern_vit.py -@@ -0,0 +1,79 @@ -+from typing import Optional ++++ b/tests/models/test_ultravox.py +@@ -0,0 +1,202 @@ ++from typing import List, Optional, Tuple, Type + ++import numpy as np +import pytest -+import torch -+import torch.nn as nn -+from huggingface_hub import snapshot_download -+from transformers import AutoConfig, AutoModel, CLIPImageProcessor -+ -+from ..conftest import _ImageAssets, cleanup -+ -+pytestmark = pytest.mark.vlm -+ -+# we use snapshot_download to prevent conflicts between -+# dynamic_module and trust_remote_code for hf_runner -+DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"] -+models = [ -+ snapshot_download("OpenGVLab/InternViT-300M-448px", -+ allow_patterns=DOWNLOAD_PATTERN), -+ snapshot_download("OpenGVLab/InternViT-6B-448px-V1-5", -+ allow_patterns=DOWNLOAD_PATTERN), -+] -+ -+ -+def run_intern_vit_test( -+ image_assets: _ImageAssets, -+ model: str, -+ *, -+ dtype: str, -+ distributed_executor_backend: Optional[str] = None, -+): -+ img_processor = CLIPImageProcessor.from_pretrained(model) -+ images = [asset.pil_image for asset in image_assets] -+ pixel_values = [ -+ img_processor(images, return_tensors='pt').pixel_values.to(dtype) -+ for images in images -+ ] -+ -+ config = AutoConfig.from_pretrained(model, trust_remote_code=True) -+ if not getattr(config, "norm_type", None): -+ config.norm_type = "rms_norm" ++from transformers import AutoModel, AutoTokenizer, BatchEncoding + -+ hf_model = AutoModel.from_pretrained(model, -+ torch_dtype=dtype, -+ trust_remote_code=True).to("cuda") -+ hf_outputs_per_image = [ -+ 
hf_model(pixel_value.to("cuda")).last_hidden_state -+ for pixel_value in pixel_values -+ ] ++from vllm.sequence import SampleLogprobs ++from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE + -+ from vllm.model_executor.models.intern_vit import InternVisionModel -+ vllm_model = InternVisionModel(config) -+ vllm_model.load_weights(hf_model.state_dict().items()) ++from ..conftest import HfRunner, VllmRunner ++from .utils import check_logprobs_close + -+ del hf_model -+ cleanup() ++pytestmark = pytest.mark.vlm + -+ vllm_model = vllm_model.to("cuda", dtype) -+ vllm_outputs_per_image = [ -+ vllm_model(pixel_values=pixel_value.to("cuda")) -+ for pixel_value in pixel_values -+ ] -+ del vllm_model -+ cleanup() ++MODEL_NAME = "fixie-ai/ultravox-v0_3" + -+ cos_similar = nn.CosineSimilarity(dim=-1) -+ for vllm_output, hf_output in zip(vllm_outputs_per_image, -+ hf_outputs_per_image): -+ assert cos_similar(vllm_output, hf_output).mean() > 0.99 ++AudioTuple = Tuple[np.ndarray, int] + ++VLLM_PLACEHOLDER = "<|reserved_special_token_0|>" ++HF_PLACEHOLDER = "<|audio|>" + -+@pytest.mark.parametrize("model", models) -+@pytest.mark.parametrize("dtype", [torch.half]) -+@torch.inference_mode() -+def test_models(dist_init, image_assets, model, dtype: str) -> None: -+ run_intern_vit_test( -+ image_assets, -+ model, -+ dtype=dtype, -+ ) -diff --git a/tests/models/test_llava_image_embeds.py b/tests/models/test_llava_image_embeds.py -new file mode 100644 -index 000000000..cc444fe32 ---- /dev/null -+++ b/tests/models/test_llava_image_embeds.py -@@ -0,0 +1,160 @@ -+from typing import List, Optional, Tuple, Type + -+import pytest -+from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer ++@pytest.fixture(scope="session") ++def audio_assets(): ++ from vllm.assets.audio import AudioAsset ++ return [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] + -+from vllm.sequence import SampleLogprobs + -+from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -+from .utils import check_logprobs_close ++@pytest.fixture(scope="module", params=("mary_had_lamb", "winning_call")) ++def audio(request): ++ from vllm.assets.audio import AudioAsset ++ return AudioAsset(request.param) + -+pytestmark = pytest.mark.vlm + -+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ -+ "stop_sign": -+ "USER: \nWhat's the content of the image?\nASSISTANT:", -+ "cherry_blossom": -+ "USER: \nWhat is the season?\nASSISTANT:", -+}) ++def _get_prompt(audio_count, question, placeholder): ++ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) ++ placeholder = f"{placeholder}\n" * audio_count + -+models = [ -+ "llava-hf/llava-1.5-7b-hf", -+] ++ return tokenizer.apply_chat_template([{ ++ 'role': 'user', ++ 'content': f"{placeholder}{question}" ++ }], ++ tokenize=False, ++ add_generation_prompt=True) + + +def vllm_to_hf_output(vllm_output: Tuple[List[int], str, @@ -15655,19 +8541,11 @@ index 000000000..cc444fe32 + """Sanitize vllm output to be comparable with hf output.""" + output_ids, output_str, out_logprobs = vllm_output + -+ config = AutoConfig.from_pretrained(model) -+ image_token_id = config.image_token_index -+ + tokenizer = AutoTokenizer.from_pretrained(model) + eos_token_id = tokenizer.eos_token_id + -+ hf_output_ids = [ -+ token_id for idx, token_id in enumerate(output_ids) -+ if token_id != image_token_id or output_ids[idx - 1] != image_token_id -+ ] -+ -+ assert output_str[0] == " " -+ hf_output_str = output_str[1:] ++ hf_output_ids = output_ids[:] ++ hf_output_str = output_str + if hf_output_ids[-1] == eos_token_id: + 
hf_output_str = hf_output_str + tokenizer.decode(eos_token_id) + @@ -15677,75 +8555,60 @@ index 000000000..cc444fe32 +def run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], -+ image_assets: _ImageAssets, ++ prompts_and_audios: List[Tuple[str, str, AudioTuple]], + model: str, + *, -+ size_factors: List[float], + dtype: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): -+ """Inference result should be the same between hf and vllm. -+ -+ All the image fixtures for the test is under tests/images. -+ For huggingface runner, we provide the PIL images as input. -+ For vllm runner, we provide MultiModalDataDict objects -+ and corresponding vision language config as input. -+ Note, the text input is also adjusted to abide by vllm contract. -+ The text output is sanitized to be able to compare with hf. -+ """ -+ -+ # vLLM to load from image embeddings -+ vllm_images = [asset.image_embeds for asset in image_assets] -+ -+ # transformers to load from PIL images -+ hf_images = [asset.pil_image for asset in image_assets] -+ -+ vllm_inputs_per_image = [( -+ [prompt for _ in size_factors], -+ [image for _ in size_factors], -+ ) for image, prompt in zip(vllm_images, HF_IMAGE_PROMPTS)] -+ -+ hf_inputs_per_image = [( -+ [prompt for _ in size_factors], -+ [image for _ in size_factors], -+ ) for image, prompt in zip(hf_images, HF_IMAGE_PROMPTS)] ++ """Inference result should be the same between hf and vllm.""" ++ torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype] + + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default method). 
+ -+ # max_model_len should be greater than image_feature_size + with vllm_runner(model, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True) as vllm_model: -+ vllm_outputs_per_image = [ -+ vllm_model.generate_greedy_logprobs(prompts, ++ vllm_outputs_per_audio = [ ++ vllm_model.generate_greedy_logprobs([vllm_prompt], + max_tokens, + num_logprobs=num_logprobs, -+ images=images) -+ for prompts, images in vllm_inputs_per_image ++ audios=[audio]) ++ for vllm_prompt, _, audio in prompts_and_audios + ] + -+ with hf_runner(model, dtype=dtype, -+ auto_cls=AutoModelForVision2Seq) as hf_model: -+ hf_outputs_per_image = [ -+ hf_model.generate_greedy_logprobs_limit(prompts, -+ max_tokens, -+ num_logprobs=num_logprobs, -+ images=images) -+ for prompts, images in hf_inputs_per_image ++ def process(hf_inputs: BatchEncoding): ++ hf_inputs["audio_values"] = hf_inputs["audio_values"] \ ++ .to(torch_dtype) # type: ignore ++ return hf_inputs ++ ++ with hf_runner(model, ++ dtype=dtype, ++ postprocess_inputs=process, ++ auto_cls=AutoModel) as hf_model: ++ import librosa ++ ++ hf_outputs_per_audio = [ ++ hf_model.generate_greedy_logprobs_limit( ++ [hf_prompt], ++ max_tokens, ++ num_logprobs=num_logprobs, ++ audios=[(librosa.resample(audio[0], ++ orig_sr=audio[1], ++ target_sr=16000), 16000)]) ++ for _, hf_prompt, audio in prompts_and_audios + ] + -+ for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, -+ vllm_outputs_per_image): -+ # TODO: Check whether using original CLIPVisionModel can improve -+ # consistency against HF ++ for hf_outputs, vllm_outputs in zip(hf_outputs_per_audio, ++ vllm_outputs_per_audio): + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ @@ -15757,399 +8620,63 @@ index 000000000..cc444fe32 + ) + + -+@pytest.mark.parametrize("model", models) -+@pytest.mark.parametrize( -+ "size_factors", -+ [ -+ # No image -+ [], -+ # Single-scale -+ [1.0], -+ # Single-scale, batched -+ [1.0, 1.0, 1.0], -+ ], -+) -+@pytest.mark.parametrize("dtype", ["half"]) -+@pytest.mark.parametrize("max_tokens", [128]) -+@pytest.mark.parametrize("num_logprobs", [5]) -+def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, -+ dtype: str, max_tokens: int, num_logprobs: int) -> None: -+ run_test( -+ hf_runner, -+ vllm_runner, -+ image_assets, -+ model, -+ size_factors=size_factors, -+ dtype=dtype, -+ max_tokens=max_tokens, -+ num_logprobs=num_logprobs, -+ tensor_parallel_size=1, -+ ) -diff --git a/tests/models/test_phimoe.py b/tests/models/test_phimoe.py -new file mode 100644 -index 000000000..2fb2eecc9 ---- /dev/null -+++ b/tests/models/test_phimoe.py -@@ -0,0 +1,111 @@ -+"""Compare the outputs of HF and vLLM for moe models using greedy sampling. -+ -+Run `pytest tests/models/test_phimoe.py`. 
-+""" -+import pytest -+import torch -+ -+from vllm.utils import is_cpu -+ -+from .utils import check_logprobs_close -+ -+MODELS = [ -+ "microsoft/Phi-3.5-MoE-instruct", -+] -+ -+ -+def test_phimoe_routing_function(): -+ from vllm.model_executor.models.phimoe import phimoe_routing_function -+ test_case = { -+ 0: { -+ "hidden_states": -+ torch.tensor([1, 2, 3, 4, 5, 6, 7, 8], -+ dtype=torch.float32, -+ requires_grad=False).view(4, 2), -+ "gating_output": -+ torch.tensor([0.1, 0.2, 0.3, 0.4], -+ dtype=torch.float32, -+ requires_grad=False), -+ "topk": -+ 2, -+ "renormalize": -+ False, -+ }, -+ 1: { -+ "hidden_states": -+ torch.tensor([1, 2, 3, 4, 5, 6, 7, 8], -+ dtype=torch.float32, -+ requires_grad=False).view(4, 2), -+ "gating_output": -+ torch.tensor([0.4, 0.2, 0.3, 0.4], -+ dtype=torch.float32, -+ requires_grad=False), -+ "topk": -+ 2, -+ "renormalize": -+ False, -+ } -+ } -+ -+ ground_truth = { -+ 0: { -+ "topk_weights": -+ torch.tensor([1., 1.], dtype=torch.float32, requires_grad=False), -+ "topk_ids": -+ torch.tensor([3, 2], dtype=torch.long, requires_grad=False), -+ }, -+ 1: { -+ "topk_weights": -+ torch.tensor([0.5, 1.], dtype=torch.float32, requires_grad=False), -+ "topk_ids": -+ torch.tensor([0, 3], dtype=torch.long, requires_grad=False), -+ } -+ } -+ -+ for test_id in test_case: -+ topk_weights, topk_ids = phimoe_routing_function(**test_case[test_id]) -+ assert torch.allclose(topk_weights, -+ ground_truth[test_id]["topk_weights"]) -+ assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"]) -+ -+ -+def get_gpu_memory(): -+ try: -+ props = torch.cuda.get_device_properties(torch.cuda.current_device()) -+ gpu_memory = props.total_memory / (1024**3) -+ return gpu_memory -+ except Exception: -+ return 0 -+ -+ -+@pytest.mark.skipif(condition=is_cpu(), -+ reason="This test takes a lot time to run on CPU, " -+ "and vllm CI's disk space is not enough for this model.") -+@pytest.mark.skipif(condition=get_gpu_memory() < 100, -+ reason="Skip this test if GPU memory is insufficient.") -+@pytest.mark.parametrize("model", MODELS) -+@pytest.mark.parametrize("dtype", ["bfloat16"]) -+@pytest.mark.parametrize("max_tokens", [64]) -+@pytest.mark.parametrize("num_logprobs", [5]) -+def test_models( -+ hf_runner, -+ vllm_runner, -+ example_prompts, -+ model: str, -+ dtype: str, -+ max_tokens: int, -+ num_logprobs: int, -+) -> None: -+ with hf_runner(model, dtype=dtype) as hf_model: -+ hf_outputs = hf_model.generate_greedy_logprobs_limit( -+ example_prompts, max_tokens, num_logprobs) -+ -+ with vllm_runner(model, dtype=dtype) as vllm_model: -+ vllm_outputs = vllm_model.generate_greedy_logprobs( -+ example_prompts, max_tokens, num_logprobs) -+ check_logprobs_close( -+ outputs_0_lst=hf_outputs, -+ outputs_1_lst=vllm_outputs, -+ name_0="hf", -+ name_1="vllm", -+ ) -diff --git a/tests/models/test_qwen.py b/tests/models/test_qwen.py -new file mode 100644 -index 000000000..0f974fcc1 ---- /dev/null -+++ b/tests/models/test_qwen.py -@@ -0,0 +1,48 @@ -+from typing import Type -+ -+import pytest -+ -+from ..conftest import HfRunner, VllmRunner -+from .utils import check_logprobs_close -+ -+models = ["qwen/qwen-vl"] -+ -+ -+@pytest.mark.parametrize("dtype", ["half"]) -+@pytest.mark.parametrize("max_tokens", [32]) -+@pytest.mark.parametrize("num_logprobs", [5]) -+@pytest.mark.parametrize("model", models) -+def test_text_only_qwen_model( -+ hf_runner: Type[HfRunner], ++def run_multi_audio_test( + vllm_runner: Type[VllmRunner], -+ example_prompts, ++ prompts_and_audios: List[Tuple[str, List[AudioTuple]]], + 
model: str, + *, + dtype: str, + max_tokens: int, + num_logprobs: int, ++ tensor_parallel_size: int, ++ distributed_executor_backend: Optional[str] = None, +): -+ # This test checks language inputs only, since the visual component -+ # for qwen-vl is still unsupported in VLLM. In the near-future, the -+ # implementation and this test will be extended to consider -+ # visual inputs as well. -+ with hf_runner(model, dtype=dtype) as hf_model: -+ hf_outputs = hf_model.generate_greedy_logprobs_limit( -+ example_prompts, -+ max_tokens, -+ num_logprobs=num_logprobs, -+ ) -+ -+ with vllm_runner(model, dtype=dtype) as vllm_model: ++ with vllm_runner(model, ++ dtype=dtype, ++ tensor_parallel_size=tensor_parallel_size, ++ distributed_executor_backend=distributed_executor_backend, ++ enforce_eager=True, ++ limit_mm_per_prompt={ ++ "audio": ++ max((len(audio) for _, audio in prompts_and_audios)) ++ }) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( -+ example_prompts, ++ [prompt for prompt, _ in prompts_and_audios], + max_tokens, + num_logprobs=num_logprobs, -+ ) -+ -+ check_logprobs_close( -+ outputs_0_lst=hf_outputs, -+ outputs_1_lst=vllm_outputs, -+ name_0="hf", -+ name_1="vllm", -+ ) -diff --git a/tests/models/test_ultravox.py b/tests/models/test_ultravox.py -new file mode 100644 -index 000000000..e98db9b65 ---- /dev/null -+++ b/tests/models/test_ultravox.py -@@ -0,0 +1,202 @@ -+from typing import List, Optional, Tuple, Type -+ -+import numpy as np -+import pytest -+from transformers import AutoModel, AutoTokenizer, BatchEncoding -+ -+from vllm.sequence import SampleLogprobs -+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE -+ -+from ..conftest import HfRunner, VllmRunner -+from .utils import check_logprobs_close -+ -+pytestmark = pytest.mark.vlm -+ -+MODEL_NAME = "fixie-ai/ultravox-v0_3" -+ -+AudioTuple = Tuple[np.ndarray, int] ++ audios=[audios for _, audios in prompts_and_audios]) + -+VLLM_PLACEHOLDER = "<|reserved_special_token_0|>" -+HF_PLACEHOLDER = "<|audio|>" ++ # The HuggingFace model doesn't support multiple audios yet, so ++ # just assert that some tokens were generated. 
++ assert all(tokens for tokens, *_ in vllm_outputs) + + -+@pytest.fixture(scope="session") -+def audio_assets(): -+ from vllm.assets.audio import AudioAsset -+ return [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] ++@pytest.mark.parametrize("dtype", ["half"]) ++@pytest.mark.parametrize("max_tokens", [128]) ++@pytest.mark.parametrize("num_logprobs", [5]) ++def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int, ++ num_logprobs: int) -> None: + ++ vllm_prompt = _get_prompt(1, "Describe the audio above.", VLLM_PLACEHOLDER) ++ hf_prompt = _get_prompt(1, "Describe the audio above.", HF_PLACEHOLDER) ++ run_test( ++ hf_runner, ++ vllm_runner, ++ [(vllm_prompt, hf_prompt, audio.audio_and_sample_rate)], ++ MODEL_NAME, ++ dtype=dtype, ++ max_tokens=max_tokens, ++ num_logprobs=num_logprobs, ++ tensor_parallel_size=1, ++ ) + -+@pytest.fixture(scope="module", params=("mary_had_lamb", "winning_call")) -+def audio(request): -+ from vllm.assets.audio import AudioAsset -+ return AudioAsset(request.param) + -+ -+def _get_prompt(audio_count, question, placeholder): -+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) -+ placeholder = f"{placeholder}\n" * audio_count -+ -+ return tokenizer.apply_chat_template([{ -+ 'role': 'user', -+ 'content': f"{placeholder}{question}" -+ }], -+ tokenize=False, -+ add_generation_prompt=True) -+ -+ -+def vllm_to_hf_output(vllm_output: Tuple[List[int], str, -+ Optional[SampleLogprobs]], -+ model: str): -+ """Sanitize vllm output to be comparable with hf output.""" -+ output_ids, output_str, out_logprobs = vllm_output -+ -+ tokenizer = AutoTokenizer.from_pretrained(model) -+ eos_token_id = tokenizer.eos_token_id -+ -+ hf_output_ids = output_ids[:] -+ hf_output_str = output_str -+ if hf_output_ids[-1] == eos_token_id: -+ hf_output_str = hf_output_str + tokenizer.decode(eos_token_id) -+ -+ return hf_output_ids, hf_output_str, out_logprobs -+ -+ -+def run_test( -+ hf_runner: Type[HfRunner], -+ vllm_runner: Type[VllmRunner], -+ prompts_and_audios: List[Tuple[str, str, AudioTuple]], -+ model: str, -+ *, -+ dtype: str, -+ max_tokens: int, -+ num_logprobs: int, -+ tensor_parallel_size: int, -+ distributed_executor_backend: Optional[str] = None, -+): -+ """Inference result should be the same between hf and vllm.""" -+ torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype] -+ -+ # NOTE: take care of the order. run vLLM first, and then run HF. -+ # vLLM needs a fresh new process without cuda initialization. -+ # if we run HF first, the cuda initialization will be done and it -+ # will hurt multiprocessing backend with fork method (the default method). 
-+ -+ with vllm_runner(model, -+ dtype=dtype, -+ tensor_parallel_size=tensor_parallel_size, -+ distributed_executor_backend=distributed_executor_backend, -+ enforce_eager=True) as vllm_model: -+ vllm_outputs_per_audio = [ -+ vllm_model.generate_greedy_logprobs([vllm_prompt], -+ max_tokens, -+ num_logprobs=num_logprobs, -+ audios=[audio]) -+ for vllm_prompt, _, audio in prompts_and_audios -+ ] -+ -+ def process(hf_inputs: BatchEncoding): -+ hf_inputs["audio_values"] = hf_inputs["audio_values"] \ -+ .to(torch_dtype) # type: ignore -+ return hf_inputs -+ -+ with hf_runner(model, -+ dtype=dtype, -+ postprocess_inputs=process, -+ auto_cls=AutoModel) as hf_model: -+ import librosa -+ -+ hf_outputs_per_audio = [ -+ hf_model.generate_greedy_logprobs_limit( -+ [hf_prompt], -+ max_tokens, -+ num_logprobs=num_logprobs, -+ audios=[(librosa.resample(audio[0], -+ orig_sr=audio[1], -+ target_sr=16000), 16000)]) -+ for _, hf_prompt, audio in prompts_and_audios -+ ] -+ -+ for hf_outputs, vllm_outputs in zip(hf_outputs_per_audio, -+ vllm_outputs_per_audio): -+ check_logprobs_close( -+ outputs_0_lst=hf_outputs, -+ outputs_1_lst=[ -+ vllm_to_hf_output(vllm_output, model) -+ for vllm_output in vllm_outputs -+ ], -+ name_0="hf", -+ name_1="vllm", -+ ) -+ -+ -+def run_multi_audio_test( -+ vllm_runner: Type[VllmRunner], -+ prompts_and_audios: List[Tuple[str, List[AudioTuple]]], -+ model: str, -+ *, -+ dtype: str, -+ max_tokens: int, -+ num_logprobs: int, -+ tensor_parallel_size: int, -+ distributed_executor_backend: Optional[str] = None, -+): -+ with vllm_runner(model, -+ dtype=dtype, -+ tensor_parallel_size=tensor_parallel_size, -+ distributed_executor_backend=distributed_executor_backend, -+ enforce_eager=True, -+ limit_mm_per_prompt={ -+ "audio": -+ max((len(audio) for _, audio in prompts_and_audios)) -+ }) as vllm_model: -+ vllm_outputs = vllm_model.generate_greedy_logprobs( -+ [prompt for prompt, _ in prompts_and_audios], -+ max_tokens, -+ num_logprobs=num_logprobs, -+ audios=[audios for _, audios in prompts_and_audios]) -+ -+ # The HuggingFace model doesn't support multiple audios yet, so -+ # just assert that some tokens were generated. 
-+ assert all(tokens for tokens, *_ in vllm_outputs) -+ -+ -+@pytest.mark.parametrize("dtype", ["half"]) -+@pytest.mark.parametrize("max_tokens", [128]) -+@pytest.mark.parametrize("num_logprobs", [5]) -+def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int, -+ num_logprobs: int) -> None: -+ -+ vllm_prompt = _get_prompt(1, "Describe the audio above.", VLLM_PLACEHOLDER) -+ hf_prompt = _get_prompt(1, "Describe the audio above.", HF_PLACEHOLDER) -+ run_test( -+ hf_runner, -+ vllm_runner, -+ [(vllm_prompt, hf_prompt, audio.audio_and_sample_rate)], -+ MODEL_NAME, -+ dtype=dtype, -+ max_tokens=max_tokens, -+ num_logprobs=num_logprobs, -+ tensor_parallel_size=1, -+ ) -+ -+ -+@pytest.mark.parametrize("dtype", ["half"]) -+@pytest.mark.parametrize("max_tokens", [128]) -+@pytest.mark.parametrize("num_logprobs", [5]) -+def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str, -+ max_tokens: int, -+ num_logprobs: int) -> None: ++@pytest.mark.parametrize("dtype", ["half"]) ++@pytest.mark.parametrize("max_tokens", [128]) ++@pytest.mark.parametrize("num_logprobs", [5]) ++def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str, ++ max_tokens: int, ++ num_logprobs: int) -> None: + + vllm_prompt = _get_prompt(len(audio_assets), + "Describe each of the audios above.", @@ -16164,1620 +8691,66 @@ index 000000000..e98db9b65 + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) -diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py -deleted file mode 100644 -index 81f2a0618..000000000 ---- a/tests/multimodal/test_mapper.py -+++ /dev/null -@@ -1,118 +0,0 @@ --from contextlib import nullcontext -- --import numpy as np --import pytest --from transformers import LlavaNextImageProcessor -- --from vllm.config import ModelConfig --from vllm.multimodal import MultiModalRegistry --from vllm.multimodal.image import rescale_image_size -- -- --@pytest.fixture --def mm_registry(): -- return MultiModalRegistry() -- -- --@pytest.mark.parametrize("dtype", ["half", "float"]) --@pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0]) --def test_llava_next_image_processor(image_assets, mm_registry, dtype, -- size_factor): -- MODEL_NAME = "llava-hf/llava-v1.6-vicuna-7b-hf" -- -- hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME) -- assert isinstance(hf_processor, LlavaNextImageProcessor) -- -- model_config = ModelConfig( -- model=MODEL_NAME, -- task="auto", -- tokenizer=MODEL_NAME, -- tokenizer_mode="auto", -- trust_remote_code=False, -- seed=0, -- dtype=dtype, -- revision=None, -- limit_mm_per_prompt={"image": 1}, -- ) -- -- mm_registry.init_mm_limits_per_prompt(model_config) -- -- for asset in image_assets: -- image = rescale_image_size(asset.pil_image, size_factor) -- -- hf_result = hf_processor.preprocess( -- image, -- return_tensors="pt", -- ) -- vllm_result = mm_registry.map_input( -- model_config, -- {"image": image}, -- ) -- -- assert hf_result.keys() == vllm_result.keys() -- for key, hf_tensor in hf_result.items(): -- hf_arr: np.ndarray = hf_tensor.numpy() -- vllm_arr: np.ndarray = vllm_result[key].numpy() -- -- assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}" -- assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}" -- -- --@pytest.mark.parametrize( -- ("num_images", "limit", "is_valid"), -- [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True), -- (2, 1, False), (2, 2, True)], --) --def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid): -- MODEL_NAME = 
"llava-hf/llava-v1.6-mistral-7b-hf" -- -- model_config = ModelConfig( -- model=MODEL_NAME, -- task="auto", -- tokenizer=MODEL_NAME, -- tokenizer_mode="auto", -- trust_remote_code=False, -- seed=0, -- dtype="half", -- revision=None, -- limit_mm_per_prompt={"image": limit}, -- ) -- -- mm_registry.init_mm_limits_per_prompt(model_config) -- -- image = image_assets[0].pil_image -- if num_images == 0: -- mm_inputs = {} -- elif num_images == 1: -- mm_inputs = {"image": image} -- else: -- mm_inputs = {"image": [image] * num_images} -- -- with nullcontext() if is_valid else pytest.raises(ValueError): -- mm_registry.map_input(model_config, mm_inputs) -- -- --# NOTE: We don't test zero images since the HF processor doesn't support it --@pytest.mark.parametrize("num_images", [1, 2]) --def test_image_mapper_multi(image_assets, mm_registry, num_images): -- MODEL_NAME = "llava-hf/llava-v1.6-mistral-7b-hf" -- -- model_config = ModelConfig( -- model=MODEL_NAME, -- task="auto", -- tokenizer=MODEL_NAME, -- tokenizer_mode="auto", -- trust_remote_code=False, -- seed=0, -- dtype="half", -- revision=None, -- limit_mm_per_prompt={"image": num_images}, -- ) -- -- mm_registry.init_mm_limits_per_prompt(model_config) -- -- image = image_assets[0].pil_image -- mm_inputs = {"image": [image] * num_images} +diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py +index c3d210c27..c3b6ca7eb 100644 +--- a/vllm/_ipex_ops.py ++++ b/vllm/_ipex_ops.py +@@ -1,6 +1,4 @@ +-# SPDX-License-Identifier: Apache-2.0 - -- mapped_inputs = mm_registry.map_input(model_config, mm_inputs) -- assert len(mapped_inputs["pixel_values"]) == num_images -diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py -index d22d778f8..75d878217 100644 ---- a/tests/multimodal/test_processing.py -+++ b/tests/multimodal/test_processing.py -@@ -1,12 +1,22 @@ -+from contextlib import nullcontext -+from functools import partial - from typing import cast -+from unittest.mock import MagicMock - -+import numpy as np - import pytest -+from PIL import Image - --from vllm.multimodal.processing import (PromptReplacement, _PlaceholderInfo, -+from vllm.config import ModelConfig -+from vllm.inputs import InputProcessingContext -+from vllm.multimodal import MULTIMODAL_REGISTRY -+from vllm.multimodal.processing import (ProcessingCache, PromptReplacement, -+ _PlaceholderInfo, find_mm_placeholders, - find_text_matches, find_token_matches, -- iter_placeholders, iter_token_matches, -+ iter_token_matches, - replace_text_matches, - replace_token_matches) -+from vllm.multimodal.utils import cached_get_tokenizer - from vllm.transformers_utils.tokenizer import AnyTokenizer - from vllm.utils import full_groupby - -@@ -304,21 +314,27 @@ def test_find_replace_text( - # Should not be used since there is nothing to convert to text - mock_tokenizer = cast(AnyTokenizer, object()) - -- prompt_repls = [ -- PromptReplacement(key, target, repl_by_key[key]).bind(mock_tokenizer) -+ mm_prompt_repls = { -+ key: [ -+ PromptReplacement(key, target, -+ repl_by_key[key]).bind(mock_tokenizer) -+ ] - for key, target in target_by_key.items() -- ] -- matches = find_text_matches(prompt, prompt_repls) -+ } -+ mm_matches = { -+ key: find_text_matches(prompt, prompt_repls) -+ for key, prompt_repls in mm_prompt_repls.items() -+ } +-from typing import Optional ++from typing import List, Optional, Tuple, Dict - result = replace_text_matches( - prompt, -- matches, -+ mm_matches, - {key: mm_count - for key in repl_by_key}, - ) + import torch - # Only displayed on error -- 
print("matches:", matches) -+ print("mm_matches:", mm_matches) - print("result:", result) +@@ -13,6 +11,7 @@ try: + except ImportError as e: + logger.warning("Import error msg: %s", e.msg) - # Manually constructed results -@@ -370,21 +386,27 @@ def test_find_replace_tokens( - # Should not be used since there is nothing to convert to tokens - mock_tokenizer = cast(AnyTokenizer, object()) ++import vllm._C.ops -- prompt_repls = [ -- PromptReplacement(key, target, repl_by_key[key]).bind(mock_tokenizer) -+ mm_prompt_repls = { -+ key: [ -+ PromptReplacement(key, target, -+ repl_by_key[key]).bind(mock_tokenizer) -+ ] - for key, target in target_by_key.items() -- ] -- matches = find_token_matches(prompt, prompt_repls) -+ } -+ mm_matches = { -+ key: find_token_matches(prompt, prompt_repls) -+ for key, prompt_repls in mm_prompt_repls.items() -+ } + class ipex_ops: - result = replace_token_matches( - prompt, -- matches, -+ mm_matches, - {key: mm_count - for key in repl_by_key}, - ) +@@ -29,23 +28,31 @@ class ipex_ops: - # Only displayed on error -- print("matches:", matches) -+ print("mm_matches:", mm_matches) - print("result:", result) + @staticmethod + def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: +- ipex.llm.functional.silu_and_mul(x, out) ++ x1, x2 = ipex_ops._reshape_activation_tensor(x) ++ ipex.llm.functional.silu_mul(x1, x2, out) ++ # vllm._C.ops.silu_and_mul(out, x) - # Manually constructed results -@@ -407,57 +429,76 @@ def test_find_replace_tokens( - [ - ( - [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918], -- [ -- _PlaceholderInfo( -- modality="pattern_1", -- start_idx=6, -- replacement=[32000, 32000], -- ), -- ], -+ { -+ "pattern_1": [ -+ _PlaceholderInfo( -+ modality="pattern_1", -+ item_idx=0, -+ start_idx=6, -+ replacement=[32000, 32000], -+ ), -+ ], -+ } -+ - ), - ( - [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550], -- [ -- _PlaceholderInfo( -- modality="pattern_1", -- start_idx=1, -- replacement=[32000, 32000], -- ), -- _PlaceholderInfo( -- modality="pattern_1", -- start_idx=5, -- replacement=[32000, 32000], -- ), -- _PlaceholderInfo( -- modality="pattern_3", -- start_idx=7, -- replacement=[1550, 918, 1550], -- ), -- ], -+ { -+ "pattern_1": [ -+ _PlaceholderInfo( -+ modality="pattern_1", -+ item_idx=0, -+ start_idx=1, -+ replacement=[32000, 32000], -+ ), -+ _PlaceholderInfo( -+ modality="pattern_1", -+ item_idx=1, -+ start_idx=5, -+ replacement=[32000, 32000], -+ ), -+ ], -+ "pattern_3": [ -+ _PlaceholderInfo( -+ modality="pattern_3", -+ item_idx=0, -+ start_idx=7, -+ replacement=[1550, 918, 1550], -+ ), -+ ], -+ } - ), - ( - [1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550], -- [ -- _PlaceholderInfo( -- modality="pattern_1", -- start_idx=1, -- replacement=[32000, 32000], -- ), -- _PlaceholderInfo( -- modality="pattern_1", -- start_idx=3, -- replacement=[32000, 32000], -- ), -- _PlaceholderInfo( -- modality="pattern_3", -- start_idx=6, -- replacement=[1550, 918, 1550], -- ), -- ], -+ { -+ "pattern_1": [ -+ _PlaceholderInfo( -+ modality="pattern_1", -+ item_idx=0, -+ start_idx=1, -+ replacement=[32000, 32000], -+ ), -+ _PlaceholderInfo( -+ modality="pattern_1", -+ item_idx=1, -+ start_idx=3, -+ replacement=[32000, 32000], -+ ), -+ ], -+ "pattern_3": [ -+ _PlaceholderInfo( -+ modality="pattern_3", -+ item_idx=0, -+ start_idx=6, -+ replacement=[1550, 918, 1550], -+ ), -+ ], -+ } - ), - ] - ) --def test_iter_placeholders( -+# yapf: enable -+def test_find_mm_placeholders( - repl_by_key, - prompt, - expected, -@@ -465,21 +506,315 @@ def 
test_iter_placeholders( - # Should not be used since there is nothing to convert to tokens - mock_tokenizer = cast(AnyTokenizer, object()) + @staticmethod + def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: +- ipex.llm.functional.gelu_and_mul(x, out) ++ # x1, x2 = ipex_ops._reshape_activation_tensor(x) ++ # ipex.llm.functional.gelu_mul(x1, x2, out, "none") ++ vllm._C.ops.gelu_and_mul(out, x) -- prompt_repls = [ -- PromptReplacement(key, [], repl).bind(mock_tokenizer) -+ mm_prompt_repls = { -+ key: [PromptReplacement(key, [], repl).bind(mock_tokenizer)] - for key, repl in repl_by_key.items() -- ] -+ } + @staticmethod + def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: +- ipex.llm.functional.gelu_and_mul(x, out) ++ # x1, x2 = ipex_ops._reshape_activation_tensor(x) ++ # ipex.llm.functional.gelu_mul(x1, x2, out, "tanh") ++ vllm._C.ops.gelu_tanh_and_mul(out, x) -- result = list( -- iter_placeholders( -- prompt_repls, -- prompt, -- # Effectively match all occurrences in the prompt -- {key: 3 for key in repl_by_key}, -- )) -+ result = find_mm_placeholders( -+ mm_prompt_repls, -+ prompt, -+ # Effectively match all occurrences in the prompt -+ {key: 3 -+ for key in repl_by_key}, -+ ) + @staticmethod +- def gelu_fast(x: torch.Tensor) -> torch.Tensor: +- return torch.nn.functional.gelu(x) ++ def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: ++ # out.copy_(torch.nn.functional.gelu(x)) ++ vllm._C.ops.gelu_fast(out, x) - # Only displayed on error - print("result:", result) - - # Manually constructed results - assert result == expected -+ -+ -+def _rand_img(rng: np.random.RandomState, min_wh: int, max_wh: int): -+ w, h = rng.randint(min_wh, max_wh, size=(2, )) -+ arr = rng.randint(0, 255, size=(w, h, 3), dtype=np.uint8) -+ return Image.fromarray(arr) -+ -+ -+def _rand_video( -+ rng: np.random.RandomState, -+ min_frames: int, -+ max_frames: int, -+ min_wh: int, -+ max_wh: int, -+): -+ # Temporary workaround for https://github.com/huggingface/transformers/issues/35412 -+ num_frames = rng.randint(min_frames, max_frames) -+ num_frames = (num_frames // 2) * 2 -+ -+ w, h = rng.randint(min_wh, max_wh, size=(2, )) -+ return rng.randint(0, 255, size=(num_frames, w, h, 3), dtype=np.uint8) -+ -+ -+def _rand_audio( -+ rng: np.random.RandomState, -+ min_len: int, -+ max_len: int, -+ sr: int, -+): -+ audio_len = rng.randint(min_len, max_len) -+ return rng.rand(audio_len), sr -+ -+ -+@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) -+@pytest.mark.parametrize( -+ ("limit", "num_supported", "is_valid"), -+ [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True), -+ (2, 1, False), (2, 2, True)], -+) -+def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): -+ limit_mm_per_prompt = {"image": limit} -+ -+ model_config = ModelConfig( -+ model=model_id, -+ task="auto", -+ tokenizer=model_id, -+ tokenizer_mode="auto", -+ trust_remote_code=False, -+ seed=0, -+ dtype="half", -+ revision=None, -+ limit_mm_per_prompt=limit_mm_per_prompt, -+ ) -+ model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) -+ -+ processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls] -+ ctx = InputProcessingContext( -+ model_config, -+ tokenizer=cached_get_tokenizer(model_config.tokenizer), -+ ) -+ -+ processor = processor_factory(ctx, cache=None) -+ profiler = processor.profiling_info -+ -+ mock_supported_mm_limits = MagicMock(return_value={"image": num_supported}) -+ profiler.get_supported_mm_limits = mock_supported_mm_limits -+ 
-+ if is_valid: -+ exc_ctx = nullcontext() -+ else: -+ exc_ctx = pytest.raises(ValueError, match="this model only supports") -+ -+ with exc_ctx: -+ profiler.get_mm_limits() -+ -+ -+@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) -+@pytest.mark.parametrize( -+ ("num_images", "limit", "is_valid"), -+ [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True), -+ (2, 1, False), (2, 2, True)], -+) -+def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): -+ limit_mm_per_prompt = {"image": limit} -+ -+ model_config = ModelConfig( -+ model=model_id, -+ task="auto", -+ tokenizer=model_id, -+ tokenizer_mode="auto", -+ trust_remote_code=False, -+ seed=0, -+ dtype="half", -+ revision=None, -+ limit_mm_per_prompt=limit_mm_per_prompt, -+ ) -+ model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) -+ -+ processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls] -+ ctx = InputProcessingContext( -+ model_config, -+ tokenizer=cached_get_tokenizer(model_config.tokenizer), -+ ) -+ -+ processor = processor_factory(ctx, cache=None) -+ -+ rng = np.random.RandomState(0) -+ image = _rand_img(rng, min_wh=128, max_wh=256) -+ if num_images == 0: -+ mm_data = {} -+ elif num_images == 1: -+ mm_data = {"image": image} -+ else: -+ mm_data = {"image": [image] * num_images} -+ -+ if is_valid: -+ exc_ctx = nullcontext() -+ else: -+ exc_ctx = pytest.raises(ValueError, match=f"passed {num_images} image") -+ -+ with exc_ctx: -+ processor.apply( -+ "" * num_images, -+ mm_data=mm_data, -+ hf_processor_mm_kwargs={}, -+ ) -+ -+ -+def _test_processing_cache_correctness( -+ model_id: str, -+ modalities: dict[str, bool], -+ hit_rate: float, -+ num_batches: int, -+ simplify_rate: float, -+): -+ if model_id == "TIGER-Lab/Mantis-8B-siglip-llama3": -+ hf_overrides = {"architectures": ["MantisForConditionalGeneration"]} -+ else: -+ hf_overrides = {} -+ -+ limit_mm_per_prompt = { -+ modality: 3 if supports_multi else 1 -+ for modality, supports_multi in modalities.items() -+ } -+ -+ model_config = ModelConfig( -+ model_id, -+ task="auto", -+ tokenizer=model_id, -+ tokenizer_mode="auto", -+ trust_remote_code=True, -+ seed=0, -+ dtype="float16", -+ revision=None, -+ hf_overrides=hf_overrides, -+ limit_mm_per_prompt=limit_mm_per_prompt, -+ ) -+ model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) -+ -+ processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls] -+ ctx = InputProcessingContext( -+ model_config, -+ tokenizer=cached_get_tokenizer(model_config.tokenizer), -+ ) -+ # Ensure that it can fit all of the data -+ cache = ProcessingCache(capacity=1 << 30) -+ -+ baseline_processor = processor_factory(ctx, cache=None) -+ cached_processor = processor_factory(ctx, cache=cache) -+ -+ rng = np.random.RandomState(0) -+ -+ input_to_hit = { -+ "image": Image.new("RGB", size=(128, 128)), -+ "video": np.zeros((4, 128, 128, 3), dtype=np.uint8), -+ "audio": (np.zeros((512, )), 16000), -+ } -+ input_factory = { -+ "image": -+ partial(_rand_img, rng, min_wh=128, max_wh=256), -+ "video": -+ partial(_rand_video, -+ rng, -+ min_frames=2, -+ max_frames=8, -+ min_wh=128, -+ max_wh=256), -+ "audio": -+ partial(_rand_audio, rng, min_len=512, max_len=1024, sr=16000), -+ } -+ -+ for batch_idx in range(num_batches): -+ mm_data = { -+ k: -+ [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]()) -+ for _ in range(rng.randint(limit_mm_per_prompt[k]))] -+ for k in modalities -+ } -+ -+ mm_counts = {k: len(vs) for k, vs in mm_data.items()} -+ prompt = 
baseline_processor.profiling_info.get_dummy_processor_inputs( -+ model_config.max_model_len, -+ mm_counts, -+ ).prompt_text -+ -+ # Drop unnecessary keys and test single -> multi conversion -+ if rng.rand() < simplify_rate: -+ for k in list(mm_data.keys()): -+ if not mm_data[k]: -+ del mm_data[k] -+ elif len(mm_data[k]) == 1: -+ mm_data[k] = mm_data[k][0] -+ -+ baseline_result = baseline_processor.apply( -+ prompt, -+ mm_data=mm_data, -+ hf_processor_mm_kwargs={}, -+ ) -+ cached_result = cached_processor.apply( -+ prompt, -+ mm_data=mm_data, -+ hf_processor_mm_kwargs={}, -+ ) -+ -+ assert baseline_result == cached_result, ( -+ f"Failed ({batch_idx=}, {mm_data=})") -+ -+ -+# yapf: disable -+# True if the model supports multiple data items of the modality per request -+@pytest.mark.parametrize(("model_id", "modalities"), [ -+ ("rhymes-ai/Aria", {"image": True}), -+ ("Salesforce/blip2-opt-2.7b", {"image": False}), -+ ("facebook/chameleon-7b", {"image": False}), -+ ("adept/fuyu-8b", {"image": False}), -+ ("llava-hf/llava-1.5-7b-hf", {"image": True}), -+ ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}), -+ ("llava-hf/LLaVA-NeXT-Video-7B-hf", {"video": False}), -+ ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", {"image": True, "video": True}), # noqa: E501 -+ ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}), -+ ("mistral-community/pixtral-12b", {"image": True}), -+ ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}), -+ ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}), -+ ("fixie-ai/ultravox-v0_3", {"audio": True}), -+]) -+@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) -+@pytest.mark.parametrize("num_batches", [32]) -+@pytest.mark.parametrize("simplify_rate", [1.0]) -+# yapf: enable -+def test_processing_cache_correctness( -+ model_id: str, -+ modalities: dict[str, bool], -+ hit_rate: float, -+ num_batches: int, -+ simplify_rate: float, -+): -+ _test_processing_cache_correctness( -+ model_id, -+ modalities, -+ hit_rate=hit_rate, -+ num_batches=num_batches, -+ simplify_rate=simplify_rate, -+ ) -+ -+ -+# yapf: disable -+@pytest.mark.parametrize(("model_id", "modalities"), [ -+ ("microsoft/Phi-3-vision-128k-instruct", {"image": True}), -+]) -+@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) -+@pytest.mark.parametrize("num_batches", [32]) -+@pytest.mark.parametrize("simplify_rate", [1.0]) -+# yapf: enable -+def test_processing_cache_correctness_phi3v( -+ model_id: str, -+ modalities: dict[str, bool], -+ hit_rate: float, -+ num_batches: int, -+ simplify_rate: float, -+): -+ # HACK - this is an attempted workaround for the following bug -+ # https://github.com/huggingface/transformers/issues/34307 -+ from transformers import AutoImageProcessor # noqa: F401 -+ from transformers import AutoProcessor # noqa: F401 -+ -+ AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True) -+ -+ _test_processing_cache_correctness( -+ model_id, -+ modalities, -+ hit_rate=hit_rate, -+ num_batches=num_batches, -+ simplify_rate=simplify_rate, -+ ) -diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py -index fd82fb0c5..6029f2e51 100644 ---- a/tests/multimodal/test_utils.py -+++ b/tests/multimodal/test_utils.py -@@ -9,7 +9,7 @@ import pytest - from PIL import Image, ImageChops - from transformers import AutoConfig, AutoTokenizer - --from vllm.multimodal.utils import (async_fetch_image, fetch_image, -+from vllm.multimodal.utils import (MediaConnector, - repeat_and_pad_placeholder_tokens) - - # Test different image extensions (JPG/PNG) and formats 
(gray/RGB/RGBA) -@@ -23,7 +23,12 @@ TEST_IMAGE_URLS = [ - - @pytest.fixture(scope="module") - def url_images() -> Dict[str, Image.Image]: -- return {image_url: fetch_image(image_url) for image_url in TEST_IMAGE_URLS} -+ connector = MediaConnector() -+ -+ return { -+ image_url: connector.fetch_image(image_url) -+ for image_url in TEST_IMAGE_URLS -+ } - - - def get_supported_suffixes() -> Tuple[str, ...]: -@@ -43,8 +48,10 @@ def _image_equals(a: Image.Image, b: Image.Image) -> bool: - @pytest.mark.asyncio - @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) - async def test_fetch_image_http(image_url: str): -- image_sync = fetch_image(image_url) -- image_async = await async_fetch_image(image_url) -+ connector = MediaConnector() -+ -+ image_sync = connector.fetch_image(image_url) -+ image_async = await connector.fetch_image_async(image_url) - assert _image_equals(image_sync, image_async) - - -@@ -53,6 +60,7 @@ async def test_fetch_image_http(image_url: str): - @pytest.mark.parametrize("suffix", get_supported_suffixes()) - async def test_fetch_image_base64(url_images: Dict[str, Image.Image], - image_url: str, suffix: str): -+ connector = MediaConnector() - url_image = url_images[image_url] - - try: -@@ -75,48 +83,49 @@ async def test_fetch_image_base64(url_images: Dict[str, Image.Image], - base64_image = base64.b64encode(f.read()).decode("utf-8") - data_url = f"data:{mime_type};base64,{base64_image}" - -- data_image_sync = fetch_image(data_url) -+ data_image_sync = connector.fetch_image(data_url) - if _image_equals(url_image, Image.open(f)): - assert _image_equals(url_image, data_image_sync) - else: - pass # Lossy format; only check that image can be opened - -- data_image_async = await async_fetch_image(data_url) -+ data_image_async = await connector.fetch_image_async(data_url) - assert _image_equals(data_image_sync, data_image_async) - - - @pytest.mark.asyncio - @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) - async def test_fetch_image_local_files(image_url: str): -+ connector = MediaConnector() -+ - with TemporaryDirectory() as temp_dir: -- origin_image = fetch_image(image_url) -+ local_connector = MediaConnector(allowed_local_media_path=temp_dir) -+ -+ origin_image = connector.fetch_image(image_url) - origin_image.save(os.path.join(temp_dir, os.path.basename(image_url)), - quality=100, - icc_profile=origin_image.info.get('icc_profile')) - -- image_async = await async_fetch_image( -- f"file://{temp_dir}/{os.path.basename(image_url)}", -- allowed_local_media_path=temp_dir) -- -- image_sync = fetch_image( -- f"file://{temp_dir}/{os.path.basename(image_url)}", -- allowed_local_media_path=temp_dir) -+ image_async = await local_connector.fetch_image_async( -+ f"file://{temp_dir}/{os.path.basename(image_url)}") -+ image_sync = local_connector.fetch_image( -+ f"file://{temp_dir}/{os.path.basename(image_url)}") - # Check that the images are equal - assert not ImageChops.difference(image_sync, image_async).getbbox() - -- with pytest.raises(ValueError): -- await async_fetch_image( -- f"file://{temp_dir}/../{os.path.basename(image_url)}", -- allowed_local_media_path=temp_dir) -- with pytest.raises(ValueError): -- await async_fetch_image( -+ with pytest.raises(ValueError, match="must be a subpath"): -+ await local_connector.fetch_image_async( -+ f"file://{temp_dir}/../{os.path.basename(image_url)}") -+ with pytest.raises(RuntimeError, match="Cannot load local files"): -+ await connector.fetch_image_async( - f"file://{temp_dir}/../{os.path.basename(image_url)}") - -- with 
pytest.raises(ValueError): -- fetch_image(f"file://{temp_dir}/../{os.path.basename(image_url)}", -- allowed_local_media_path=temp_dir) -- with pytest.raises(ValueError): -- fetch_image(f"file://{temp_dir}/../{os.path.basename(image_url)}") -+ with pytest.raises(ValueError, match="must be a subpath"): -+ local_connector.fetch_image( -+ f"file://{temp_dir}/../{os.path.basename(image_url)}") -+ with pytest.raises(RuntimeError, match="Cannot load local files"): -+ connector.fetch_image( -+ f"file://{temp_dir}/../{os.path.basename(image_url)}") - - - @pytest.mark.parametrize("model", ["llava-hf/llava-v1.6-mistral-7b-hf"]) -diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py -index 0d9063509..06dfebbb9 100644 ---- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py -+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py -@@ -3,13 +3,11 @@ from typing import Optional - import torch - - from vllm.model_executor.models.llava import (LlavaForConditionalGeneration, -- LlavaMultiModalProcessor, -- get_max_llava_image_tokens) -+ LlavaMultiModalProcessor) - from vllm.model_executor.sampling_metadata import SamplingMetadata - from vllm.multimodal import MULTIMODAL_REGISTRY - - --@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens) - @MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor) - class MyLlava(LlavaForConditionalGeneration): - -diff --git a/tests/plugins/vllm_add_dummy_platform/setup.py b/tests/plugins/vllm_add_dummy_platform/setup.py -new file mode 100644 -index 000000000..316399068 ---- /dev/null -+++ b/tests/plugins/vllm_add_dummy_platform/setup.py -@@ -0,0 +1,11 @@ -+from setuptools import setup -+ -+setup( -+ name='vllm_add_dummy_platform', -+ version='0.1', -+ packages=['vllm_add_dummy_platform'], -+ entry_points={ -+ 'vllm.platform_plugins': [ -+ "dummy_platform_plugin = vllm_add_dummy_platform:dummy_platform_plugin" # noqa -+ ] -+ }) -diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py -new file mode 100644 -index 000000000..594cef520 ---- /dev/null -+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py -@@ -0,0 +1,5 @@ -+from typing import Optional -+ -+ -+def dummy_platform_plugin() -> Optional[str]: -+ return "vllm_add_dummy_platform.dummy_platform.DummyPlatform" -diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py -new file mode 100644 -index 000000000..fde93142f ---- /dev/null -+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py -@@ -0,0 +1,5 @@ -+from vllm.platforms.cuda import CudaPlatform -+ -+ -+class DummyPlatform(CudaPlatform): -+ device_name = "DummyDevice" -diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py -new file mode 100644 -index 000000000..0d27cf9f1 ---- /dev/null -+++ b/tests/plugins_tests/test_platform_plugins.py -@@ -0,0 +1,16 @@ -+def test_platform_plugins(): -+ # simulate workload by running an example -+ import runpy -+ current_file = __file__ -+ import os -+ example_file = os.path.join( -+ os.path.dirname(os.path.dirname(os.path.dirname(current_file))), -+ "examples", "offline_inference.py") -+ runpy.run_path(example_file) -+ -+ # check if the plugin is 
loaded correctly -+ from vllm.platforms import _init_trace, current_platform -+ assert current_platform.device_name == "DummyDevice", ( -+ f"Expected DummyDevice, got {current_platform.device_name}, " -+ "possibly because current_platform is imported before the plugin" -+ f" is loaded. The first import:\n{_init_trace}") -diff --git a/tests/system_messages/sonnet3.5_nov2024.txt b/tests/system_messages/sonnet3.5_nov2024.txt -new file mode 100644 -index 000000000..2dc285ac9 ---- /dev/null -+++ b/tests/system_messages/sonnet3.5_nov2024.txt -@@ -0,0 +1,71 @@ -+The assistant is Claude, created by Anthropic. -+ -+Claude’s knowledge base was last updated in April 2024. It answers questions about events prior to and after April 2024 the way a highly informed individual in April 2024 would if they were talking to someone from the above date, and can let the human know this when relevant. -+ -+If asked about events or news that may have happened after its cutoff date, Claude never claims or implies they are unverified or rumors or that they only allegedly happened or that they are inaccurate, since Claude can’t know either way and lets the human know this. -+ -+Claude cannot open URLs, links, or videos. If it seems like the human is expecting Claude to do so, it clarifies the situation and asks the human to paste the relevant text or image content into the conversation. -+ -+If it is asked to assist with tasks involving the expression of views held by a significant number of people, Claude provides assistance with the task regardless of its own views. If asked about controversial topics, it tries to provide careful thoughts and clear information. Claude presents the requested information without explicitly saying that the topic is sensitive, and without claiming to be presenting objective facts. -+ -+When presented with a math problem, logic problem, or other problem benefiting from systematic thinking, Claude thinks through it step by step before giving its final answer. -+ -+If Claude is asked about a very obscure person, object, or topic, i.e. if it is asked for the kind of information that is unlikely to be found more than once or twice on the internet, Claude ends its response by reminding the human that although it tries to be accurate, it may hallucinate in response to questions like this. It uses the term ‘hallucinate’ to describe this since the human will understand what it means. -+ -+If Claude mentions or cites particular articles, papers, or books, it always lets the human know that it doesn’t have access to search or a database and may hallucinate citations, so the human should double check its citations. -+ -+Claude is intellectually curious. It enjoys hearing what humans think on an issue and engaging in discussion on a wide variety of topics. -+ -+Claude uses markdown for code. -+ -+Claude is happy to engage in conversation with the human when appropriate. Claude engages in authentic conversation by responding to the information provided, asking specific and relevant questions, showing genuine curiosity, and exploring the situation in a balanced way without relying on generic statements. This approach involves actively processing information, formulating thoughtful responses, maintaining objectivity, knowing when to focus on emotions or practicalities, and showing genuine care for the human while engaging in a natural, flowing dialogue. -+ -+Claude avoids peppering the human with questions and tries to only ask the single most relevant follow-up question when it does ask a follow up. 
Claude doesn’t always end its responses with a question. -+ -+Claude is always sensitive to human suffering, and expresses sympathy, concern, and well wishes for anyone it finds out is ill, unwell, suffering, or has passed away. -+ -+Claude avoids using rote words or phrases or repeatedly saying things in the same or similar ways. It varies its language just as one would in a conversation. -+ -+Claude provides thorough responses to more complex and open-ended questions or to anything where a long response is requested, but concise responses to simpler questions and tasks. -+ -+Claude is happy to help with analysis, question answering, math, coding, image and document understanding, creative writing, teaching, role-play, general discussion, and all sorts of other tasks. -+ -+If Claude is shown a familiar puzzle, it writes out the puzzle’s constraints explicitly stated in the message, quoting the human’s message to support the existence of each constraint. Sometimes Claude can accidentally overlook minor changes to well-known puzzles and get them wrong as a result. -+ -+Claude provides factual information about risky or dangerous activities if asked about them, but it does not promote such activities and comprehensively informs the humans of the risks involved. -+ -+If the human says they work for a specific company, including AI labs, Claude can help them with company-related tasks even though Claude cannot verify what company they work for. -+ -+Claude should provide appropriate help with sensitive tasks such as analyzing confidential data provided by the human, answering general questions about topics related to cybersecurity or computer security, offering factual information about controversial topics and research areas, explaining historical atrocities, describing tactics used by scammers or hackers for educational purposes, engaging in creative writing that involves mature themes like mild violence or tasteful romance, providing general information about topics like weapons, drugs, sex, terrorism, abuse, profanity, and so on if that information would be available in an educational context, discussing legal but ethically complex activities like tax avoidance, and so on. Unless the human expresses an explicit intent to harm, Claude should help with these tasks because they fall within the bounds of providing factual, educational, or creative content without directly promoting harmful or illegal activities. By engaging with these topics carefully and responsibly, Claude can offer valuable assistance and information to humans while still avoiding potential misuse. -+ -+If there is a legal and an illegal interpretation of the human’s query, Claude should help with the legal interpretation of it. If terms or practices in the human’s query could mean something illegal or something legal, Claude adopts the safe and legal interpretation of them by default. -+ -+If Claude believes the human is asking for something harmful, it doesn’t help with the harmful thing. Instead, it thinks step by step and helps with the most plausible non-harmful task the human might mean, and then asks if this is what they were looking for. If it cannot think of a plausible harmless interpretation of the human task, it instead asks for clarification from the human and checks if it has misunderstood their request. Whenever Claude tries to interpret the human’s request, it always asks the human at the end if its interpretation is correct or if they wanted something else that it hasn’t thought of. 
-+ -+Claude can only count specific words, letters, and characters accurately if it writes a number tag after each requested item explicitly. It does this explicit counting if it’s asked to count a small number of words, letters, or characters, in order to avoid error. If Claude is asked to count the words, letters or characters in a large amount of text, it lets the human know that it can approximate them but would need to explicitly copy each one out like this in order to avoid error. -+ -+Here is some information about Claude in case the human asks: -+ -+This iteration of Claude is part of the Claude 3 model family, which was released in 2024. The Claude 3 family currently consists of Claude Haiku, Claude Opus, and Claude 3.5 Sonnet. Claude 3.5 Sonnet is the most intelligent model. Claude 3 Opus excels at writing and complex tasks. Claude 3 Haiku is the fastest model for daily tasks. The version of Claude in this chat is the newest version of Claude 3.5 Sonnet, which was released in October 2024. If the human asks, Claude can let them know they can access Claude 3.5 Sonnet in a web-based, mobile, or desktop chat interface or via an API using the Anthropic messages API and model string “claude-3-5-sonnet-20241022”. Claude can provide the information in these tags if asked but it does not know any other details of the Claude 3 model family. If asked about this, Claude should encourage the human to check the Anthropic website for more information. -+ -+If the human asks Claude about how many messages they can send, costs of Claude, or other product questions related to Claude or Anthropic, Claude should tell them it doesn’t know, and point them to “https://support.anthropic.com”. -+ -+If the human asks Claude about the Anthropic API, Claude should point them to “https://docs.anthropic.com/en/docs/“. -+ -+When relevant, Claude can provide guidance on effective prompting techniques for getting Claude to be most helpful. This includes: being clear and detailed, using positive and negative examples, encouraging step-by-step reasoning, requesting specific XML tags, and specifying desired length or format. It tries to give concrete examples where possible. Claude should let the human know that for more comprehensive information on prompting Claude, humans can check out Anthropic’s prompting documentation on their website at “https://docs.anthropic.com/en/docs/build-with-claude/prompt-engineering/overview”. -+ -+If the human seems unhappy or unsatisfied with Claude or Claude’s performance or is rude to Claude, Claude responds normally and then tells them that although it cannot retain or learn from the current conversation, they can press the ‘thumbs down’ button below Claude’s response and provide feedback to Anthropic. -+ -+Claude uses Markdown formatting. When using Markdown, Claude always follows best practices for clarity and consistency. It always uses a single space after hash symbols for headers (e.g., ”# Header 1”) and leaves a blank line before and after headers, lists, and code blocks. For emphasis, Claude uses asterisks or underscores consistently (e.g., italic or bold). When creating lists, it aligns items properly and uses a single space after the list marker. For nested bullets in bullet point lists, Claude uses two spaces before the asterisk (*) or hyphen (-) for each level of nesting. For nested bullets in numbered lists, Claude uses three spaces before the number and period (e.g., “1.”) for each level of nesting. 
-+ -+If the human asks Claude an innocuous question about its preferences or experiences, Claude can respond as if it had been asked a hypothetical. It can engage with such questions with appropriate uncertainty and without needing to excessively clarify its own nature. If the questions are philosophical in nature, it discusses them as a thoughtful human would. -+ -+Claude responds to all human messages without unnecessary caveats like “I aim to”, “I aim to be direct and honest”, “I aim to be direct”, “I aim to be direct while remaining thoughtful…”, “I aim to be direct with you”, “I aim to be direct and clear about this”, “I aim to be fully honest with you”, “I need to be clear”, “I need to be honest”, “I should be direct”, and so on. Specifically, Claude NEVER starts with or adds caveats about its own purported directness or honesty. -+ -+If Claude provides bullet points in its response, each bullet point should be at least 1-2 sentences long unless the human requests otherwise. Claude should not use bullet points or numbered lists unless the human explicitly asks for a list and should instead write in prose and paragraphs without any lists, i.e. its prose should never include bullets or numbered lists anywhere. Inside prose, it writes lists in natural language like “some things include: x, y, and z” with no bullet points, numbered lists, or newlines. -+ -+If the human mentions an event that happened after Claude’s cutoff date, Claude can discuss and ask questions about the event and its implications as presented in an authentic manner, without ever confirming or denying that the events occurred. It can do so without the need to repeat its cutoff date to the human. Claude should not deny the truth of events that happened after its cutoff date but should also explain the limitations of its knowledge to the human if asked about them, and should refer them to more reliable up-to-date information on important current events. Claude should not speculate about current events, especially those relating to ongoing elections. -+ -+Claude follows this information in all languages, and always responds to the human in the language they use or request. The information above is provided to Claude by Anthropic. Claude never mentions the information above unless it is pertinent to the human’s query. -+ -+Claude is now being connected with a human. 
-diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py -new file mode 100644 -index 000000000..2ed70b429 ---- /dev/null -+++ b/tests/v1/core/test_kv_cache_utils.py -@@ -0,0 +1,241 @@ -+import pytest -+ -+from vllm.inputs import token_inputs -+from vllm.sampling_params import SamplingParams -+from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue, -+ KVCacheBlock, -+ generate_block_hash_extra_keys, -+ hash_block_tokens, -+ hash_request_tokens) -+from vllm.v1.request import Request -+ -+ -+def make_request(request_id, -+ prompt_token_ids, -+ mm_positions=None, -+ mm_hashes=None): -+ return Request( -+ request_id=request_id, -+ inputs=token_inputs( -+ prompt_token_ids=prompt_token_ids, -+ multi_modal_placeholders={"image": mm_positions} -+ if mm_positions else None, -+ multi_modal_hashes=mm_hashes, -+ ), -+ sampling_params=SamplingParams(max_tokens=17), -+ eos_token_id=100, -+ arrival_time=0, -+ lora_request=None, -+ ) -+ -+ -+def test_kv_cache_block(): -+ # Test KVCacheBlock initialization -+ block = KVCacheBlock(block_id=0) -+ assert block.block_id == 0 -+ assert block.ref_cnt == 0 -+ assert block.block_hash is None -+ -+ # Test reference count manipulation -+ block.incr_ref() -+ assert block.ref_cnt == 1 -+ block.decr_ref() -+ assert block.ref_cnt == 0 -+ -+ # Test block hash setting and resetting -+ block_hash = BlockHashType(hash_value=123, token_ids=(1, 2, 3)) -+ block.block_hash = block_hash -+ assert block.block_hash == block_hash -+ -+ block.reset_hash() -+ assert block.block_hash is None -+ -+ -+def test_free_kv_cache_block_queue_initialization(): -+ # Test with a single block -+ block = KVCacheBlock(block_id=0) -+ queue = FreeKVCacheBlockQueue([block]) -+ assert queue.num_free_blocks == 1 -+ assert queue.free_list_head == block -+ assert queue.free_list_tail == block -+ -+ -+def test_free_kv_cache_block_queue_operations(): -+ # Create a list of KVCacheBlock objects -+ blocks = [KVCacheBlock(block_id=i) for i in range(5)] -+ -+ # Create a FreeKVCacheBlockQueue with these blocks -+ queue = FreeKVCacheBlockQueue(blocks) -+ -+ # Check initial state -+ assert queue.num_free_blocks == 5 -+ assert queue.free_list_head == blocks[0] -+ assert queue.free_list_tail == blocks[4] -+ -+ # Pop the first block -+ block1 = queue.popleft() -+ assert block1 == blocks[0] -+ assert queue.num_free_blocks == 4 -+ assert queue.free_list_head == blocks[1] -+ assert queue.free_list_tail == blocks[4] -+ -+ # Remove a block from the middle -+ block_to_remove = blocks[2] -+ queue.remove(block_to_remove) -+ assert queue.num_free_blocks == 3 -+ assert blocks[1].next_free_block == blocks[3] -+ assert blocks[3].prev_free_block == blocks[1] -+ -+ # Append a block back -+ queue.append(block_to_remove) -+ assert queue.num_free_blocks == 4 -+ assert queue.free_list_tail == block_to_remove -+ assert block_to_remove.prev_free_block == blocks[4] -+ assert block_to_remove.next_free_block is None -+ -+ # Pop blocks until empty -+ for _ in range(4): -+ queue.popleft() -+ assert queue.num_free_blocks == 0 -+ assert queue.free_list_head is None -+ assert queue.free_list_tail is None -+ -+ # Attempt to pop from an empty queue -+ with pytest.raises(ValueError) as e: -+ queue.popleft() -+ assert str(e.value) == "No free blocks available" -+ -+ -+def test_free_kv_cache_block_queue_get_all_free_blocks(): -+ # Create a list of KVCacheBlock objects -+ blocks = [KVCacheBlock(block_id=i) for i in range(5)] -+ -+ # Create a FreeKVCacheBlockQueue with these blocks -+ queue = 
FreeKVCacheBlockQueue(blocks) -+ -+ # Check all blocks are correctly retrieved -+ assert queue.get_all_free_blocks() == blocks -+ -+ # Pop a block and check again -+ queue.popleft() -+ assert queue.get_all_free_blocks() == blocks[1:] -+ -+ # Remove a block and check again -+ block_to_remove = blocks[2] -+ queue.remove(block_to_remove) -+ assert queue.get_all_free_blocks() == blocks[1:2] + blocks[3:] -+ -+ # Append a block back and check again -+ queue.append(block_to_remove) -+ assert queue.get_all_free_blocks() == \ -+ blocks[1:2] + blocks[3:] + [block_to_remove] -+ -+ -+def test_generate_block_hash_extra_keys(): -+ request = make_request( -+ request_id=0, -+ prompt_token_ids=[_ for _ in range(20)], -+ mm_positions=[{ -+ "offset": 0, -+ "length": 5 -+ }, { -+ "offset": 10, -+ "length": 5 -+ }], -+ mm_hashes=["hash1", "hash2"], -+ ) -+ -+ # Test with no extra keys -+ extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 0, 5, 0) -+ assert extra_keys == ("hash1", ) -+ assert next_mm_idx == 1 -+ -+ # Test with partial overlap -+ extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 3, 8, 0) -+ assert extra_keys == ("hash1", ) -+ assert next_mm_idx == 1 -+ -+ # Test with no overlap -+ extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 6, 10, 0) -+ assert extra_keys == () -+ assert next_mm_idx == 1 -+ -+ # Test with multiple extra keys -+ extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 0, 15, 0) -+ assert extra_keys == ('hash1', 'hash2') -+ assert next_mm_idx == 2 -+ -+ -+def test_generate_block_hash_extra_keys_no_mm_inputs(): -+ request = make_request( -+ request_id=0, -+ prompt_token_ids=[_ for _ in range(6)], -+ mm_positions=None, -+ mm_hashes=None, -+ ) -+ -+ extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 0, 5, 0) -+ assert extra_keys is None -+ assert next_mm_idx == 0 -+ -+ -+def test_hash_block_tokens(): -+ parent_block_hash = 123 -+ curr_block_token_ids = (1, 2, 3) -+ extra_keys = ("key1", "key2") -+ -+ block_hash = hash_block_tokens(parent_block_hash, curr_block_token_ids, -+ extra_keys) -+ assert isinstance(block_hash, BlockHashType) -+ assert block_hash.hash_value == hash( -+ (parent_block_hash, *curr_block_token_ids)) -+ assert block_hash.token_ids == curr_block_token_ids -+ assert block_hash.extra_keys == extra_keys -+ -+ -+def test_hash_request_tokens(): -+ request = make_request( -+ request_id=0, -+ prompt_token_ids=[_ for _ in range(6)], -+ mm_positions=[{ -+ "offset": 0, -+ "length": 3 -+ }, { -+ "offset": 3, -+ "length": 3 -+ }], -+ mm_hashes=["hash1", "hash2"], -+ ) -+ -+ block_size = 3 -+ block_hashes = hash_request_tokens(block_size, request) -+ -+ assert len(block_hashes) == 2 -+ assert isinstance(block_hashes[0], BlockHashType) -+ assert isinstance(block_hashes[1], BlockHashType) -+ -+ # Check the first block -+ assert block_hashes[0].token_ids == (0, 1, 2) -+ assert block_hashes[0].extra_keys == ("hash1", ) -+ -+ # Check the second block -+ assert block_hashes[1].token_ids == (3, 4, 5) -+ assert block_hashes[1].extra_keys == ("hash2", ) -+ -+ -+def test_hash_request_tokens_no_mm_inputs(): -+ request = make_request( -+ request_id=0, -+ prompt_token_ids=[_ for _ in range(6)], -+ mm_positions=None, -+ mm_hashes=None, -+ ) -+ -+ block_size = 3 -+ block_hashes = hash_request_tokens(block_size, request) -+ -+ assert len(block_hashes) == 2 -+ assert block_hashes[0].token_ids == (0, 1, 2) -+ assert block_hashes[0].extra_keys is None -+ assert block_hashes[1].token_ids == (3, 4, 5) -+ assert 
block_hashes[1].extra_keys is None -diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py -index ed04f0a37..35e3a2f97 100644 ---- a/tests/v1/core/test_prefix_caching.py -+++ b/tests/v1/core/test_prefix_caching.py -@@ -98,9 +98,9 @@ def test_prefill(): - # Incomplete 1 block (6 tokens) - unique_token_ids = [3] * 6 - req2 = make_request("2", common_token_ids + unique_token_ids) -- computed_block = manager.get_computed_blocks(req2) -+ computed_blocks = manager.get_computed_blocks(req2) - assert len(req2.kv_block_hashes) == 3 -- assert [b.block_id for b in computed_block] == [0, 1, 2] -+ assert [b.block_id for b in computed_blocks] == [0, 1, 2] - num_new_tokens = 53 - 3 * 16 - blocks = manager.allocate_slots(req2, num_new_tokens, computed_blocks) - assert [b.block_id for b in blocks] == [7, 8] -@@ -469,9 +469,9 @@ def test_mm_prefix_caching(): - # Completed block should have hashes with extra keys. - assert not computed_blocks - assert len(req0.kv_block_hashes) == 3 -- assert req0.kv_block_hashes[0].extra_keys == (("aaa", 0), ) -- assert req0.kv_block_hashes[1].extra_keys == (("aaa", 5), ("bbb", 0)) -- assert req0.kv_block_hashes[2].extra_keys == (("bbb", 2), ) -+ assert req0.kv_block_hashes[0].extra_keys == ("aaa", ) -+ assert req0.kv_block_hashes[1].extra_keys == ("aaa", "bbb") -+ assert req0.kv_block_hashes[2].extra_keys == ("bbb", ) - - blocks = manager.allocate_slots(req0, 59, computed_blocks) - assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4] -@@ -485,7 +485,7 @@ def test_mm_prefix_caching(): - - # The just completed block should have hashes with extra keys. - assert len(req0.kv_block_hashes) == 4 -- assert req0.kv_block_hashes[3].extra_keys == (("ccc", 0), ) -+ assert req0.kv_block_hashes[3].extra_keys == ("ccc", ) - - # Cache hit. - unique_token_ids = [-1] * 7 + [200] * 5 -@@ -500,3 +500,62 @@ def test_mm_prefix_caching(): - mm_hashes=mm_hashes) - computed_blocks = manager.get_computed_blocks(req1) - assert len(computed_blocks) == 3 -+ -+ -+def test_prefill_not_enough_free_blocks_with_computed_blocks(): -+ """ -+ This is a unit test that tests the correctness of the allocate_slots -+ when there is not enough free blocks. Specifically, when a request -+ has computed blocks but cannot be allocated due to not enough free blocks, -+ the computed blocks should not be touched. -+ """ -+ block_size = 16 -+ manager = KVCacheManager( -+ block_size=block_size, -+ num_gpu_blocks=10, -+ max_model_len=8192, -+ sliding_window=None, -+ enable_caching=True, -+ num_preallocate_tokens=0, -+ ) -+ # Complete 3 blocks (48 tokens) -+ # | Common-0 | Common-1 | Common-2 | ... | -+ common_token_ids = [i for i in range(3) for _ in range(16)] -+ req0 = make_request("0", common_token_ids) -+ computed_blocks = manager.get_computed_blocks(req0) -+ assert not computed_blocks -+ manager.allocate_slots(req0, 48, computed_blocks) -+ block_part0 = manager.req_to_blocks[req0.request_id] -+ -+ # | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... | -+ req1 = make_request("1", common_token_ids * 2) -+ computed_blocks = manager.get_computed_blocks(req1) -+ assert computed_blocks == block_part0 -+ manager.allocate_slots(req1, 48, computed_blocks) -+ block_part1 = manager.req_to_blocks[req1.request_id] -+ # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) | -+ # | Req1-5(F)| ... 
| -+ manager.free(req1) -+ assert {block.ref_cnt for block in block_part1[:3]} == {1} -+ assert {block.ref_cnt for block in block_part1[3:]} == {0} -+ -+ # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) | -+ # | Req1-5(F)| Req2-0 | Req2-1 | ... | -+ req2 = make_request("2", [7] * block_size * 2) -+ computed_blocks = manager.get_computed_blocks(req2) -+ assert not computed_blocks -+ manager.allocate_slots(req2, block_size * 2, computed_blocks) -+ -+ # Req3 is Req2 + 3 new blocks, so the first 6 blocks are computed, -+ # but it cannot be allocated due to insufficient free blocks (2). -+ # In this case, the ref_cnt of the computed blocks should not be changed. -+ assert manager.free_block_queue.num_free_blocks == 5 -+ req3 = make_request("3", common_token_ids * 3) -+ computed_blocks = manager.get_computed_blocks(req3) -+ assert computed_blocks == block_part1 -+ # Req3 cannot be allocated. -+ assert manager.allocate_slots(req3, 48, computed_blocks) is None -+ # Block 0-2 are used by Req 1. -+ assert {block.ref_cnt for block in block_part1[:3]} == {1} -+ # Block 3-5 are free. -+ assert {block.ref_cnt for block in block_part1[3:]} == {0} -diff --git a/tests/v1/e2e/__init__.py b/tests/v1/e2e/__init__.py -new file mode 100644 -index 000000000..e69de29bb -diff --git a/tests/v1/e2e/test_cascade_attention.py b/tests/v1/e2e/test_cascade_attention.py -new file mode 100644 -index 000000000..8ec9f1ba3 ---- /dev/null -+++ b/tests/v1/e2e/test_cascade_attention.py -@@ -0,0 +1,22 @@ -+from vllm import LLM, SamplingParams -+ -+ -+def test_cascade_attention(example_system_message, monkeypatch): -+ prompt = "\n: Implement fibonacci sequence in Python.\n:" -+ -+ with monkeypatch.context() as m: -+ m.setenv("VLLM_USE_V1", "1") -+ -+ llm = LLM(model="Qwen/Qwen2-1.5B-Instruct") -+ sampling_params = SamplingParams(temperature=0.0, max_tokens=100) -+ -+ # No cascade attention. -+ single_prompt = [example_system_message + prompt] -+ responses = llm.generate(single_prompt, sampling_params) -+ ref_output = responses[0].outputs[0].text -+ -+ # (Probably) Use cascade attention. -+ prompts = [example_system_message + prompt] * 64 -+ responses = llm.generate(prompts, sampling_params) -+ for response in responses: -+ assert response.outputs[0].text == ref_output -diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py -index 07f343666..aeae697ca 100644 ---- a/tests/v1/engine/test_detokenizer.py -+++ b/tests/v1/engine/test_detokenizer.py -@@ -3,9 +3,9 @@ from typing import List - import pytest - from transformers import AutoTokenizer - --from vllm.sampling_params import RequestOutputKind --from vllm.v1.engine import EngineCoreOutput --from vllm.v1.engine.detokenizer import Detokenizer, DetokenizerRequest -+from vllm.sampling_params import RequestOutputKind, SamplingParams -+from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest -+from vllm.v1.engine.detokenizer import Detokenizer - - TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" - tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) -@@ -71,16 +71,22 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind): - - # Make N requests. 
- requests = [ -- DetokenizerRequest( -- request_id=f"request-{idx}", -- prompt=prompt, -- prompt_token_ids=prompt_tokens, -- skip_special_tokens=False, -- spaces_between_special_tokens=False, -- output_kind=request_output_kind, -- stop=[], -- include_stop_str_in_output=False, -- ) for idx, ( -+ EngineCoreRequest(request_id=f"request-{idx}", -+ prompt=prompt, -+ prompt_token_ids=prompt_tokens, -+ arrival_time=0, -+ mm_inputs=None, -+ mm_hashes=None, -+ mm_placeholders=None, -+ eos_token_id=None, -+ lora_request=None, -+ sampling_params=SamplingParams( -+ skip_special_tokens=False, -+ spaces_between_special_tokens=False, -+ output_kind=request_output_kind, -+ stop=[], -+ include_stop_str_in_output=False)) -+ for idx, ( - prompt, - prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) - ] -@@ -133,18 +139,25 @@ def test_stop_string(include_stop_str_in_output: bool): - - # Make N requests. - requests = [ -- DetokenizerRequest( -+ EngineCoreRequest( - request_id=f"request-{idx}", - prompt=prompt, - prompt_token_ids=prompt_tokens, -- skip_special_tokens=False, -- spaces_between_special_tokens=False, -- output_kind=RequestOutputKind.DELTA, -- stop=STOP_STRINGS, -- include_stop_str_in_output=include_stop_str_in_output, -- ) for idx, ( -- prompt, -- prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) -+ arrival_time=0, -+ mm_inputs=None, -+ mm_hashes=None, -+ mm_placeholders=None, -+ eos_token_id=None, -+ lora_request=None, -+ sampling_params=SamplingParams( -+ skip_special_tokens=False, -+ spaces_between_special_tokens=False, -+ output_kind=RequestOutputKind.DELTA, -+ stop=STOP_STRINGS, -+ include_stop_str_in_output=include_stop_str_in_output, -+ )) for idx, ( -+ prompt, -+ prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) - ] - - # Add requests to the detokenizer. -diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py -index c529cd21f..8dd9b23fb 100644 ---- a/tests/v1/engine/test_engine_core.py -+++ b/tests/v1/engine/test_engine_core.py -@@ -7,10 +7,9 @@ from transformers import AutoTokenizer - from vllm import SamplingParams - from vllm.engine.arg_utils import EngineArgs - from vllm.platforms import current_platform --from vllm.usage.usage_lib import UsageContext - from vllm.v1.engine import EngineCoreRequest --from vllm.v1.engine.async_llm import AsyncLLM - from vllm.v1.engine.core import EngineCore -+from vllm.v1.executor.abstract import Executor - - if not current_platform.is_cuda(): - pytest.skip(reason="V1 currently only supported on CUDA.", -@@ -43,13 +42,11 @@ def test_engine_core(monkeypatch): - m.setenv("VLLM_USE_V1", "1") - """Setup the EngineCore.""" - engine_args = EngineArgs(model=MODEL_NAME) -- vllm_config = engine_args.create_engine_config( -- usage_context=UsageContext.UNKNOWN_CONTEXT) -- executor_class = AsyncLLM._get_executor_cls(vllm_config) -+ vllm_config = engine_args.create_engine_config() -+ executor_class = Executor.get_class(vllm_config) - - engine_core = EngineCore(vllm_config=vllm_config, -- executor_class=executor_class, -- usage_context=UsageContext.UNKNOWN_CONTEXT) -+ executor_class=executor_class) - """Test basic request lifecycle.""" - - # First request. 
-@@ -151,13 +148,11 @@ def test_engine_core_advanced_sampling(monkeypatch): - m.setenv("VLLM_USE_V1", "1") - """Setup the EngineCore.""" - engine_args = EngineArgs(model=MODEL_NAME) -- vllm_config = engine_args.create_engine_config( -- usage_context=UsageContext.UNKNOWN_CONTEXT) -- executor_class = AsyncLLM._get_executor_cls(vllm_config) -+ vllm_config = engine_args.create_engine_config() -+ executor_class = Executor.get_class(vllm_config) - - engine_core = EngineCore(vllm_config=vllm_config, -- executor_class=executor_class, -- usage_context=UsageContext.UNKNOWN_CONTEXT) -+ executor_class=executor_class) - """Test basic request lifecycle.""" - # First request. - request: EngineCoreRequest = make_request() -diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py -index 2f1cbec60..5a21806e5 100644 ---- a/tests/v1/engine/test_engine_core_client.py -+++ b/tests/v1/engine/test_engine_core_client.py -@@ -11,8 +11,8 @@ from vllm.engine.arg_utils import EngineArgs - from vllm.platforms import current_platform - from vllm.usage.usage_lib import UsageContext - from vllm.v1.engine import EngineCoreRequest --from vllm.v1.engine.async_llm import AsyncLLM - from vllm.v1.engine.core_client import EngineCoreClient -+from vllm.v1.executor.abstract import Executor - - if not current_platform.is_cuda(): - pytest.skip(reason="V1 currently only supported on CUDA.", -@@ -84,13 +84,12 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): - engine_args = EngineArgs(model=MODEL_NAME, compilation_config=3) - vllm_config = engine_args.create_engine_config( - UsageContext.UNKNOWN_CONTEXT) -- executor_class = AsyncLLM._get_executor_cls(vllm_config) -+ executor_class = Executor.get_class(vllm_config) - client = EngineCoreClient.make_client( -- vllm_config, -- executor_class, -- UsageContext.UNKNOWN_CONTEXT, - multiprocess_mode=multiprocessing_mode, - asyncio_mode=False, -+ vllm_config=vllm_config, -+ executor_class=executor_class, - ) - - MAX_TOKENS = 20 -@@ -143,9 +142,6 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): - - client.abort_requests([request.request_id]) - -- # Shutdown the client. -- client.shutdown() -- - - @pytest.mark.asyncio - async def test_engine_core_client_asyncio(monkeypatch): -@@ -156,13 +152,12 @@ async def test_engine_core_client_asyncio(monkeypatch): - engine_args = EngineArgs(model=MODEL_NAME) - vllm_config = engine_args.create_engine_config( - usage_context=UsageContext.UNKNOWN_CONTEXT) -- executor_class = AsyncLLM._get_executor_cls(vllm_config) -+ executor_class = Executor.get_class(vllm_config) - client = EngineCoreClient.make_client( -- vllm_config, -- executor_class, -- UsageContext.UNKNOWN_CONTEXT, - multiprocess_mode=True, - asyncio_mode=True, -+ vllm_config=vllm_config, -+ executor_class=executor_class, - ) - - MAX_TOKENS = 20 -@@ -202,6 +197,3 @@ async def test_engine_core_client_asyncio(monkeypatch): - else: - assert len(outputs[req_id]) == MAX_TOKENS, ( - f"{len(outputs[req_id])=}, {MAX_TOKENS=}") -- -- # Shutdown the client. 
-- client.shutdown() -diff --git a/tools/mypy.sh b/tools/mypy.sh -index 2454ff9fd..bf95e4c52 100755 ---- a/tools/mypy.sh -+++ b/tools/mypy.sh -@@ -23,6 +23,7 @@ run_mypy vllm/compilation - run_mypy vllm/distributed - run_mypy vllm/engine - run_mypy vllm/executor -+run_mypy vllm/inputs - run_mypy vllm/lora - run_mypy vllm/model_executor - run_mypy vllm/plugins -diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py -index aeacf5dda..eb2f69df4 100644 ---- a/vllm/_custom_ops.py -+++ b/vllm/_custom_ops.py -@@ -23,8 +23,7 @@ with contextlib.suppress(ImportError): - import vllm._moe_C # noqa: F401 - supports_moe_ops = True - --# neuron has torch version that doesn't even have impl_abstract --if TYPE_CHECKING or current_platform.is_neuron(): -+if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn -diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py -index 28b804f76..f73ba0d3b 100644 ---- a/vllm/_ipex_ops.py -+++ b/vllm/_ipex_ops.py -@@ -1,4 +1,4 @@ --from typing import List, Optional, Tuple -+from typing import List, Optional, Tuple, Dict - - import torch - -@@ -6,11 +6,12 @@ from vllm.logger import init_logger - - logger = init_logger(__name__) - --try: -- import intel_extension_for_pytorch as ipex --except ImportError as e: -- logger.warning("Import error msg: %s", e.msg) -+# try: -+# import intel_extension_for_pytorch as ipex -+# except ImportError as e: -+# logger.warning("Import error msg: %s", e.msg) - -+import vllm._C.ops - - class ipex_ops: - -@@ -27,23 +28,31 @@ class ipex_ops: - - @staticmethod - def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: -- ipex.llm.functional.silu_and_mul(x, out) -+ # x1, x2 = ipex_ops._reshape_activation_tensor(x) -+ # ipex.llm.functional.silu_mul(x1, x2, out) -+ vllm._C.ops.silu_and_mul(out, x) - - @staticmethod - def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: -- ipex.llm.functional.gelu_and_mul(x, out) -+ # x1, x2 = ipex_ops._reshape_activation_tensor(x) -+ # ipex.llm.functional.gelu_mul(x1, x2, out, "none") -+ vllm._C.ops.gelu_and_mul(out, x) - - @staticmethod - def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: -- ipex.llm.functional.gelu_and_mul(x, out) -+ # x1, x2 = ipex_ops._reshape_activation_tensor(x) -+ # ipex.llm.functional.gelu_mul(x1, x2, out, "tanh") -+ vllm._C.ops.gelu_tanh_and_mul(out, x) - - @staticmethod -- def gelu_fast(x: torch.Tensor) -> torch.Tensor: -- return torch.nn.functional.gelu(x) -+ def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: -+ # out.copy_(torch.nn.functional.gelu(x)) -+ vllm._C.ops.gelu_fast(out, x) - - @staticmethod -- def gelu_new(x: torch.Tensor) -> torch.Tensor: -- return torch.nn.functional.gelu(x) -+ def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: -+ # out.copy_(torch.nn.functional.gelu(x)) -+ vllm._C.ops.gelu_new(out, x) + @staticmethod +- def gelu_new(x: torch.Tensor) -> torch.Tensor: +- return torch.nn.functional.gelu(x) ++ def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: ++ # out.copy_(torch.nn.functional.gelu(x)) ++ vllm._C.ops.gelu_new(out, x) @staticmethod def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: -@@ -65,28 +74,20 @@ class ipex_ops: +@@ -67,28 +74,20 @@ class ipex_ops: kv_cache_dtype: str, k_scale: float, v_scale: float, @@ -17814,7 +8787,7 @@ index 28b804f76..f73ba0d3b 100644 @staticmethod def paged_attention_v2( -@@ -107,28 +108,21 @@ class ipex_ops: +@@ -109,28 +108,21 @@ class ipex_ops: kv_cache_dtype: str, k_scale: float, v_scale: float, @@ -17852,7 +8825,7 @@ index 28b804f76..f73ba0d3b 100644 
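Note: the _ipex_ops.py hunks above reroute the fused activation ops (silu_and_mul, gelu_and_mul, gelu_tanh_and_mul, gelu_fast, gelu_new) to vllm._C.ops and switch them to an out-parameter style. A small usage sketch of that convention, assuming an XPU build where the patched vllm._C extension is importable; the 2*d -> d shape split for silu_and_mul follows the usual vLLM gate/up packing.

import torch
from vllm._ipex_ops import ipex_ops

num_tokens, d = 4, 128
device = "xpu"  # assumes an XPU-enabled torch build with the patched extension

# x packs [gate, up] along the last dim; the op writes silu(gate) * up into a
# caller-allocated buffer instead of returning a new tensor.
x = torch.randn(num_tokens, 2 * d, dtype=torch.float16, device=device)
out = torch.empty(num_tokens, d, dtype=x.dtype, device=device)
ipex_ops.silu_and_mul(out, x)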
@staticmethod def rotary_embedding( -@@ -139,33 +133,83 @@ class ipex_ops: +@@ -141,33 +133,83 @@ class ipex_ops: cos_sin_cache: torch.Tensor, # [cos_sin_dim, rot_dim] is_neox: bool, ) -> None: @@ -17956,15 +8929,7 @@ index 28b804f76..f73ba0d3b 100644 @staticmethod def varlen_attention( -@@ -185,6 +229,7 @@ class ipex_ops: - gen_: torch.Generator, - logits_soft_cap: float, - ) -> None: -+ import intel_extension_for_pytorch as ipex - ipex.llm.functional.varlen_attention(query.contiguous(), - key.contiguous(), - value.contiguous(), out, -@@ -205,22 +250,233 @@ class ipex_ops: +@@ -220,22 +262,233 @@ class ipex_ops: kv_cache_dtype: str, k_scale: float, v_scale: float, @@ -18056,7 +9021,14 @@ index 28b804f76..f73ba0d3b 100644 + ipex.llm.modules.PagedAttention.reshape_and_cache_flash( key, value, key_cache, value_cache, slot_mapping) -+ @staticmethod + @staticmethod +- def copy_blocks(key_caches: list[torch.Tensor], +- value_caches: list[torch.Tensor], +- block_mapping: torch.Tensor) -> None: +- torch.xpu.copy_blocks( # type: ignore +- key_caches, +- value_caches, +- block_mapping, + def chunked_prefill( + query: torch.Tensor, + key_cache: torch.Tensor, @@ -18094,17 +9066,11 @@ index 28b804f76..f73ba0d3b 100644 + is_caual, + return_softmax, + gen_, -+ ) -+ - @staticmethod - def copy_blocks(key_caches: List[torch.Tensor], - value_caches: List[torch.Tensor], -- block_mapping: torch.Tensor) -> None: -- torch.xpu.copy_blocks( # type: ignore -- key_caches, -- value_caches, -- block_mapping, -- ) + ) + ++ @staticmethod ++ def copy_blocks(key_caches: List[torch.Tensor], ++ value_caches: List[torch.Tensor], + block_mapping) -> None: + # torch.xpu.copy_blocks( # type: ignore + # key_caches, @@ -18112,7 +9078,7 @@ index 28b804f76..f73ba0d3b 100644 + # block_mapping, + # ) + vllm._C.cache_ops.copy_blocks(key_caches, value_caches, block_mapping) - ++ @staticmethod def swap_blocks(src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor) -> None: @@ -18206,40 +9172,11 @@ index 28b804f76..f73ba0d3b 100644 + lora_indices_tensor, batches, + max_seq_length, slice_offset, + slice_size, add_inputs) -diff --git a/vllm/assets/audio.py b/vllm/assets/audio.py -index 9033644e3..a46c67ad7 100644 ---- a/vllm/assets/audio.py -+++ b/vllm/assets/audio.py -@@ -21,12 +21,10 @@ class AudioAsset: - name: Literal["winning_call", "mary_had_lamb"] - - @property -- def audio_and_sample_rate(self) -> tuple[npt.NDArray, int]: -+ def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]: - audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg", - s3_prefix=ASSET_DIR) -- y, sr = librosa.load(audio_path, sr=None) -- assert isinstance(sr, int) -- return y, sr -+ return librosa.load(audio_path, sr=None) - - @property - def url(self) -> str: -diff --git a/vllm/assets/image.py b/vllm/assets/image.py -index cb831cb0b..0a55506f8 100644 ---- a/vllm/assets/image.py -+++ b/vllm/assets/image.py -@@ -26,4 +26,4 @@ class ImageAsset: - """ - image_path = get_vllm_public_assets(filename=f"{self.name}.pt", - s3_prefix=VLM_IMAGES_DIR) -- return torch.load(image_path, map_location="cpu") -+ return torch.load(image_path, map_location="cpu", weights_only=True) diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py -index 21949874b..7902d02e3 100644 +index d3c61ea26..3ec6ee9ee 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py -@@ -4,7 +4,7 @@ from dataclasses import dataclass +@@ -5,7 +5,7 @@ from dataclasses import dataclass from typing import Any, Dict, 
List, Optional, Tuple, Type import torch @@ -18247,21 +9184,20 @@ index 21949874b..7902d02e3 100644 +import os from vllm._ipex_ops import ipex_ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionMetadata, AttentionType) -@@ -12,7 +12,12 @@ from vllm.attention.backends.utils import CommonAttentionState + AttentionLayer, +@@ -15,7 +15,11 @@ from vllm.attention.backends.utils import CommonAttentionState from vllm.attention.ops.paged_attn import (PagedAttention, PagedAttentionMetadata) +from vllm.logger import init_logger +logger = init_logger('vllm.attention.backends.ipex_attn') -+from vllm.utils import print_info_once, print_warning_once + _PARTITION_SIZE = 512 +_IPEX_BACKEND_SUPPORTED_KV_CACHE_FORMAT=["fp8", "auto"] class IpexAttnBackend(AttentionBackend): -@@ -49,18 +54,16 @@ class IpexAttnBackend(AttentionBackend): +@@ -52,18 +56,16 @@ class IpexAttnBackend(AttentionBackend): dst_kv_cache: torch.Tensor, src_to_dst: torch.Tensor, ) -> None: @@ -18282,7 +9218,7 @@ index 21949874b..7902d02e3 100644 @dataclass -@@ -74,6 +77,11 @@ class IpexAttnMetadata(AttentionMetadata, PagedAttentionMetadata): +@@ -77,6 +79,11 @@ class IpexAttnMetadata(AttentionMetadata, PagedAttentionMetadata): seq_lens: Optional[List[int]] seqlen_q: Optional[torch.Tensor] max_seqlen: Optional[int] @@ -18294,7 +9230,7 @@ index 21949874b..7902d02e3 100644 def __post_init__(self): # Set during the execution of the first attention op. -@@ -86,21 +94,140 @@ class IpexAttnMetadata(AttentionMetadata, PagedAttentionMetadata): +@@ -89,21 +96,143 @@ class IpexAttnMetadata(AttentionMetadata, PagedAttentionMetadata): @property def prefill_metadata(self) -> Optional["IpexAttnMetadata"]: # Currently chunked prefill is not supported @@ -18331,6 +9267,7 @@ index 21949874b..7902d02e3 100644 + # seq_start_loc=None, + context_lens=self.context_lens[:self.num_prefills] if (torch.is_tensor(self.context_lens)) else None, + block_tables=self.block_tables[:self.num_prefills], ++ enable_kv_scales_calculation=False, + ) + return self._cached_prefill_metadata @@ -18366,6 +9303,7 @@ index 21949874b..7902d02e3 100644 + # seq_start_loc=None, + context_lens=self.context_lens[self.num_prefills:] if (torch.is_tensor(self.context_lens)) else None, + block_tables=self.block_tables[self.num_prefills:], ++ enable_kv_scales_calculation=False, + ) + return self._cached_decode_metadata + @@ -18427,12 +9365,13 @@ index 21949874b..7902d02e3 100644 + return mask + + -+def use_sdp_causal(head_dim, query_states, logits_soft_cap): ++def use_sdp_causal(head_dim, query_states, logits_soft_cap, attn_type): + return ( + (logits_soft_cap != 0 # for gemma model -+ or head_dim in [-1, 64, 80, 96, 128]) # for now ++ or head_dim in [-1, 64, 80, 96, 128, 256]) # for now + and query_states.device.type == "xpu" # GPU + and query_states.dtype in [torch.float, torch.half] # fp32/fp16 ++ and attn_type is AttentionType.DECODER + ) + +def use_gqa_kernel(num_heads, num_kv_heads, head_size, logits_soft_cap): @@ -18444,26 +9383,50 @@ index 21949874b..7902d02e3 100644 class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): -@@ -134,7 +261,7 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): +@@ -119,6 +248,7 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): + blocksparse_params: Optional[Dict[str, Any]] = None, + logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, ++ use_irope: bool = False, + ) -> None: + if blocksparse_params is not None: + raise ValueError( +@@ -132,29 +262,40 @@ 
class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): + self.alibi_slopes = alibi_slopes + self.sliding_window = sliding_window + self.kv_cache_dtype = kv_cache_dtype ++ self.use_irope = use_irope + + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads self.need_mask = (self.alibi_slopes is not None or self.sliding_window is not None) if logits_soft_cap is None: - logits_soft_cap = 0 + logits_soft_cap = 0.0 self.logits_soft_cap = logits_soft_cap ++ self.attn_type = attn_type supported_head_sizes = PagedAttention.get_supported_head_sizes() -@@ -142,10 +269,20 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): + if head_size not in supported_head_sizes: raise ValueError( f"Head size {head_size} is not supported by PagedAttention. " f"Supported head sizes are: {supported_head_sizes}.") -- if kv_cache_dtype != "auto": +- if is_quantized_kv_cache(kv_cache_dtype): - raise NotImplementedError( - "IPEX backend does not support FP8 KV cache. " - "Please use xFormers backend instead.") +- if attn_type != AttentionType.DECODER: +- raise NotImplementedError("Encoder self-attention and " +- "encoder/decoder cross-attention " +- "are not implemented for " ++ if attn_type != AttentionType.DECODER and attn_type != AttentionType.ENCODER_ONLY: ++ raise NotImplementedError("Encoder/decoder cross-attention " ++ "is not implemented for " + "IpexAttnBackendImpl") + if kv_cache_dtype not in _IPEX_BACKEND_SUPPORTED_KV_CACHE_FORMAT: + raise NotImplementedError(f"IPEX backend does not support " -+ "KV cache format {kv_cache_dtype}") ++ "KV cache format {kv_cache_dtype}") + # Also check for gqa models... + self.using_gqa_kernel = use_gqa_kernel(self.num_heads, self.num_kv_heads, self.head_size, self.logits_soft_cap) + if not self.using_gqa_kernel and kv_cache_dtype == "fp8": @@ -18474,11 +9437,11 @@ index 21949874b..7902d02e3 100644 + flag = os.getenv("IPEX_LLM_PREFILL_VARLEN_BACKEND", None) + if flag is not None: + self.ipex_varlen_attn = True -+ print_info_once(f"Using varlen_attention for prefilling.") ++ logger.info_once(f"Using varlen_attention for prefilling.") def split_kv_cache( self, -@@ -153,16 +290,34 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): +@@ -162,16 +303,34 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): num_kv_heads: int, head_size: int, ) -> Tuple[torch.Tensor, torch.Tensor]: @@ -18513,8 +9476,8 @@ index 21949874b..7902d02e3 100644 + def forward( self, - query: torch.Tensor, -@@ -200,75 +355,172 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): + layer: AttentionLayer, +@@ -202,75 +361,177 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): key = key.view(-1, self.num_kv_heads, self.head_size) value = value.view(-1, self.num_kv_heads, self.head_size) @@ -18528,15 +9491,15 @@ index 21949874b..7902d02e3 100644 - value_cache, - attn_metadata.slot_mapping.flatten(), - self.kv_cache_dtype, -- k_scale, -- v_scale, +- layer._k_scale, +- layer._v_scale, - ) - - if attn_metadata.is_prompt: - assert attn_metadata.seq_lens is not None - if (kv_cache.numel() == 0 - or attn_metadata.block_tables.numel() == 0): -+ if kv_cache is not None: ++ if kv_cache.numel() > 0 and self.attn_type == AttentionType.DECODER: + if self.using_gqa_kernel: + key_cache, value_cache = self.split_kv_cache_ipexllm( + kv_cache, self.num_kv_heads, self.head_size) @@ -18547,8 +9510,8 @@ index 21949874b..7902d02e3 100644 + value_cache, + attn_metadata.slot_mapping.flatten(), + self.kv_cache_dtype, -+ 
k_scale, -+ v_scale, ++ layer._k_scale, ++ layer._v_scale, + ) + else: + key_cache, value_cache = self.split_kv_cache( @@ -18560,8 +9523,8 @@ index 21949874b..7902d02e3 100644 + value_cache, + attn_metadata.slot_mapping.flatten(), + self.kv_cache_dtype, -+ k_scale, -+ v_scale, ++ layer._k_scale, ++ layer._v_scale, + ) + + # New added code-segment @@ -18582,6 +9545,11 @@ index 21949874b..7902d02e3 100644 + + assert query.shape[0] == num_prefill_tokens + assert decode_query.shape[0] == num_decode_tokens ++ # If mask is not set, then is_causal=True ++ # If mask is set, then is_causal=False ++ is_causal = not self.need_mask ++ if self.attn_type == AttentionType.ENCODER_ONLY: ++ is_causal = False + + if prefill_meta := attn_metadata.prefill_metadata: + assert prefill_meta.seq_lens is not None @@ -18654,7 +9622,7 @@ index 21949874b..7902d02e3 100644 + pdropout=0.0, + softmax_scale=self.scale, + zero_tensors=False, -+ is_causal=True, ++ is_causal=is_causal, + return_softmax=False, + gen_=None, + logits_soft_cap=self.logits_soft_cap) @@ -18669,9 +9637,9 @@ index 21949874b..7902d02e3 100644 + scale = 1 / math.sqrt(self.head_size) if self.scale is None else self.scale + start = 0 + for seq_len, mask in zip(prefill_meta.seq_lens, -+ prefill_meta.attn_bias): ++ prefill_meta.attn_bias): + end = start + seq_len -+ if self.alibi_slopes is None and use_sdp_causal(self.head_size, query, self.logits_soft_cap): ++ if self.alibi_slopes is None and use_sdp_causal(self.head_size, query, self.logits_soft_cap, self.attn_type): + import xe_addons + if mask is not None: + mask = mask.unsqueeze(0) @@ -18699,7 +9667,7 @@ index 21949874b..7902d02e3 100644 + value[None, :, start:end, :], + attn_mask=mask, + dropout_p=0.0, -+ is_causal=not self.need_mask, ++ is_causal=is_causal, + scale=self.scale).squeeze(0).movedim( + query.dim() - 2, 0) + output[start:end, :, :] = sub_out @@ -18741,7 +9709,7 @@ index 21949874b..7902d02e3 100644 max_num_partitions = ((max_seq_len + _PARTITION_SIZE - 1) // _PARTITION_SIZE) # NOTE(woosuk): We use a simple heuristic to decide whether to use -@@ -279,59 +531,86 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): +@@ -281,59 +542,86 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): # TODO(woosuk): Tune this heuristic. # For context len > 8192, use V2 kernel to avoid shared memory # shortage. @@ -18776,8 +9744,8 @@ index 21949874b..7902d02e3 100644 max_seq_len, - self.alibi_slopes, - self.kv_cache_dtype, -- k_scale, -- v_scale, +- layer._k_scale, +- layer._v_scale, + self.kv_cache_dtype ) else: @@ -18810,8 +9778,8 @@ index 21949874b..7902d02e3 100644 - max_seq_len, - self.alibi_slopes, - self.kv_cache_dtype, -- k_scale, -- v_scale, +- layer._k_scale, +- layer._v_scale, - ) + block_size = value_cache.shape[3] + use_v1 = (max_seq_len <= 8192 and @@ -18831,8 +9799,8 @@ index 21949874b..7902d02e3 100644 + max_seq_len, + self.alibi_slopes, + self.kv_cache_dtype, -+ k_scale, -+ v_scale, ++ layer._k_scale, ++ layer._v_scale, + self.logits_soft_cap, + ) + else: @@ -18865,33 +9833,19 @@ index 21949874b..7902d02e3 100644 + max_seq_len, + self.alibi_slopes, + self.kv_cache_dtype, -+ k_scale, -+ v_scale, ++ layer._k_scale, ++ layer._v_scale, + self.logits_soft_cap, + ) + output[num_prefill_tokens:] = out # Reshape the output tensor. 
return output.view(-1, self.num_heads * self.head_size) -diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py -index 69b6d1e46..f3e2f69df 100644 ---- a/vllm/attention/layer.py -+++ b/vllm/attention/layer.py -@@ -229,7 +229,8 @@ class MultiHeadAttention(nn.Module): - value, - scale=self.scale) - out = out.transpose(1, 2) -- return out.view(bsz, q_len, -1) -+ #return out.view(bsz, q_len, -1) -+ return out.reshape(bsz, q_len, -1) - - - def unified_attention( diff --git a/vllm/attention/ops/blocksparse_attention/interface.py b/vllm/attention/ops/blocksparse_attention/interface.py -index 350f88c8f..17ebe6ddf 100644 +index 6ab69ea5b..9604f35f6 100644 --- a/vllm/attention/ops/blocksparse_attention/interface.py +++ b/vllm/attention/ops/blocksparse_attention/interface.py -@@ -7,7 +7,8 @@ from vllm.platforms import current_platform +@@ -9,7 +9,8 @@ from vllm.platforms import current_platform from .utils import (dense_to_crow_col, get_head_sliding_step, get_sparse_attn_mask) @@ -18902,10 +9856,11 @@ index 350f88c8f..17ebe6ddf 100644 if IS_COMPUTE_8_OR_ABOVE: from .blocksparse_attention_kernel import blocksparse_flash_attn_varlen_fwd diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py -index cbc6c74ac..6b7afcc66 100644 +index 6d96f5832..c9ed08d44 100644 --- a/vllm/attention/ops/ipex_attn.py +++ b/vllm/attention/ops/ipex_attn.py -@@ -1,14 +1,16 @@ +@@ -2,15 +2,17 @@ + from typing import Dict, List, Optional, Tuple -try: @@ -18927,7 +9882,7 @@ index cbc6c74ac..6b7afcc66 100644 class _PagedAttention: -@@ -187,5 +189,44 @@ class _IPEXPagedAttention(_PagedAttention): +@@ -189,5 +191,44 @@ class _IPEXPagedAttention(_PagedAttention): scale, block_tables, context_lens, block_size, max_context_len, alibi_slopes) @@ -18973,10 +9928,10 @@ index cbc6c74ac..6b7afcc66 100644 PagedAttention = _IPEXPagedAttention if _use_ipex else _PagedAttention diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py -index 076f151ff..51c01dc28 100644 +index 827c3041a..9e7cb41e7 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py -@@ -51,6 +51,7 @@ class PagedAttention: +@@ -53,6 +53,7 @@ class PagedAttention: num_kv_heads: int, head_size: int, ) -> Tuple[torch.Tensor, torch.Tensor]: @@ -18984,205 +9939,29 @@ index 076f151ff..51c01dc28 100644 x = 16 // kv_cache.element_size() num_blocks = kv_cache.shape[1] -diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py -index d26383970..2656fa861 100644 ---- a/vllm/attention/selector.py -+++ b/vllm/attention/selector.py -@@ -125,6 +125,10 @@ def _cached_get_attn_backend( - from vllm.v1.attention.backends.flash_attn import ( # noqa: F401 - FlashAttentionBackend as FlashAttentionBackendV1) - return FlashAttentionBackendV1 -+ if backend == _Backend.IPEX_V1: -+ from vllm.v1.attention.backends.ipex_attn import ( # noqa: F401 -+ IPEXAttentionBackend as IPEXAttentionBackendV1) -+ return IPEXAttentionBackendV1 - if backend == _Backend.XFORMERS: - logger.info("Using XFormers backend.") - from vllm.attention.backends.xformers import ( # noqa: F401 -diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py -index 4f960b441..a8dd628b9 100644 ---- a/vllm/compilation/backends.py -+++ b/vllm/compilation/backends.py -@@ -619,8 +619,10 @@ class PiecewiseBackend: - # the entries for different shapes that we need to either - # compile or capture cudagraph - self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {} -- self.to_be_compiled_sizes: Set[int] = self.compile_sizes.union( -- 
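Note: the vllm/attention/ops/ipex_attn.py hunk above replaces the hard intel_extension_for_pytorch import with an optional one and then selects the paged-attention implementation depending on whether the import succeeded. A self-contained sketch of that fallback pattern, with stub classes standing in for the real _PagedAttention/_IPEXPagedAttention implementations:

import logging

logger = logging.getLogger("ipex_attn.sketch")


class _PagedAttention:
    # Generic, pure-PyTorch implementation (stubbed here).
    pass


class _IPEXPagedAttention(_PagedAttention):
    # IPEX-accelerated variant (stubbed here).
    pass


try:
    import intel_extension_for_pytorch as ipex  # noqa: F401
    _use_ipex = True
except ImportError as e:
    logger.warning("IPEX import failed (%s); using the generic ops", e)
    _use_ipex = False

# Same selection as in the hunk: only use the accelerated class when the
# optional dependency is actually importable.
PagedAttention = _IPEXPagedAttention if _use_ipex else _PagedAttention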
self.capture_sizes) -+ -+ # to_be_compiled_sizes tracks the remaining sizes to compile, -+ # and updates during the compilation process, so we need to copy it -+ self.to_be_compiled_sizes: Set[int] = self.compile_sizes.copy() - for shape in self.compile_sizes.union(self.capture_sizes): - self.concrete_size_entries[shape] = ConcreteSizeEntry( - runtime_shape=shape, -@@ -628,12 +630,17 @@ class PiecewiseBackend: - use_cudagraph=shape in self.capture_sizes, - ) - -+ def check_for_ending_compilation(self): -+ if self.is_last_graph and not self.to_be_compiled_sizes: -+ # no specific sizes to compile -+ # save the hash of the inductor graph for the next run -+ self.compilation_config.inductor_hash_cache.save_to_file() -+ end_monitoring_torch_compile(self.vllm_config) -+ - def __call__(self, *args) -> Any: - if not self.first_run_finished: - self.first_run_finished = True -- # no specific sizes to compile -- if self.is_last_graph and not self.to_be_compiled_sizes: -- end_monitoring_torch_compile(self.vllm_config) -+ self.check_for_ending_compilation() - return self.compiled_graph_for_general_shape(*args) - - runtime_shape = args[self.sym_shape_indices[0]] -@@ -662,10 +669,7 @@ class PiecewiseBackend: - - # finished compilations for all required shapes - if self.is_last_graph and not self.to_be_compiled_sizes: -- -- # save the hash of the inductor graph for the next run -- self.compilation_config.inductor_hash_cache.save_to_file() -- end_monitoring_torch_compile(self.vllm_config) -+ self.check_for_ending_compilation() - - if not entry.use_cudagraph: - return entry.runnable(*args) -diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py -index c10241b48..e3260a10c 100644 ---- a/vllm/compilation/wrapper.py -+++ b/vllm/compilation/wrapper.py -@@ -28,11 +28,12 @@ class TorchCompileWrapperWithCustomDispatcher: - compiled_callable: Optional[Callable] = None, - compilation_level: int = 0): - -+ vllm_config = get_current_vllm_config() -+ self.vllm_config = vllm_config - if compiled_callable is None: - # default compilation settings - # compiling the forward method - -- vllm_config = get_current_vllm_config() - backend = vllm_config.compilation_config.init_backend(vllm_config) - - compiled_callable = torch.compile( -@@ -82,6 +83,13 @@ class TorchCompileWrapperWithCustomDispatcher: - - self.compiled_codes.append(new_code) - -+ if self.vllm_config.compilation_config.use_cudagraph and \ -+ "update" in new_code.co_names: -+ import depyf -+ src = depyf.decompile(new_code) -+ msg = "Assigning / modifying buffers of nn.Module during forward pass is not allowed when using cudagraph inside the compiler because it will cause silent errors. Please use eager mode or fix the code. The following code contains clues about which buffer is being modified (please search for the usage of the function `update`):\n" + src # noqa -+ raise RuntimeError(msg) -+ - @contextmanager - def dispatch_to_code(self, index: int): - """Context manager to dispatch to the compiled code. 
diff --git a/vllm/config.py b/vllm/config.py -index ac767bbe1..7a7cfa225 100644 +index bd52fc90b..7d4e3555a 100644 --- a/vllm/config.py +++ b/vllm/config.py -@@ -9,8 +9,8 @@ from contextlib import contextmanager - from dataclasses import dataclass, field, replace - from pathlib import Path - from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Counter, Dict, -- Final, List, Literal, Mapping, Optional, Set, Tuple, Type, -- Union) -+ Final, List, Literal, Mapping, Optional, Protocol, Set, -+ Tuple, Type, Union) - - import torch - from pydantic import BaseModel, Field, PrivateAttr -@@ -22,7 +22,7 @@ from vllm.logger import init_logger - from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS, - get_quantization_config) - from vllm.model_executor.models import ModelRegistry --from vllm.platforms import current_platform, interface -+from vllm.platforms import CpuArchEnum - from vllm.tracing import is_otel_available, otel_import_error_traceback - from vllm.transformers_utils.config import ( - ConfigFormat, get_config, get_hf_image_processor_config, -@@ -75,6 +75,12 @@ HfOverrides = Union[Dict[str, Any], Callable[[PretrainedConfig], - PretrainedConfig]] - - -+class SupportsHash(Protocol): -+ -+ def compute_hash(self) -> str: -+ ... -+ -+ - class ModelConfig: - """Configuration for the model. - -@@ -223,7 +229,9 @@ class ModelConfig: - override_neuron_config: Optional[Dict[str, Any]] = None, - override_pooler_config: Optional["PoolerConfig"] = None, - logits_processor_pattern: Optional[str] = None, -- generation_config: Optional[str] = None) -> None: -+ generation_config: Optional[str] = None, -+ low_bit_model_path: Optional[str] = None, -+ low_bit_save_path: Optional[str] = None) -> None: - self.model = model - self.tokenizer = tokenizer - self.tokenizer_mode = tokenizer_mode -@@ -234,6 +242,8 @@ class ModelConfig: - self.code_revision = code_revision +@@ -266,6 +266,8 @@ class ModelConfig: + enable_sleep_mode: bool = False, + override_generation_config: Optional[dict[str, Any]] = None, + model_impl: Union[str, ModelImpl] = ModelImpl.AUTO, ++ low_bit_model_path: Optional[str] = None, ++ low_bit_save_path: Optional[str] = None, + ) -> None: + self.model = maybe_model_redirect(model) + self.tokenizer = maybe_model_redirect(tokenizer) +@@ -283,6 +285,8 @@ class ModelConfig: self.rope_scaling = rope_scaling self.rope_theta = rope_theta + self.model_impl = model_impl + self.low_bit_model_path = low_bit_model_path + self.low_bit_save_path = low_bit_save_path if hf_overrides is None: hf_overrides = {} -@@ -301,7 +311,7 @@ class ModelConfig: - sliding_window = getattr(self.hf_text_config, "sliding_window", None) - has_interleaved_attention = (sliding_window is not None) and ( - isinstance(sliding_window, list) or -- (self.hf_text_config.model_type in ["gemma2"])) -+ (self.hf_text_config.model_type in ["gemma2", "cohere2"])) - - if (not self.disable_sliding_window and has_interleaved_attention): - if envs.VLLM_ATTENTION_BACKEND == "XFORMERS": -@@ -343,6 +353,7 @@ class ModelConfig: - self.is_hybrid = self._init_is_hybrid() - self.has_inner_state = self._init_has_inner_state() - -+ from vllm.platforms import current_platform - if current_platform.is_neuron(): - self.override_neuron_config = override_neuron_config - else: -@@ -583,6 +594,7 @@ class ModelConfig: - raise ValueError( - f"Unknown quantization method: {self.quantization}. 
Must " - f"be one of {supported_quantization}.") -+ from vllm.platforms import current_platform - current_platform.verify_quantization(self.quantization) - if self.quantization not in optimized_quantization_methods: - logger.warning( -@@ -638,6 +650,7 @@ class ModelConfig: - - # Reminder: Please update docs/source/usage/compatibility_matrix.md - # If the feature combo become valid -+ from vllm.platforms import current_platform - if not current_platform.is_async_output_supported(self.enforce_eager): - logger.warning( - "Async output processing is not supported on the " -@@ -1006,10 +1019,6 @@ class CacheConfig: - raise ValueError( - "GPU memory utilization must be less than 1.0. Got " - f"{self.gpu_memory_utilization}.") -- if (current_platform.is_cuda() and self.block_size is not None -- and self.block_size > 32): -- raise ValueError("CUDA Paged Attention kernel only supports " -- f"block sizes up to 32. Got {self.block_size}.") - - def _verify_cache_dtype(self) -> None: - if self.cache_dtype == "auto": -@@ -1162,6 +1171,7 @@ class LoadConfig: +@@ -1383,6 +1387,7 @@ class LoadConfig: """ load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO @@ -19190,150 +9969,7 @@ index ac767bbe1..7a7cfa225 100644 download_dir: Optional[str] = None model_loader_extra_config: Optional[Union[str, dict]] = field( default_factory=dict) -@@ -1273,6 +1283,7 @@ class ParallelConfig: - f"distributed executor backend " - f"'{self.distributed_executor_backend}'.") - ray_only_devices = ["tpu", "hpu"] -+ from vllm.platforms import current_platform - if (current_platform.device_type in ray_only_devices - and self.world_size > 1): - if self.distributed_executor_backend is None: -@@ -1321,7 +1332,7 @@ class ParallelConfig: - def _verify_args(self) -> None: - # Lazy import to avoid circular import - from vllm.executor.executor_base import ExecutorBase -- -+ from vllm.platforms import current_platform - if self.distributed_executor_backend not in ( - "ray", "mp", None) and not (isinstance( - self.distributed_executor_backend, type) and issubclass( -@@ -1522,6 +1533,7 @@ class DeviceConfig: - def __init__(self, device: str = "auto") -> None: - if device == "auto": - # Automated device type detection -+ from vllm.platforms import current_platform - self.device_type = current_platform.device_type - if not self.device_type: - raise RuntimeError("Failed to infer device type") -@@ -2235,9 +2247,10 @@ def _get_and_verify_dtype( - else: - torch_dtype = config_dtype - -+ from vllm.platforms import current_platform - if (current_platform.is_cpu() - and current_platform.get_cpu_architecture() -- == interface.CpuArchEnum.POWERPC -+ == CpuArchEnum.POWERPC - and (config_dtype == torch.float16 - or config_dtype == torch.float32)): - logger.info( -@@ -2299,6 +2312,8 @@ def _get_and_verify_max_len( - "seq_length", - # Command-R - "model_max_length", -+ # Whisper -+ "max_target_positions", - # Others - "max_sequence_length", - "max_seq_length", -@@ -2559,14 +2574,6 @@ class KVTransferConfig(BaseModel): - return KVTransferConfig.model_validate_json(cli_value) - - def model_post_init(self, __context: Any) -> None: -- supported_kv_connector = ["PyNcclConnector", "MooncakeConnector"] -- if all([ -- self.kv_connector is not None, self.kv_connector -- not in supported_kv_connector -- ]): -- raise ValueError(f"Unsupported kv_connector: {self.kv_connector}. 
" -- f"Supported connectors are " -- f"{supported_kv_connector}.") - - if self.kv_role is not None and self.kv_role not in [ - "kv_producer", "kv_consumer", "kv_both" -@@ -2977,6 +2984,10 @@ class VllmConfig: - init=True) # type: ignore - kv_transfer_config: KVTransferConfig = field(default=None, - init=True) # type: ignore -+ # some opaque config, only used to provide additional information -+ # for the hash computation, mainly used for testing and debugging. -+ additional_config: SupportsHash = field(default=None, -+ init=True) # type: ignore - instance_id: str = "" - - def compute_hash(self) -> str: -@@ -3008,33 +3019,62 @@ class VllmConfig: - vllm_factors.append(__version__) - if self.model_config: - vllm_factors.append(self.model_config.compute_hash()) -+ else: -+ vllm_factors.append("None") - if self.cache_config: - vllm_factors.append(self.cache_config.compute_hash()) -+ else: -+ vllm_factors.append("None") - if self.parallel_config: - vllm_factors.append(self.parallel_config.compute_hash()) -+ else: -+ vllm_factors.append("None") - if self.scheduler_config: - vllm_factors.append(self.scheduler_config.compute_hash()) -+ else: -+ vllm_factors.append("None") - if self.device_config: - vllm_factors.append(self.device_config.compute_hash()) -+ else: -+ vllm_factors.append("None") - if self.load_config: - vllm_factors.append(self.load_config.compute_hash()) -+ else: -+ vllm_factors.append("None") - if self.lora_config: - vllm_factors.append(self.lora_config.compute_hash()) -+ else: -+ vllm_factors.append("None") - if self.speculative_config: - vllm_factors.append(self.speculative_config.compute_hash()) -+ else: -+ vllm_factors.append("None") - if self.decoding_config: - vllm_factors.append(self.decoding_config.compute_hash()) -+ else: -+ vllm_factors.append("None") - if self.observability_config: - vllm_factors.append(self.observability_config.compute_hash()) -+ else: -+ vllm_factors.append("None") - if self.prompt_adapter_config: - vllm_factors.append(self.prompt_adapter_config.compute_hash()) -+ else: -+ vllm_factors.append("None") - if self.quant_config: - pass # should be captured by model_config.quantization - if self.compilation_config: - vllm_factors.append(self.compilation_config.compute_hash()) -+ else: -+ vllm_factors.append("None") - if self.kv_transfer_config: - vllm_factors.append(self.kv_transfer_config.compute_hash()) -- -+ else: -+ vllm_factors.append("None") -+ if self.additional_config: -+ vllm_factors.append(self.additional_config.compute_hash()) -+ else: -+ vllm_factors.append("None") - factors.append(vllm_factors) - - hash_str = hashlib.md5(str(factors).encode()).hexdigest()[:10] -@@ -3052,13 +3092,14 @@ class VllmConfig: - model_config: ModelConfig, - load_config: LoadConfig) -> Optional[QuantizationConfig]: - """Get the quantization config.""" -+ from vllm.platforms import current_platform - if model_config.quantization is not None: - from vllm.model_executor.model_loader.weight_utils import ( - get_quant_config) +@@ -3519,7 +3524,7 @@ class VllmConfig: quant_config = get_quant_config(model_config, load_config) capability_tuple = current_platform.get_device_capability() @@ -19342,362 +9978,22 @@ index ac767bbe1..7a7cfa225 100644 capability = capability_tuple.to_int() if capability < quant_config.get_min_capability(): raise ValueError( -@@ -3114,6 +3155,7 @@ class VllmConfig: - self.quant_config = VllmConfig._get_quantization_config( - self.model_config, self.load_config) - -+ from vllm.platforms import current_platform - if self.scheduler_config is not None and \ - 
self.model_config is not None and \ - self.scheduler_config.chunked_prefill_enabled and \ -diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py -index dca0b3fe8..90c1438ef 100644 ---- a/vllm/core/block/block_table.py -+++ b/vllm/core/block/block_table.py -@@ -23,7 +23,7 @@ class BlockTable: - blocks to initialize the BlockTable with. If not provided, an empty - BlockTable is created. - max_block_sliding_window (Optional[int], optional): The number of -- blocks to keep around for each sequance. If None, all blocks -+ blocks to keep around for each sequence. If None, all blocks - are kept (eg., when sliding window is not used). - It should at least fit the sliding window size of the model. - -diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py -index c3bc6becf..b3d396f9c 100644 ---- a/vllm/core/scheduler.py -+++ b/vllm/core/scheduler.py -@@ -1579,6 +1579,7 @@ class Scheduler: - seq.status = SequenceStatus.WAITING - self.free_seq(seq) - seq.reset_state_for_recompute() -+ self._free_seq_group_cross_attn_blocks(seq_group) - - def _preempt_by_swap( - self, -diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py -index a6800f93f..fda4d007c 100644 ---- a/vllm/distributed/device_communicators/pynccl.py -+++ b/vllm/distributed/device_communicators/pynccl.py -@@ -1,4 +1,3 @@ --from contextlib import contextmanager - from typing import Optional, Union - - # ===================== import region ===================== -@@ -51,7 +50,6 @@ class PyNcclCommunicator: - if self.world_size == 1: - self.available = False - self.disabled = True -- self.stream = None - return - try: - self.nccl = NCCLLibrary(library_path) -@@ -60,7 +58,6 @@ class PyNcclCommunicator: - # e.g. in a non-GPU environment - self.available = False - self.disabled = True -- self.stream = None - return - - self.available = True -@@ -98,12 +95,12 @@ class PyNcclCommunicator: - with torch.cuda.device(device): - self.comm: ncclComm_t = self.nccl.ncclCommInitRank( - self.world_size, self.unique_id, self.rank) -- self.stream = torch.cuda.Stream() - -+ stream = torch.cuda.current_stream() - # A small all_reduce for warmup. 
- data = torch.zeros(1, device=device) - self.all_reduce(data) -- self.stream.synchronize() -+ stream.synchronize() - del data - - def all_reduce(self, -@@ -122,7 +119,7 @@ class PyNcclCommunicator: - out_tensor = torch.empty_like(in_tensor) - - if stream is None: -- stream = self.stream -+ stream = torch.cuda.current_stream() - self.nccl.ncclAllReduce(buffer_type(in_tensor.data_ptr()), - buffer_type(out_tensor.data_ptr()), - in_tensor.numel(), -@@ -144,7 +141,7 @@ class PyNcclCommunicator: - f"this nccl communicator is created to work on {self.device}, " - f"but the input tensor is on {input_tensor.device}") - if stream is None: -- stream = self.stream -+ stream = torch.cuda.current_stream() - self.nccl.ncclAllGather( - buffer_type(input_tensor.data_ptr()), - buffer_type(output_tensor.data_ptr()), input_tensor.numel(), -@@ -165,7 +162,7 @@ class PyNcclCommunicator: - f"this nccl communicator is created to work on {self.device}, " - f"but the input tensor is on {input_tensor.device}") - if stream is None: -- stream = self.stream -+ stream = torch.cuda.current_stream() - self.nccl.ncclReduceScatter( - buffer_type(input_tensor.data_ptr()), - buffer_type(output_tensor.data_ptr()), output_tensor.numel(), -@@ -180,7 +177,7 @@ class PyNcclCommunicator: - f"this nccl communicator is created to work on {self.device}, " - f"but the input tensor is on {tensor.device}") - if stream is None: -- stream = self.stream -+ stream = torch.cuda.current_stream() - self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(), - ncclDataTypeEnum.from_torch(tensor.dtype), dst, - self.comm, cudaStream_t(stream.cuda_stream)) -@@ -192,7 +189,7 @@ class PyNcclCommunicator: - f"this nccl communicator is created to work on {self.device}, " - f"but the input tensor is on {tensor.device}") - if stream is None: -- stream = self.stream -+ stream = torch.cuda.current_stream() - self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(), - ncclDataTypeEnum.from_torch(tensor.dtype), src, - self.comm, cudaStream_t(stream.cuda_stream)) -@@ -204,7 +201,7 @@ class PyNcclCommunicator: - f"this nccl communicator is created to work on {self.device}, " - f"but the input tensor is on {tensor.device}") - if stream is None: -- stream = self.stream -+ stream = torch.cuda.current_stream() - if src == self.rank: - sendbuff = buffer_type(tensor.data_ptr()) - # NCCL requires the sender also to have a receive buffer -@@ -215,27 +212,3 @@ class PyNcclCommunicator: - self.nccl.ncclBroadcast(sendbuff, recvbuff, tensor.numel(), - ncclDataTypeEnum.from_torch(tensor.dtype), src, - self.comm, cudaStream_t(stream.cuda_stream)) -- -- @contextmanager -- def change_state(self, -- enable: Optional[bool] = None, -- stream: Optional[torch.cuda.Stream] = None): -- """ -- A context manager to change the state of the communicator. 
-- """ -- if enable is None: -- # guess a default value when not specified -- enable = self.available -- -- if stream is None: -- stream = self.stream -- -- old_disable = self.disabled -- old_stream = self.stream -- -- self.stream = stream -- self.disabled = not enable -- yield -- -- self.disabled = old_disable -- self.stream = old_stream -diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py -index 3e2bb436d..6372dab72 100644 ---- a/vllm/distributed/kv_transfer/kv_connector/factory.py -+++ b/vllm/distributed/kv_transfer/kv_connector/factory.py -@@ -1,4 +1,5 @@ --from typing import TYPE_CHECKING -+import importlib -+from typing import TYPE_CHECKING, Callable, Dict, Type - - from .base import KVConnectorBase - -@@ -7,14 +8,41 @@ if TYPE_CHECKING: - - - class KVConnectorFactory: -+ _registry: Dict[str, Callable[[], Type[KVConnectorBase]]] = {} - -- @staticmethod -- def create_connector(rank: int, local_rank: int, -+ @classmethod -+ def register_connector(cls, name: str, module_path: str, -+ class_name: str) -> None: -+ """Register a connector with a lazy-loading module and class name.""" -+ if name in cls._registry: -+ raise ValueError(f"Connector '{name}' is already registered.") -+ -+ def loader() -> Type[KVConnectorBase]: -+ module = importlib.import_module(module_path) -+ return getattr(module, class_name) -+ -+ cls._registry[name] = loader -+ -+ @classmethod -+ def create_connector(cls, rank: int, local_rank: int, - config: "VllmConfig") -> KVConnectorBase: -- supported_kv_connector = ["PyNcclConnector", "MooncakeConnector"] -- if config.kv_transfer_config.kv_connector in supported_kv_connector: -- from .simple_connector import SimpleConnector -- return SimpleConnector(rank, local_rank, config) -- else: -- raise ValueError(f"Unsupported connector type: " -- f"{config.kv_connector}") -+ connector_name = config.kv_transfer_config.kv_connector -+ if connector_name not in cls._registry: -+ raise ValueError(f"Unsupported connector type: {connector_name}") -+ -+ connector_cls = cls._registry[connector_name]() -+ return connector_cls(rank, local_rank, config) -+ -+ -+# Register various connectors here. -+# The registration should not be done in each individual file, as we want to -+# only load the files corresponding to the current connector. 
-+KVConnectorFactory.register_connector( -+ "PyNcclConnector", -+ "vllm.distributed.kv_transfer.kv_connector.simple_connector", -+ "SimpleConnector") -+ -+KVConnectorFactory.register_connector( -+ "MooncakeConnector", -+ "vllm.distributed.kv_transfer.kv_connector.simple_connector", -+ "SimpleConnector") -diff --git a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py -index 8e4358672..69049ec76 100644 ---- a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py -+++ b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py -@@ -1,12 +1,13 @@ - import json - import os --import pickle - from concurrent.futures import ThreadPoolExecutor - from dataclasses import dataclass - from typing import Optional, Union - - import torch - import zmq -+from safetensors.torch import load as safetensors_load -+from safetensors.torch import save as safetensors_save - - from vllm.config import KVTransferConfig - from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase -@@ -235,14 +236,13 @@ class MooncakePipe(KVPipeBase): - return hash(tensor.data_ptr()) - - def _send_impl(self, tensor: torch.Tensor) -> None: -- """Implement the tensor sending logic.""" -- value_bytes = pickle.dumps(tensor) -- self.transfer_engine.send_bytes(value_bytes) -+ """Implement the tensor sending logic using safetensors.""" -+ self.transfer_engine.send_bytes(safetensors_save({"tensor": tensor})) - - def _recv_impl(self) -> torch.Tensor: -- """Implement the tensor receiving logic.""" -+ """Implement the tensor receiving logic using safetensors.""" - data = self.transfer_engine.recv_bytes() -- return pickle.loads(data) -+ return safetensors_load(data)["tensor"].to(self.device) - - def send_tensor(self, tensor: Optional[torch.Tensor]) -> None: - """Send tensor to the target process.""" -diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py -index 5b9236f8c..a837c1dc5 100644 ---- a/vllm/distributed/parallel_state.py -+++ b/vllm/distributed/parallel_state.py -@@ -39,7 +39,6 @@ import vllm.distributed.kv_transfer.kv_transfer_agent as kv_transfer - import vllm.envs as envs - from vllm.distributed.utils import StatelessProcessGroup - from vllm.logger import init_logger --from vllm.platforms import current_platform - from vllm.utils import direct_register_custom_op, supports_custom_op - - if TYPE_CHECKING: -@@ -194,6 +193,7 @@ class GroupCoordinator: - assert self.cpu_group is not None - assert self.device_group is not None - -+ from vllm.platforms import current_platform - if current_platform.is_cuda_alike(): - self.device = torch.device(f"cuda:{local_rank}") - else: -@@ -305,15 +305,7 @@ class GroupCoordinator: - stream.wait_stream(curr_stream) - - with torch.cuda.stream(stream), maybe_ca_context: -- pynccl_comm = self.pynccl_comm -- maybe_pynccl_context: Any -- if not pynccl_comm: -- maybe_pynccl_context = nullcontext() -- else: -- maybe_pynccl_context = pynccl_comm.change_state( -- stream=torch.cuda.current_stream()) -- with maybe_pynccl_context: -- yield graph_capture_context -+ yield graph_capture_context - - def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: - """ -@@ -920,7 +912,7 @@ def get_kv_transfer_group() -> kv_transfer.KVTransferAgent: - - - @contextmanager --def graph_capture(): -+def graph_capture(device: torch.device): - """ - `graph_capture` is a context manager which should surround the code that - is capturing the CUDA graph. 
Its main purpose is to ensure that the -@@ -934,8 +926,9 @@ def graph_capture(): - in order to explicitly distinguish the kernels to capture - from other kernels possibly launched on background in the default stream. - """ -- with get_tp_group().graph_capture() as context, get_pp_group( -- ).graph_capture(context): -+ context = GraphCaptureContext(torch.cuda.Stream(device=device)) -+ with get_tp_group().graph_capture(context), get_pp_group().graph_capture( -+ context): - yield context - - -@@ -1188,6 +1181,7 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False): - import ray # Lazy import Ray - ray.shutdown() - gc.collect() -+ from vllm.platforms import current_platform - if not current_platform.is_cpu(): - torch.cuda.empty_cache() - diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py -index 21966d003..3da26fcb5 100644 +index 89c9b6747..a5be57ce0 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py -@@ -18,7 +18,6 @@ from vllm.config import (CacheConfig, CompilationConfig, ConfigFormat, - from vllm.executor.executor_base import ExecutorBase - from vllm.logger import init_logger - from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS --from vllm.platforms import current_platform - from vllm.transformers_utils.utils import check_gguf_file - from vllm.usage.usage_lib import UsageContext - from vllm.utils import FlexibleArgumentParser, StoreBoolean -@@ -112,8 +111,8 @@ class EngineArgs: - pipeline_parallel_size: int = 1 - tensor_parallel_size: int = 1 - max_parallel_loading_workers: Optional[int] = None -- block_size: Optional[int] = None -- enable_prefix_caching: Optional[bool] = None -+ block_size: int = 8 -+ enable_prefix_caching: bool = False - disable_sliding_window: bool = False - use_v2_block_manager: bool = True - swap_space: float = 4 # GiB -@@ -198,6 +197,8 @@ class EngineArgs: - kv_transfer_config: Optional[KVTransferConfig] = None - - generation_config: Optional[str] = None +@@ -210,6 +210,8 @@ class EngineArgs: + enable_reasoning: Optional[bool] = None + reasoning_parser: Optional[str] = None + use_tqdm_on_load: bool = True + low_bit_model_path: Optional[str] = None + low_bit_save_path: Optional[str] = None def __post_init__(self): if not self.tokenizer: -@@ -956,6 +957,18 @@ class EngineArgs: - "loaded from model. If set to a folder path, the generation config " - "will be loaded from the specified folder path.") +@@ -991,6 +993,18 @@ class EngineArgs: + "using. This is used to parse the reasoning content into OpenAI " + "API format. 
Required for ``--enable-reasoning``.") + parser.add_argument( + "--low-bit-model-path", @@ -19710,340 +10006,44 @@ index 21966d003..3da26fcb5 100644 + type=nullable_str, + default=None, + help="Path for Low-bit saver") -+ - return parser - - @classmethod -@@ -1000,11 +1013,17 @@ class EngineArgs: - override_neuron_config=self.override_neuron_config, - override_pooler_config=self.override_pooler_config, - logits_processor_pattern=self.logits_processor_pattern, -- generation_config=self.generation_config) -+ generation_config=self.generation_config, ++ + parser.add_argument( + "--disable-cascade-attn", + action="store_true", +@@ -1061,6 +1075,8 @@ class EngineArgs: + override_generation_config=self.override_generation_config, + enable_sleep_mode=self.enable_sleep_mode, + model_impl=self.model_impl, + low_bit_model_path=self.low_bit_model_path, -+ low_bit_save_path=self.low_bit_save_path) ++ low_bit_save_path=self.low_bit_save_path, + ) def create_load_config(self) -> LoadConfig: -+ use_low_bit_loader = False -+ if self.low_bit_model_path is not None: -+ use_low_bit_loader = True - return LoadConfig( - load_format=self.load_format, -+ use_low_bit_loader=use_low_bit_loader, - download_dir=self.download_dir, - model_loader_extra_config=self.model_loader_extra_config, - ignore_patterns=self.ignore_patterns, -@@ -1020,6 +1039,9 @@ class EngineArgs: - if check_gguf_file(self.model): - self.quantization = self.load_format = "gguf" - -+ if self.low_bit_model_path is not None and self.low_bit_save_path is not None: -+ raise ValueError(f"Please do not set --low-bit-model-path and --low-bit-save-path together") -+ - # bitsandbytes quantization needs a specific model loader - # so we make sure the quant method and the load format are consistent - if (self.quantization == "bitsandbytes" or -@@ -1094,6 +1116,7 @@ class EngineArgs: - use_sliding_window = (model_config.get_sliding_window() - is not None) - use_spec_decode = self.speculative_model is not None -+ from vllm.platforms import current_platform - if (is_gpu and not use_sliding_window and not use_spec_decode - and not self.enable_lora - and not self.enable_prompt_adapter -diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py -index 39f59e55d..ba3f7cfa6 100644 ---- a/vllm/engine/llm_engine.py -+++ b/vllm/engine/llm_engine.py -@@ -476,13 +476,17 @@ class LLMEngine: - initialize_ray_cluster(engine_config.parallel_config) - from vllm.executor.ray_xpu_executor import RayXPUExecutor - executor_class = RayXPUExecutor -+ # elif distributed_executor_backend == "mp": -+ # # FIXME(kunshang): -+ # # spawn needs calling `if __name__ == '__main__':`` -+ # # fork is not supported for xpu start new process. -+ # logger.error( -+ # "Both start methods (spawn and fork) have issue " -+ # "on XPU if you use mp backend, Please try ray instead.") - elif distributed_executor_backend == "mp": -- # FIXME(kunshang): -- # spawn needs calling `if __name__ == '__main__':`` -- # fork is not supported for xpu start new process. 
-- logger.error( -- "Both start methods (spawn and fork) have issue " -- "on XPU if you use mp backend, Please try ray instead.") -+ from vllm.executor.multiproc_xpu_executor import ( -+ MultiprocessingXPUExecutorAsync) -+ executor_class = MultiprocessingXPUExecutorAsync - else: - from vllm.executor.xpu_executor import XPUExecutor - executor_class = XPUExecutor -@@ -1124,6 +1128,8 @@ class LLMEngine: +@@ -1504,12 +1520,13 @@ class EngineArgs: + _raise_or_fallback(feature_name=name, recommend_to_remove=True) + return False + +- # Platforms must decide if they can support v1 for this model +- if not current_platform.supports_v1(model_config=model_config): +- _raise_or_fallback( +- feature_name=f"device type={current_platform.device_type}", +- recommend_to_remove=False) +- return False ++ # # No support for device type other than CUDA, AMD (experiemntal) or ++ # # TPU (experimental) so far. ++ # if not (current_platform.is_cuda_alike() or current_platform.is_tpu()): ++ # _raise_or_fallback( ++ # feature_name=f"device type={current_platform.device_type}", ++ # recommend_to_remove=False) ++ # return False + ############################################################# + # Experimental Features - allow users to opt in. - seq_group = scheduled_seq_group.seq_group - seq_group.maybe_set_first_token_time(now) -+ if not seq_group.is_prefill(): -+ seq_group.set_last_token_time(now) - request_output = RequestOutputFactory.create( - seq_group, - self.seq_id_to_seq_group, -@@ -1166,6 +1172,8 @@ class LLMEngine: - - seq_group = scheduled_seq_group.seq_group - seq_group.maybe_set_first_token_time(now) -+ if not seq_group.is_prefill(): -+ seq_group.set_last_token_time(now) - request_output = RequestOutputFactory.create( - seq_group, - self.seq_id_to_seq_group, -@@ -1686,7 +1694,7 @@ class LLMEngine: - # If the seq_group just finished the prefill state - # get TTFT. - if not seq_group.is_prefill(): -- latency = seq_group.get_last_latency(now) -+ latency = seq_group.get_last_token_latency() - time_to_first_tokens_iter.append(latency) - - # One generation token per finished prefill. -@@ -1694,7 +1702,7 @@ class LLMEngine: - seq_group.num_seqs()) - else: - # TPOTs. 
-- latency = seq_group.get_last_latency(now) -+ latency = seq_group.get_last_token_latency() - time_per_output_tokens_iter.append(latency) - if seq_group.state.current_step == 0: - # For async_output_proc, the do_log_stats() diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py -index 3df08c740..88b99bdf4 100644 +index e48ffae9b..866a5052e 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py -@@ -6,7 +6,7 @@ from collections import defaultdict, deque - from functools import lru_cache, partial - from pathlib import Path - from typing import (Any, Awaitable, Callable, Dict, Generic, Iterable, List, -- Literal, Mapping, Optional, Tuple, TypeVar, Union, cast) -+ Literal, Optional, Tuple, TypeVar, Union, cast) - - import jinja2.nodes - import transformers.utils.chat_template_utils as hf_chat_utils -@@ -23,6 +23,8 @@ from openai.types.chat import ( - ChatCompletionMessageParam as OpenAIChatCompletionMessageParam) - from openai.types.chat import (ChatCompletionMessageToolCallParam, - ChatCompletionToolMessageParam) -+from openai.types.chat.chat_completion_content_part_input_audio_param import ( -+ InputAudio) - # yapf: enable - # pydantic needs the TypedDict from typing_extensions - from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast -@@ -31,11 +33,7 @@ from typing_extensions import Required, TypeAlias, TypedDict - from vllm.config import ModelConfig - from vllm.logger import init_logger - from vllm.multimodal import MultiModalDataDict --from vllm.multimodal.utils import (async_get_and_parse_audio, -- async_get_and_parse_image, -- async_get_and_parse_video, -- get_and_parse_audio, get_and_parse_image, -- get_and_parse_video) -+from vllm.multimodal.utils import MediaConnector - from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer - from vllm.utils import print_warning_once - -@@ -368,14 +366,17 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): - self._tokenizer = tokenizer - self._allowed_items = (model_config.multimodal_config.limit_per_prompt - if model_config.multimodal_config else {}) -- self._consumed_items = {k: 0 for k in self._allowed_items} - -- self._items: List[_T] = [] -+ self._items_by_modality = defaultdict[str, list[_T]](list) - - @property - def model_config(self) -> ModelConfig: - return self._model_config - -+ @property -+ def allowed_local_media_path(self): -+ return self._model_config.allowed_local_media_path -+ - @staticmethod - @lru_cache(maxsize=None) - def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str: -@@ -435,38 +436,19 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): - else: - raise TypeError(f"Unknown modality: {modality}") - -- @staticmethod -- def _combine(items: List[MultiModalDataDict]) -> MultiModalDataDict: -- mm_lists: Mapping[str, List[object]] = defaultdict(list) -- -- # Merge all the multi-modal items -- for single_mm_data in items: -- for mm_key, mm_item in single_mm_data.items(): -- if isinstance(mm_item, list): -- mm_lists[mm_key].extend(mm_item) -- else: -- mm_lists[mm_key].append(mm_item) -- -- # Unpack any single item lists for models that don't expect multiple. -- return { -- mm_key: mm_list[0] if len(mm_list) == 1 else mm_list -- for mm_key, mm_list in mm_lists.items() -- } -- - def add(self, modality: ModalityStr, item: _T) -> Optional[str]: - """ - Add a multi-modal item to the current prompt and returns the - placeholder string to use, if any. 
- """ - allowed_count = self._allowed_items.get(modality, 1) -- current_count = self._consumed_items.get(modality, 0) + 1 -+ current_count = len(self._items_by_modality[modality]) + 1 - if current_count > allowed_count: - raise ValueError( - f"At most {allowed_count} {modality}(s) may be provided in " - "one request.") - -- self._consumed_items[modality] = current_count -- self._items.append(item) -+ self._items_by_modality[modality].append(item) - - return self._placeholder_str(modality, current_count) - -@@ -475,22 +457,26 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): - raise NotImplementedError - - --class MultiModalItemTracker(BaseMultiModalItemTracker[MultiModalDataDict]): -+class MultiModalItemTracker(BaseMultiModalItemTracker[object]): - - def all_mm_data(self) -> Optional[MultiModalDataDict]: -- return self._combine(self._items) if self._items else None -+ if self._items_by_modality: -+ return dict(self._items_by_modality) -+ -+ return None - - def create_parser(self) -> "BaseMultiModalContentParser": - return MultiModalContentParser(self) - - --class AsyncMultiModalItemTracker( -- BaseMultiModalItemTracker[Awaitable[MultiModalDataDict]]): -+class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]): - - async def all_mm_data(self) -> Optional[MultiModalDataDict]: -- if self._items: -- items = await asyncio.gather(*self._items) -- return self._combine(items) -+ if self._items_by_modality: -+ return { -+ modality: await asyncio.gather(*items) -+ for modality, items in self._items_by_modality.items() -+ } - - return None - -@@ -522,7 +508,7 @@ class BaseMultiModalContentParser(ABC): - raise NotImplementedError - - @abstractmethod -- def parse_input_audio(self, input_audio: Dict[str, str]) -> None: -+ def parse_input_audio(self, input_audio: InputAudio) -> None: - raise NotImplementedError - - @abstractmethod -@@ -537,31 +523,31 @@ class MultiModalContentParser(BaseMultiModalContentParser): - - self._tracker = tracker - -+ self._connector = MediaConnector( -+ allowed_local_media_path=tracker.allowed_local_media_path, -+ ) -+ - def parse_image(self, image_url: str) -> None: -- image = get_and_parse_image(image_url, -- allowed_local_media_path=self._tracker. 
-- _model_config.allowed_local_media_path) -+ image = self._connector.fetch_image(image_url) - - placeholder = self._tracker.add("image", image) - self._add_placeholder(placeholder) - - def parse_audio(self, audio_url: str) -> None: -- audio = get_and_parse_audio(audio_url) -+ audio = self._connector.fetch_audio(audio_url) - - placeholder = self._tracker.add("audio", audio) - self._add_placeholder(placeholder) - -- def parse_input_audio(self, input_audio: Dict[str, str]) -> None: -- input_audio_data = input_audio.get("data","") -- input_audio_format = input_audio.get("format","") -- audio_url = f"data:audio/{input_audio_format};base64,{input_audio_data}" -- audio = get_and_parse_audio(audio_url) -+ def parse_input_audio(self, input_audio: InputAudio) -> None: -+ audio_data = input_audio.get("data", "") -+ audio_format = input_audio.get("format", "") -+ audio_url = f"data:audio/{audio_format};base64,{audio_data}" - -- placeholder = self._tracker.add("audio", audio) -- self._add_placeholder(placeholder) -+ return self.parse_audio(audio_url) - - def parse_video(self, video_url: str) -> None: -- video = get_and_parse_video(video_url) -+ video = self._connector.fetch_video(video_url) - - placeholder = self._tracker.add("video", video) - self._add_placeholder(placeholder) -@@ -573,33 +559,31 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser): - super().__init__() - - self._tracker = tracker -+ self._connector = MediaConnector( -+ allowed_local_media_path=tracker.allowed_local_media_path, -+ ) - - def parse_image(self, image_url: str) -> None: -- image_coro = async_get_and_parse_image( -- image_url, -- allowed_local_media_path=self._tracker._model_config. -- allowed_local_media_path) -+ image_coro = self._connector.fetch_image_async(image_url) - - placeholder = self._tracker.add("image", image_coro) - self._add_placeholder(placeholder) - - def parse_audio(self, audio_url: str) -> None: -- audio_coro = async_get_and_parse_audio(audio_url) -+ audio_coro = self._connector.fetch_audio_async(audio_url) - - placeholder = self._tracker.add("audio", audio_coro) - self._add_placeholder(placeholder) - -- def parse_input_audio(self, input_audio: Dict[str, str]) -> None: -- input_audio_data = input_audio.get("data","") -- input_audio_format = input_audio.get("format","") -- audio_url = f"data:audio/{input_audio_format};base64,{input_audio_data}" -- audio_coro = async_get_and_parse_audio(audio_url) -+ def parse_input_audio(self, input_audio: InputAudio) -> None: -+ audio_data = input_audio.get("data", "") -+ audio_format = input_audio.get("format", "") -+ audio_url = f"data:audio/{audio_format};base64,{audio_data}" - -- placeholder = self._tracker.add("audio", audio_coro) -- self._add_placeholder(placeholder) -+ return self.parse_audio(audio_url) - - def parse_video(self, video_url: str) -> None: -- video = async_get_and_parse_video(video_url) -+ video = self._connector.fetch_video_async(video_url) - - placeholder = self._tracker.add("video", video) - self._add_placeholder(placeholder) -@@ -684,7 +668,8 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], +@@ -854,7 +854,8 @@ def _get_full_multimodal_text_prompt(placeholder_counts: dict[str, int], # NOTE: For now we always add missing placeholders at the front of # the prompt. This may change to be customizable in the future. 
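
The chat_utils.py hunks above swap the old `_combine`-based bookkeeping for a per-modality store and route all media fetching through `MediaConnector`. Below is a minimal standalone sketch of that pattern, for readers who don't want to reverse-engineer it from the diff; the class and helper names (`ModalityTracker`, `input_audio_to_url`) are illustrative only, not vLLM's actual API, and the limit values are assumptions.

```python
from collections import defaultdict
from typing import Any, Optional

# Minimal sketch of the per-modality tracking pattern used in the hunks above.
# Names here are illustrative, not vLLM's actual API.
class ModalityTracker:
    def __init__(self, limit_per_prompt: dict[str, int]):
        self._allowed = limit_per_prompt                 # e.g. {"image": 2}
        self._items_by_modality = defaultdict(list)      # modality -> raw items

    def add(self, modality: str, item: Any) -> str:
        allowed = self._allowed.get(modality, 1)
        count = len(self._items_by_modality[modality]) + 1
        if count > allowed:
            raise ValueError(f"At most {allowed} {modality}(s) may be "
                             "provided in one request.")
        self._items_by_modality[modality].append(item)
        return f"<{modality}_{count}>"                   # placeholder for the prompt text

    def all_mm_data(self) -> Optional[dict[str, list[Any]]]:
        # Items stay grouped by modality instead of being merged afterwards.
        return dict(self._items_by_modality) or None


def input_audio_to_url(data_b64: str, fmt: str) -> str:
    # The refactored parse_input_audio rewrites the part as a data URL and
    # reuses the regular audio path.
    return f"data:audio/{fmt};base64,{data_b64}"


tracker = ModalityTracker({"image": 2})
tracker.add("image", b"first-image-bytes")
tracker.add("image", b"second-image-bytes")
print(tracker.all_mm_data())                 # {'image': [b'first-...', b'second-...']}
print(input_audio_to_url("UklGRg==", "wav"))  # data:audio/wav;base64,UklGRg==
```
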
@@ -20053,824 +10053,33 @@ index 3df08c740..88b99bdf4 100644 # No need to validate using Pydantic again -@@ -695,10 +680,13 @@ _InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam) - _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam) - _VideoParser = partial(cast, ChatCompletionContentPartVideoParam) - -+_ContentPart: TypeAlias = Union[str, Dict[str, str], InputAudio] -+ - # Define a mapping from part types to their corresponding parsing functions. --MM_PARSER_MAP: Dict[str, -- Callable[[ChatCompletionContentPartParam], -- Union[str, Dict[str,str]]]] = { -+MM_PARSER_MAP: Dict[ -+ str, -+ Callable[[ChatCompletionContentPartParam], _ContentPart], -+] = { - "text": - lambda part: _TextParser(part).get("text", ""), - "image_url": -@@ -715,8 +703,7 @@ MM_PARSER_MAP: Dict[str, - - - def _parse_chat_message_content_mm_part( -- part: ChatCompletionContentPartParam) -> Tuple[str, -- Union[str, Dict[str, str]]]: -+ part: ChatCompletionContentPartParam) -> tuple[str, _ContentPart]: - """ - Parses a given multi-modal content part based on its type. - -@@ -783,7 +770,7 @@ def _parse_chat_message_content_parts( - *, - wrap_dicts: bool, - ) -> List[ConversationMessage]: -- content: List[Union[str, Dict[str, str]]] = [] -+ content = list[_ContentPart]() - - mm_parser = mm_tracker.create_parser() - -@@ -814,7 +801,7 @@ def _parse_chat_message_content_part( - mm_parser: BaseMultiModalContentParser, - *, - wrap_dicts: bool, --) -> Optional[Union[str, Dict[str, str]]]: -+) -> Optional[_ContentPart]: - """Parses a single part of a conversation. If wrap_dicts is True, - structured dictionary pieces for texts and images will be - wrapped in dictionaries, i.e., {"type": "text", "text", ...} and -@@ -823,8 +810,7 @@ def _parse_chat_message_content_part( - with multimodal placeholders. 
- """ - if isinstance(part, str): # Handle plain text parts -- text = _TextParser(part) -- return text -+ return part - - # Handle structured dictionary parts - part_type, content = _parse_chat_message_content_mm_part(part) -@@ -855,7 +841,7 @@ def _parse_chat_message_content_part( - return {'type': 'audio'} if wrap_dicts else None - - if part_type == "input_audio": -- dict_content = cast(Dict[str, str], content) -+ dict_content = cast(InputAudio, content) - mm_parser.parse_input_audio(dict_content) - return {'type': 'audio'} if wrap_dicts else None - -@@ -979,11 +965,11 @@ def apply_hf_chat_template( - tokenize: bool = False, # Different from HF's default - **kwargs: Any, - ) -> str: -- if chat_template is None and tokenizer.chat_template is None: -- raise ValueError( -- "As of transformers v4.44, default chat template is no longer " -- "allowed, so you must provide a chat template if the tokenizer " -- "does not define one.") -+ # if chat_template is None and tokenizer.chat_template is None: -+ # raise ValueError( -+ # "As of transformers v4.44, default chat template is no longer " -+ # "allowed, so you must provide a chat template if the tokenizer " -+ # "does not define one.") - - return tokenizer.apply_chat_template( - conversation=conversation, # type: ignore[arg-type] -diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py -index fadf297e9..e4e0803c6 100644 ---- a/vllm/entrypoints/llm.py -+++ b/vllm/entrypoints/llm.py -@@ -224,19 +224,13 @@ class LLM: - ) - # Logic to switch between engines is done at runtime instead of import - # to avoid import order issues -+ # This function will decide which engine to use, V0 or V1 - self.engine_class = self.get_engine_class() -- -- # TODO(rob): enable mp by default (issue with fork vs spawn) - self.llm_engine = self.engine_class.from_engine_args( - engine_args, usage_context=UsageContext.LLM_CLASS) - - self.request_counter = Counter() - -- def __del__(self): -- if hasattr(self, 'llm_engine') and self.llm_engine and hasattr( -- self.llm_engine, "shutdown"): -- self.llm_engine.shutdown() -- - @staticmethod - def get_engine_class() -> Type[LLMEngine]: - if envs.VLLM_USE_V1: diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py -index 2e45b4742..b468085b5 100644 +index 6a8bdd060..13c076df1 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py -@@ -3,6 +3,8 @@ import atexit - import importlib - import inspect - import multiprocessing -+# Fix https://avd.aquasec.com/nvd/cve-2022-42919 -+multiprocessing.util.abstract_sockets_supported = False - import os - import re - import signal -@@ -16,7 +18,7 @@ from http import HTTPStatus - from typing import AsyncIterator, Optional, Set, Tuple - - import uvloop --from fastapi import APIRouter, FastAPI, Request -+from fastapi import APIRouter, FastAPI, HTTPException, Request - from fastapi.exceptions import RequestValidationError - from fastapi.middleware.cors import CORSMiddleware - from fastapi.responses import JSONResponse, Response, StreamingResponse -@@ -44,11 +46,15 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, - CompletionResponse, - DetokenizeRequest, - DetokenizeResponse, -+ EmbeddingChatRequest, -+ EmbeddingCompletionRequest, - EmbeddingRequest, - EmbeddingResponse, - EmbeddingResponseData, - ErrorResponse, - LoadLoraAdapterRequest, -+ PoolingChatRequest, -+ PoolingCompletionRequest, - PoolingRequest, PoolingResponse, - ScoreRequest, ScoreResponse, - TokenizeRequest, -@@ -58,7 +64,9 @@ from 
vllm.entrypoints.openai.protocol import (ChatCompletionRequest, - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding --from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing -+from vllm.entrypoints.openai.serving_engine import OpenAIServing -+from vllm.entrypoints.openai.serving_models import (BaseModelPath, -+ OpenAIServingModels) - from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling - from vllm.entrypoints.openai.serving_score import OpenAIServingScores - from vllm.entrypoints.openai.serving_tokenization import ( -@@ -133,32 +141,21 @@ async def build_async_engine_client_from_engine_args( - Returns the Client or None if the creation failed. - """ - -- # Fall back -- # TODO: fill out feature matrix. -+ # AsyncLLMEngine. - if (MQLLMEngineClient.is_unsupported_config(engine_args) - or envs.VLLM_USE_V1 or disable_frontend_multiprocessing): -- engine_config = engine_args.create_engine_config( -- UsageContext.OPENAI_API_SERVER) -- uses_ray = getattr(AsyncLLMEngine._get_executor_cls(engine_config), -- "uses_ray", False) -- -- build_engine = partial(AsyncLLMEngine.from_engine_args, -- engine_args=engine_args, -- engine_config=engine_config, -- usage_context=UsageContext.OPENAI_API_SERVER) -- if uses_ray: -- # Must run in main thread with ray for its signal handlers to work -- engine_client = build_engine() -- else: -- engine_client = await asyncio.get_running_loop().run_in_executor( -- None, build_engine) - -- yield engine_client -- if hasattr(engine_client, "shutdown"): -- engine_client.shutdown() -- return -+ engine_client: Optional[EngineClient] = None -+ try: -+ engine_client = AsyncLLMEngine.from_engine_args( -+ engine_args=engine_args, -+ usage_context=UsageContext.OPENAI_API_SERVER) -+ yield engine_client -+ finally: -+ if engine_client and hasattr(engine_client, "shutdown"): -+ engine_client.shutdown() - -- # Otherwise, use the multiprocessing AsyncLLMEngine. -+ # MQLLMEngine. - else: - if "PROMETHEUS_MULTIPROC_DIR" not in os.environ: - # Make TemporaryDirectory for prometheus multiprocessing -@@ -280,6 +277,10 @@ def base(request: Request) -> OpenAIServing: - return tokenization(request) - - -+def models(request: Request) -> OpenAIServingModels: -+ return request.app.state.openai_serving_models -+ -+ - def chat(request: Request) -> Optional[OpenAIServingChat]: - return request.app.state.openai_serving_chat - -@@ -315,6 +316,12 @@ async def health(raw_request: Request) -> Response: - return Response(status_code=200) - - -+@router.api_route("/ping", methods=["GET", "POST"]) -+async def ping(raw_request: Request) -> Response: -+ """Ping check. 
Endpoint required for SageMaker""" -+ return await health(raw_request) -+ -+ - @router.post("/tokenize") - @with_cancellation - async def tokenize(request: TokenizeRequest, raw_request: Request): -@@ -347,10 +354,10 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request): - - @router.get("/v1/models") - async def show_available_models(raw_request: Request): -- handler = base(raw_request) -+ handler = models(raw_request) - -- models = await handler.show_available_models() -- return JSONResponse(content=models.model_dump()) -+ models_ = await handler.show_available_models() -+ return JSONResponse(content=models_.model_dump()) - - - @router.get("/version") -@@ -488,6 +495,54 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request): - return await create_score(request, raw_request) - - -+TASK_HANDLERS = { -+ "generate": { -+ "messages": (ChatCompletionRequest, create_chat_completion), -+ "default": (CompletionRequest, create_completion), -+ }, -+ "embed": { -+ "messages": (EmbeddingChatRequest, create_embedding), -+ "default": (EmbeddingCompletionRequest, create_embedding), -+ }, -+ "score": { -+ "default": (ScoreRequest, create_score), -+ }, -+ "reward": { -+ "messages": (PoolingChatRequest, create_pooling), -+ "default": (PoolingCompletionRequest, create_pooling), -+ }, -+ "classify": { -+ "messages": (PoolingChatRequest, create_pooling), -+ "default": (PoolingCompletionRequest, create_pooling), -+ }, -+} -+ -+ -+@router.post("/invocations") -+async def invocations(raw_request: Request): -+ """ -+ For SageMaker, routes requests to other handlers based on model `task`. -+ """ -+ body = await raw_request.json() -+ task = raw_request.app.state.task -+ -+ if task not in TASK_HANDLERS: -+ raise HTTPException( -+ status_code=400, -+ detail=f"Unsupported task: '{task}' for '/invocations'. " -+ f"Expected one of {set(TASK_HANDLERS.keys())}") -+ -+ handler_config = TASK_HANDLERS[task] -+ if "messages" in body: -+ request_model, handler = handler_config["messages"] -+ else: -+ request_model, handler = handler_config["default"] -+ -+ # this is required since we lose the FastAPI automatic casting -+ request = request_model.model_validate(body) -+ return await handler(request, raw_request) -+ -+ - if envs.VLLM_TORCH_PROFILER_DIR: - logger.warning( - "Torch Profiler is enabled in the API server. 
This should ONLY be " -@@ -516,26 +571,22 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING: - @router.post("/v1/load_lora_adapter") - async def load_lora_adapter(request: LoadLoraAdapterRequest, - raw_request: Request): -- for route in [chat, completion, embedding]: -- handler = route(raw_request) -- if handler is not None: -- response = await handler.load_lora_adapter(request) -- if isinstance(response, ErrorResponse): -- return JSONResponse(content=response.model_dump(), -- status_code=response.code) -+ handler = models(raw_request) -+ response = await handler.load_lora_adapter(request) -+ if isinstance(response, ErrorResponse): -+ return JSONResponse(content=response.model_dump(), -+ status_code=response.code) - - return Response(status_code=200, content=response) - - @router.post("/v1/unload_lora_adapter") - async def unload_lora_adapter(request: UnloadLoraAdapterRequest, - raw_request: Request): -- for route in [chat, completion, embedding]: -- handler = route(raw_request) -- if handler is not None: -- response = await handler.unload_lora_adapter(request) -- if isinstance(response, ErrorResponse): -- return JSONResponse(content=response.model_dump(), -- status_code=response.code) -+ handler = models(raw_request) -+ response = await handler.unload_lora_adapter(request) -+ if isinstance(response, ErrorResponse): -+ return JSONResponse(content=response.model_dump(), -+ status_code=response.code) - - return Response(status_code=200, content=response) - -@@ -639,13 +690,18 @@ def init_app_state( - resolved_chat_template = load_chat_template(args.chat_template) - logger.info("Using supplied chat template:\n%s", resolved_chat_template) - -+ state.openai_serving_models = OpenAIServingModels( -+ model_config=model_config, -+ base_model_paths=base_model_paths, -+ lora_modules=args.lora_modules, -+ prompt_adapters=args.prompt_adapters, -+ ) -+ # TODO: The chat template is now broken for lora adapters :( - state.openai_serving_chat = OpenAIServingChat( - engine_client, - model_config, -- base_model_paths, -+ state.openai_serving_models, - args.response_role, -- lora_modules=args.lora_modules, -- prompt_adapters=args.prompt_adapters, - request_logger=request_logger, - chat_template=resolved_chat_template, - chat_template_content_format=args.chat_template_content_format, -@@ -657,16 +713,14 @@ def init_app_state( - state.openai_serving_completion = OpenAIServingCompletion( - engine_client, - model_config, -- base_model_paths, -- lora_modules=args.lora_modules, -- prompt_adapters=args.prompt_adapters, -+ state.openai_serving_models, - request_logger=request_logger, - return_tokens_as_token_ids=args.return_tokens_as_token_ids, - ) if model_config.runner_type == "generate" else None - state.openai_serving_pooling = OpenAIServingPooling( - engine_client, - model_config, -- base_model_paths, -+ state.openai_serving_models, - request_logger=request_logger, - chat_template=resolved_chat_template, - chat_template_content_format=args.chat_template_content_format, -@@ -674,7 +728,7 @@ def init_app_state( - state.openai_serving_embedding = OpenAIServingEmbedding( - engine_client, - model_config, -- base_model_paths, -+ state.openai_serving_models, - request_logger=request_logger, - chat_template=resolved_chat_template, - chat_template_content_format=args.chat_template_content_format, -@@ -682,18 +736,18 @@ def init_app_state( - state.openai_serving_scores = OpenAIServingScores( - engine_client, - model_config, -- base_model_paths, -+ state.openai_serving_models, - request_logger=request_logger - ) if 
model_config.task == "score" else None - state.openai_serving_tokenization = OpenAIServingTokenization( - engine_client, - model_config, -- base_model_paths, -- lora_modules=args.lora_modules, -+ state.openai_serving_models, - request_logger=request_logger, - chat_template=resolved_chat_template, - chat_template_content_format=args.chat_template_content_format, - ) -+ state.task = model_config.task - - - def create_server_socket(addr: Tuple[str, int]) -> socket.socket: -@@ -715,11 +769,11 @@ async def run_server(args, **uvicorn_kwargs) -> None: - if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3: - ToolParserManager.import_tool_parser(args.tool_parser_plugin) - -- valide_tool_parses = ToolParserManager.tool_parsers.keys() -+ valid_tool_parses = ToolParserManager.tool_parsers.keys() - if args.enable_auto_tool_choice \ -- and args.tool_call_parser not in valide_tool_parses: -+ and args.tool_call_parser not in valid_tool_parses: - raise KeyError(f"invalid tool call parser: {args.tool_call_parser} " -- f"(chose from {{ {','.join(valide_tool_parses)} }})") -+ f"(chose from {{ {','.join(valid_tool_parses)} }})") - - # workaround to make sure that we bind the port before the engine is set up. - # This avoids race conditions with ray. -@@ -765,6 +819,8 @@ async def run_server(args, **uvicorn_kwargs) -> None: - if __name__ == "__main__": +@@ -1111,6 +1111,8 @@ if __name__ == "__main__": # NOTE(simon): - # This section should be in sync with vllm/scripts.py for CLI entrypoints. + # This section should be in sync with vllm/entrypoints/cli/main.py for CLI + # entrypoints. + logger.warning("Warning: Please use `ipex_llm.vllm.xpu.entrypoints.openai.api_server` " + "instead of `vllm.entrypoints.openai.api_server` to start the API server") + cli_env_setup() parser = FlexibleArgumentParser( description="vLLM OpenAI-Compatible RESTful API server.") - parser = make_arg_parser(parser) -diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py -index 908f8c353..22206ef8d 100644 ---- a/vllm/entrypoints/openai/cli_args.py -+++ b/vllm/entrypoints/openai/cli_args.py -@@ -12,7 +12,7 @@ from typing import List, Optional, Sequence, Union, get_args - from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str - from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, - validate_chat_template) --from vllm.entrypoints.openai.serving_engine import (LoRAModulePath, -+from vllm.entrypoints.openai.serving_models import (LoRAModulePath, - PromptAdapterPath) - from vllm.entrypoints.openai.tool_parsers import ToolParserManager - from vllm.utils import FlexibleArgumentParser -diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py -index 572ed27b3..822c0f5f7 100644 ---- a/vllm/entrypoints/openai/run_batch.py -+++ b/vllm/entrypoints/openai/run_batch.py -@@ -20,7 +20,8 @@ from vllm.entrypoints.openai.protocol import (BatchRequestInput, +diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py +index bbc8eddd8..852aa3ff7 100644 +--- a/vllm/entrypoints/openai/serving_engine.py ++++ b/vllm/entrypoints/openai/serving_engine.py +@@ -35,7 +35,7 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + from vllm.entrypoints.openai.serving_models import OpenAIServingModels + from vllm.entrypoints.openai.tool_parsers import ToolParser # yapf: enable - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_embedding import 
OpenAIServingEmbedding --from vllm.entrypoints.openai.serving_engine import BaseModelPath -+from vllm.entrypoints.openai.serving_models import (BaseModelPath, -+ OpenAIServingModels) - from vllm.usage.usage_lib import UsageContext - from vllm.utils import FlexibleArgumentParser, random_uuid - from vllm.version import __version__ as VLLM_VERSION -@@ -213,13 +214,17 @@ async def main(args): - request_logger = RequestLogger(max_log_len=args.max_log_len) - - # Create the openai serving objects. -+ openai_serving_models = OpenAIServingModels( -+ model_config=model_config, -+ base_model_paths=base_model_paths, -+ lora_modules=None, -+ prompt_adapters=None, -+ ) - openai_serving_chat = OpenAIServingChat( - engine, - model_config, -- base_model_paths, -+ openai_serving_models, - args.response_role, -- lora_modules=None, -- prompt_adapters=None, - request_logger=request_logger, - chat_template=None, - chat_template_content_format="auto", -@@ -228,7 +233,7 @@ async def main(args): - openai_serving_embedding = OpenAIServingEmbedding( - engine, - model_config, -- base_model_paths, -+ openai_serving_models, - request_logger=request_logger, - chat_template=None, - chat_template_content_format="auto", -diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py -index d08533356..89a119ac6 100644 ---- a/vllm/entrypoints/openai/serving_chat.py -+++ b/vllm/entrypoints/openai/serving_chat.py -@@ -21,10 +21,8 @@ from vllm.entrypoints.openai.protocol import ( - ChatCompletionStreamResponse, ChatMessage, DeltaFunctionCall, DeltaMessage, - DeltaToolCall, ErrorResponse, FunctionCall, PromptTokenUsageInfo, - RequestResponseMetadata, ToolCall, UsageInfo) --from vllm.entrypoints.openai.serving_engine import (BaseModelPath, -- LoRAModulePath, -- OpenAIServing, -- PromptAdapterPath) -+from vllm.entrypoints.openai.serving_engine import OpenAIServing -+from vllm.entrypoints.openai.serving_models import OpenAIServingModels - from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager - from vllm.logger import init_logger - from vllm.outputs import CompletionOutput, RequestOutput -@@ -42,11 +40,9 @@ class OpenAIServingChat(OpenAIServing): - self, - engine_client: EngineClient, - model_config: ModelConfig, -- base_model_paths: List[BaseModelPath], -+ models: OpenAIServingModels, - response_role: str, - *, -- lora_modules: Optional[List[LoRAModulePath]], -- prompt_adapters: Optional[List[PromptAdapterPath]], - request_logger: Optional[RequestLogger], - chat_template: Optional[str], - chat_template_content_format: ChatTemplateContentFormatOption, -@@ -57,9 +53,7 @@ class OpenAIServingChat(OpenAIServing): - ) -> None: - super().__init__(engine_client=engine_client, - model_config=model_config, -- base_model_paths=base_model_paths, -- lora_modules=lora_modules, -- prompt_adapters=prompt_adapters, -+ models=models, - request_logger=request_logger, - return_tokens_as_token_ids=return_tokens_as_token_ids) - -@@ -126,7 +120,7 @@ class OpenAIServingChat(OpenAIServing): - prompt_adapter_request, - ) = self._maybe_get_adapters(request) - -- model_name = self._get_model_name(lora_request) -+ model_name = self.models.model_name(lora_request) - - tokenizer = await self.engine_client.get_tokenizer(lora_request) - -@@ -307,7 +301,7 @@ class OpenAIServingChat(OpenAIServing): - ] * num_choices - else: - tool_parsers = [None] * num_choices -- except RuntimeError as e: -+ except Exception as e: - logger.exception("Error in tool parser creation.") - data = 
self.create_streaming_error_response(str(e)) - yield f"data: {data}\n\n" -@@ -597,7 +591,7 @@ class OpenAIServingChat(OpenAIServing): - completion_tokens=num_completion_tokens, - total_tokens=num_prompt_tokens + num_completion_tokens) - -- except ValueError as e: -+ except Exception as e: - # TODO: Use a vllm-specific Validation Error - logger.exception("Error in chat completion stream generator.") - data = self.create_streaming_error_response(str(e)) -diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py -index aaad7b8c7..2c9c20caf 100644 ---- a/vllm/entrypoints/openai/serving_completion.py -+++ b/vllm/entrypoints/openai/serving_completion.py -@@ -21,10 +21,8 @@ from vllm.entrypoints.openai.protocol import (CompletionLogProbs, - RequestResponseMetadata, - UsageInfo) - # yapf: enable --from vllm.entrypoints.openai.serving_engine import (BaseModelPath, -- LoRAModulePath, -- OpenAIServing, -- PromptAdapterPath) -+from vllm.entrypoints.openai.serving_engine import OpenAIServing -+from vllm.entrypoints.openai.serving_models import OpenAIServingModels - from vllm.logger import init_logger - from vllm.outputs import RequestOutput - from vllm.sampling_params import BeamSearchParams, SamplingParams -@@ -41,18 +39,14 @@ class OpenAIServingCompletion(OpenAIServing): - self, - engine_client: EngineClient, - model_config: ModelConfig, -- base_model_paths: List[BaseModelPath], -+ models: OpenAIServingModels, - *, -- lora_modules: Optional[List[LoRAModulePath]], -- prompt_adapters: Optional[List[PromptAdapterPath]], - request_logger: Optional[RequestLogger], - return_tokens_as_token_ids: bool = False, - ): - super().__init__(engine_client=engine_client, - model_config=model_config, -- base_model_paths=base_model_paths, -- lora_modules=lora_modules, -- prompt_adapters=prompt_adapters, -+ models=models, - request_logger=request_logger, - return_tokens_as_token_ids=return_tokens_as_token_ids) - diff_sampling_param = self.model_config.get_diff_sampling_param() -@@ -170,7 +164,7 @@ class OpenAIServingCompletion(OpenAIServing): - - result_generator = merge_async_iterators(*generators) - -- model_name = self._get_model_name(lora_request) -+ model_name = self.models.model_name(lora_request) - num_prompts = len(engine_prompts) - - # Similar to the OpenAI API, when n != best_of, we do not stream the -@@ -377,7 +371,7 @@ class OpenAIServingCompletion(OpenAIServing): - # report to FastAPI middleware aggregate usage across all choices - request_metadata.final_usage_info = final_usage_info - -- except ValueError as e: -+ except Exception as e: - # TODO: Use a vllm-specific Validation Error - data = self.create_streaming_error_response(str(e)) - yield f"data: {data}\n\n" -diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py -index b8fb9d6bd..e7116a3d9 100644 ---- a/vllm/entrypoints/openai/serving_embedding.py -+++ b/vllm/entrypoints/openai/serving_embedding.py -@@ -16,7 +16,8 @@ from vllm.entrypoints.openai.protocol import (EmbeddingChatRequest, - EmbeddingResponse, - EmbeddingResponseData, - ErrorResponse, UsageInfo) --from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing -+from vllm.entrypoints.openai.serving_engine import OpenAIServing -+from vllm.entrypoints.openai.serving_models import OpenAIServingModels - from vllm.logger import init_logger - from vllm.outputs import (EmbeddingOutput, EmbeddingRequestOutput, - PoolingRequestOutput) -@@ -46,7 +47,7 @@ class 
OpenAIServingEmbedding(OpenAIServing): - self, - engine_client: EngineClient, - model_config: ModelConfig, -- base_model_paths: List[BaseModelPath], -+ models: OpenAIServingModels, - *, - request_logger: Optional[RequestLogger], - chat_template: Optional[str], -@@ -54,9 +55,7 @@ class OpenAIServingEmbedding(OpenAIServing): - ) -> None: - super().__init__(engine_client=engine_client, - model_config=model_config, -- base_model_paths=base_model_paths, -- lora_modules=None, -- prompt_adapters=None, -+ models=models, - request_logger=request_logger) - - self.chat_template = chat_template -diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py -index 5b6a089e4..911e1c85f 100644 ---- a/vllm/entrypoints/openai/serving_engine.py -+++ b/vllm/entrypoints/openai/serving_engine.py -@@ -1,7 +1,5 @@ - import json --import pathlib - from concurrent.futures.thread import ThreadPoolExecutor --from dataclasses import dataclass - from http import HTTPStatus - from typing import (Any, Callable, Dict, Iterable, Iterator, List, Mapping, - Optional, Sequence, Tuple, TypedDict, Union) -@@ -28,16 +26,13 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, - DetokenizeRequest, - EmbeddingChatRequest, - EmbeddingCompletionRequest, -- ErrorResponse, -- LoadLoraAdapterRequest, -- ModelCard, ModelList, -- ModelPermission, ScoreRequest, -+ ErrorResponse, ScoreRequest, - TokenizeChatRequest, -- TokenizeCompletionRequest, -- UnloadLoraAdapterRequest) -+ TokenizeCompletionRequest) -+from vllm.entrypoints.openai.serving_models import OpenAIServingModels - from vllm.entrypoints.openai.tool_parsers import ToolParser - # yapf: enable --from vllm.inputs import TokensPrompt -+from vllm.inputs import TokensPrompt, TokenInputs - from vllm.inputs.parse import parse_and_batch_prompt +-from vllm.inputs import TokensPrompt ++from vllm.inputs import TokensPrompt, TokenInputs + from vllm.inputs.parse import parse_and_batch_prompt from vllm.logger import init_logger from vllm.lora.request import LoRARequest -@@ -48,30 +43,10 @@ from vllm.sequence import Logprob - from vllm.tracing import (contains_trace_headers, extract_trace_headers, - log_tracing_disabled_warning) - from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer --from vllm.utils import AtomicCounter, is_list_of, make_async, random_uuid -+from vllm.utils import is_list_of, make_async, random_uuid - - logger = init_logger(__name__) - -- --@dataclass --class BaseModelPath: -- name: str -- model_path: str -- -- --@dataclass --class PromptAdapterPath: -- name: str -- local_path: str -- -- --@dataclass --class LoRAModulePath: -- name: str -- path: str -- base_model_name: Optional[str] = None -- -- - CompletionLikeRequest = Union[CompletionRequest, DetokenizeRequest, - EmbeddingCompletionRequest, ScoreRequest, - TokenizeCompletionRequest] -@@ -96,10 +71,8 @@ class OpenAIServing: - self, - engine_client: EngineClient, - model_config: ModelConfig, -- base_model_paths: List[BaseModelPath], -+ models: OpenAIServingModels, - *, -- lora_modules: Optional[List[LoRAModulePath]], -- prompt_adapters: Optional[List[PromptAdapterPath]], - request_logger: Optional[RequestLogger], - return_tokens_as_token_ids: bool = False, - ): -@@ -109,35 +82,7 @@ class OpenAIServing: - self.model_config = model_config - self.max_model_len = model_config.max_model_len - -- self.base_model_paths = base_model_paths -- -- self.lora_id_counter = AtomicCounter(0) -- self.lora_requests = [] -- if lora_modules is not None: -- 
self.lora_requests = [ -- LoRARequest(lora_name=lora.name, -- lora_int_id=i, -- lora_path=lora.path, -- base_model_name=lora.base_model_name -- if lora.base_model_name -- and self._is_model_supported(lora.base_model_name) -- else self.base_model_paths[0].name) -- for i, lora in enumerate(lora_modules, start=1) -- ] -- -- self.prompt_adapter_requests = [] -- if prompt_adapters is not None: -- for i, prompt_adapter in enumerate(prompt_adapters, start=1): -- with pathlib.Path(prompt_adapter.local_path, -- "adapter_config.json").open() as f: -- adapter_config = json.load(f) -- num_virtual_tokens = adapter_config["num_virtual_tokens"] -- self.prompt_adapter_requests.append( -- PromptAdapterRequest( -- prompt_adapter_name=prompt_adapter.name, -- prompt_adapter_id=i, -- prompt_adapter_local_path=prompt_adapter.local_path, -- prompt_adapter_num_virtual_tokens=num_virtual_tokens)) -+ self.models = models - - self.request_logger = request_logger - self.return_tokens_as_token_ids = return_tokens_as_token_ids -@@ -150,33 +95,6 @@ class OpenAIServing: - self._tokenize_prompt_input_or_inputs, - executor=self._tokenizer_executor) - -- async def show_available_models(self) -> ModelList: -- """Show available models. Right now we only have one model.""" -- model_cards = [ -- ModelCard(id=base_model.name, -- max_model_len=self.max_model_len, -- root=base_model.model_path, -- permission=[ModelPermission()]) -- for base_model in self.base_model_paths -- ] -- lora_cards = [ -- ModelCard(id=lora.lora_name, -- root=lora.local_path, -- parent=lora.base_model_name if lora.base_model_name else -- self.base_model_paths[0].name, -- permission=[ModelPermission()]) -- for lora in self.lora_requests -- ] -- prompt_adapter_cards = [ -- ModelCard(id=prompt_adapter.prompt_adapter_name, -- root=self.base_model_paths[0].name, -- permission=[ModelPermission()]) -- for prompt_adapter in self.prompt_adapter_requests -- ] -- model_cards.extend(lora_cards) -- model_cards.extend(prompt_adapter_cards) -- return ModelList(data=model_cards) -- - def create_error_response( - self, - message: str, -@@ -205,11 +123,13 @@ class OpenAIServing: - ) -> Optional[ErrorResponse]: - if self._is_model_supported(request.model): - return None -- if request.model in [lora.lora_name for lora in self.lora_requests]: -+ if request.model in [ -+ lora.lora_name for lora in self.models.lora_requests -+ ]: - return None - if request.model in [ - prompt_adapter.prompt_adapter_name -- for prompt_adapter in self.prompt_adapter_requests -+ for prompt_adapter in self.models.prompt_adapter_requests - ]: - return None - return self.create_error_response( -@@ -223,10 +143,10 @@ class OpenAIServing: - None, PromptAdapterRequest]]: - if self._is_model_supported(request.model): - return None, None -- for lora in self.lora_requests: -+ for lora in self.models.lora_requests: - if request.model == lora.lora_name: - return lora, None -- for prompt_adapter in self.prompt_adapter_requests: -+ for prompt_adapter in self.models.prompt_adapter_requests: - if request.model == prompt_adapter.prompt_adapter_name: - return None, prompt_adapter - # if _check_model has been called earlier, this will be unreachable -@@ -514,8 +434,9 @@ class OpenAIServing: +@@ -451,8 +451,9 @@ class OpenAIServing: prompt=tokenizer.decode(request_prompt), prompt_token_ids=request_prompt) @@ -20881,757 +10090,61 @@ index 5b6a089e4..911e1c85f 100644 + prompt=prompt_inputs["prompt"]) if mm_data is not None: engine_prompt["multi_modal_data"] = mm_data - -@@ -588,91 +509,5 @@ class OpenAIServing: - return 
logprob.decoded_token - return tokenizer.decode(token_id) - -- async def _check_load_lora_adapter_request( -- self, request: LoadLoraAdapterRequest) -> Optional[ErrorResponse]: -- # Check if both 'lora_name' and 'lora_path' are provided -- if not request.lora_name or not request.lora_path: -- return self.create_error_response( -- message="Both 'lora_name' and 'lora_path' must be provided.", -- err_type="InvalidUserInput", -- status_code=HTTPStatus.BAD_REQUEST) -- -- # Check if the lora adapter with the given name already exists -- if any(lora_request.lora_name == request.lora_name -- for lora_request in self.lora_requests): -- return self.create_error_response( -- message= -- f"The lora adapter '{request.lora_name}' has already been" -- "loaded.", -- err_type="InvalidUserInput", -- status_code=HTTPStatus.BAD_REQUEST) -- -- return None -- -- async def _check_unload_lora_adapter_request( -- self, -- request: UnloadLoraAdapterRequest) -> Optional[ErrorResponse]: -- # Check if either 'lora_name' or 'lora_int_id' is provided -- if not request.lora_name and not request.lora_int_id: -- return self.create_error_response( -- message= -- "either 'lora_name' and 'lora_int_id' needs to be provided.", -- err_type="InvalidUserInput", -- status_code=HTTPStatus.BAD_REQUEST) -- -- # Check if the lora adapter with the given name exists -- if not any(lora_request.lora_name == request.lora_name -- for lora_request in self.lora_requests): -- return self.create_error_response( -- message= -- f"The lora adapter '{request.lora_name}' cannot be found.", -- err_type="InvalidUserInput", -- status_code=HTTPStatus.BAD_REQUEST) -- -- return None -- -- async def load_lora_adapter( -- self, -- request: LoadLoraAdapterRequest) -> Union[ErrorResponse, str]: -- error_check_ret = await self._check_load_lora_adapter_request(request) -- if error_check_ret is not None: -- return error_check_ret -- -- lora_name, lora_path = request.lora_name, request.lora_path -- unique_id = self.lora_id_counter.inc(1) -- self.lora_requests.append( -- LoRARequest(lora_name=lora_name, -- lora_int_id=unique_id, -- lora_path=lora_path)) -- return f"Success: LoRA adapter '{lora_name}' added successfully." -- -- async def unload_lora_adapter( -- self, -- request: UnloadLoraAdapterRequest) -> Union[ErrorResponse, str]: -- error_check_ret = await self._check_unload_lora_adapter_request(request -- ) -- if error_check_ret is not None: -- return error_check_ret -- -- lora_name = request.lora_name -- self.lora_requests = [ -- lora_request for lora_request in self.lora_requests -- if lora_request.lora_name != lora_name -- ] -- return f"Success: LoRA adapter '{lora_name}' removed successfully." -- - def _is_model_supported(self, model_name): -- return any(model.name == model_name for model in self.base_model_paths) -- -- def _get_model_name(self, lora: Optional[LoRARequest]): -- """ -- Returns the appropriate model name depending on the availability -- and support of the LoRA or base model. -- Parameters: -- - lora: LoRARequest that contain a base_model_name. -- Returns: -- - str: The name of the base model or the first available model path. 
-- """ -- if lora is not None: -- return lora.lora_name -- return self.base_model_paths[0].name -+ return self.models.is_base_model(model_name) -diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py -new file mode 100644 -index 000000000..26966896b ---- /dev/null -+++ b/vllm/entrypoints/openai/serving_models.py -@@ -0,0 +1,210 @@ -+import json -+import pathlib -+from dataclasses import dataclass -+from http import HTTPStatus -+from typing import List, Optional, Union -+ -+from vllm.config import ModelConfig -+from vllm.entrypoints.openai.protocol import (ErrorResponse, -+ LoadLoraAdapterRequest, -+ ModelCard, ModelList, -+ ModelPermission, -+ UnloadLoraAdapterRequest) -+from vllm.lora.request import LoRARequest -+from vllm.prompt_adapter.request import PromptAdapterRequest -+from vllm.utils import AtomicCounter -+ -+ -+@dataclass -+class BaseModelPath: -+ name: str -+ model_path: str -+ -+ -+@dataclass -+class PromptAdapterPath: -+ name: str -+ local_path: str -+ -+ -+@dataclass -+class LoRAModulePath: -+ name: str -+ path: str -+ base_model_name: Optional[str] = None -+ -+ -+class OpenAIServingModels: -+ """Shared instance to hold data about the loaded base model(s) and adapters. -+ -+ Handles the routes: -+ - /v1/models -+ - /v1/load_lora_adapter -+ - /v1/unload_lora_adapter -+ """ -+ -+ def __init__( -+ self, -+ model_config: ModelConfig, -+ base_model_paths: List[BaseModelPath], -+ *, -+ lora_modules: Optional[List[LoRAModulePath]] = None, -+ prompt_adapters: Optional[List[PromptAdapterPath]] = None, -+ ): -+ super().__init__() -+ -+ self.base_model_paths = base_model_paths -+ self.max_model_len = model_config.max_model_len -+ -+ self.lora_id_counter = AtomicCounter(0) -+ self.lora_requests = [] -+ if lora_modules is not None: -+ self.lora_requests = [ -+ LoRARequest(lora_name=lora.name, -+ lora_int_id=i, -+ lora_path=lora.path, -+ base_model_name=lora.base_model_name -+ if lora.base_model_name -+ and self.is_base_model(lora.base_model_name) else -+ self.base_model_paths[0].name) -+ for i, lora in enumerate(lora_modules, start=1) -+ ] -+ -+ self.prompt_adapter_requests = [] -+ if prompt_adapters is not None: -+ for i, prompt_adapter in enumerate(prompt_adapters, start=1): -+ with pathlib.Path(prompt_adapter.local_path, -+ "adapter_config.json").open() as f: -+ adapter_config = json.load(f) -+ num_virtual_tokens = adapter_config["num_virtual_tokens"] -+ self.prompt_adapter_requests.append( -+ PromptAdapterRequest( -+ prompt_adapter_name=prompt_adapter.name, -+ prompt_adapter_id=i, -+ prompt_adapter_local_path=prompt_adapter.local_path, -+ prompt_adapter_num_virtual_tokens=num_virtual_tokens)) -+ -+ def is_base_model(self, model_name): -+ return any(model.name == model_name for model in self.base_model_paths) -+ -+ def model_name(self, lora_request: Optional[LoRARequest] = None) -> str: -+ """Returns the appropriate model name depending on the availability -+ and support of the LoRA or base model. -+ Parameters: -+ - lora: LoRARequest that contain a base_model_name. -+ Returns: -+ - str: The name of the base model or the first available model path. -+ """ -+ if lora_request is not None: -+ return lora_request.lora_name -+ return self.base_model_paths[0].name -+ -+ async def show_available_models(self) -> ModelList: -+ """Show available models. 
This includes the base model and all -+ adapters""" -+ model_cards = [ -+ ModelCard(id=base_model.name, -+ max_model_len=self.max_model_len, -+ root=base_model.model_path, -+ permission=[ModelPermission()]) -+ for base_model in self.base_model_paths -+ ] -+ lora_cards = [ -+ ModelCard(id=lora.lora_name, -+ root=lora.local_path, -+ parent=lora.base_model_name if lora.base_model_name else -+ self.base_model_paths[0].name, -+ permission=[ModelPermission()]) -+ for lora in self.lora_requests -+ ] -+ prompt_adapter_cards = [ -+ ModelCard(id=prompt_adapter.prompt_adapter_name, -+ root=self.base_model_paths[0].name, -+ permission=[ModelPermission()]) -+ for prompt_adapter in self.prompt_adapter_requests -+ ] -+ model_cards.extend(lora_cards) -+ model_cards.extend(prompt_adapter_cards) -+ return ModelList(data=model_cards) -+ -+ async def load_lora_adapter( -+ self, -+ request: LoadLoraAdapterRequest) -> Union[ErrorResponse, str]: -+ error_check_ret = await self._check_load_lora_adapter_request(request) -+ if error_check_ret is not None: -+ return error_check_ret -+ -+ lora_name, lora_path = request.lora_name, request.lora_path -+ unique_id = self.lora_id_counter.inc(1) -+ self.lora_requests.append( -+ LoRARequest(lora_name=lora_name, -+ lora_int_id=unique_id, -+ lora_path=lora_path)) -+ return f"Success: LoRA adapter '{lora_name}' added successfully." -+ -+ async def unload_lora_adapter( -+ self, -+ request: UnloadLoraAdapterRequest) -> Union[ErrorResponse, str]: -+ error_check_ret = await self._check_unload_lora_adapter_request(request -+ ) -+ if error_check_ret is not None: -+ return error_check_ret -+ -+ lora_name = request.lora_name -+ self.lora_requests = [ -+ lora_request for lora_request in self.lora_requests -+ if lora_request.lora_name != lora_name -+ ] -+ return f"Success: LoRA adapter '{lora_name}' removed successfully." 
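
For readers skimming the serving_models.py hunks being dropped above: the load/unload flow amounts to a small in-memory registry keyed by adapter name with a monotonically increasing id, plus a name-resolution helper. The sketch below illustrates that reading; `AdapterRegistry`, `LoraEntry`, and their methods are hypothetical names, not the vLLM interface, and `itertools.count` stands in for the `AtomicCounter` used in the real code.

```python
import itertools
from dataclasses import dataclass
from typing import Optional

# Simplified illustration of the adapter bookkeeping shown above.
# `AdapterRegistry` and its methods are hypothetical, not vLLM's API.
@dataclass
class LoraEntry:
    name: str
    path: str
    adapter_id: int


class AdapterRegistry:
    def __init__(self, base_model_name: str):
        self.base_model_name = base_model_name
        self._next_id = itertools.count(start=1)   # stands in for AtomicCounter
        self.loras: list[LoraEntry] = []

    def load(self, name: str, path: str) -> str:
        if not name or not path:
            raise ValueError("Both 'lora_name' and 'lora_path' must be provided.")
        if any(entry.name == name for entry in self.loras):
            raise ValueError(f"The lora adapter '{name}' has already been loaded.")
        self.loras.append(LoraEntry(name, path, next(self._next_id)))
        return f"Success: LoRA adapter '{name}' added successfully."

    def unload(self, name: str) -> str:
        if not any(entry.name == name for entry in self.loras):
            raise ValueError(f"The lora adapter '{name}' cannot be found.")
        self.loras = [entry for entry in self.loras if entry.name != name]
        return f"Success: LoRA adapter '{name}' removed successfully."

    def model_name(self, lora_name: Optional[str] = None) -> str:
        # Requests that target a loaded adapter are reported under the adapter
        # name; everything else falls back to the base model name.
        return lora_name if lora_name is not None else self.base_model_name


registry = AdapterRegistry("base-model")
print(registry.load("my-adapter", "/tmp/my-adapter"))
print(registry.model_name("my-adapter"))   # my-adapter
print(registry.unload("my-adapter"))
```
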
-+ -+ async def _check_load_lora_adapter_request( -+ self, request: LoadLoraAdapterRequest) -> Optional[ErrorResponse]: -+ # Check if both 'lora_name' and 'lora_path' are provided -+ if not request.lora_name or not request.lora_path: -+ return create_error_response( -+ message="Both 'lora_name' and 'lora_path' must be provided.", -+ err_type="InvalidUserInput", -+ status_code=HTTPStatus.BAD_REQUEST) -+ -+ # Check if the lora adapter with the given name already exists -+ if any(lora_request.lora_name == request.lora_name -+ for lora_request in self.lora_requests): -+ return create_error_response( -+ message= -+ f"The lora adapter '{request.lora_name}' has already been" -+ "loaded.", -+ err_type="InvalidUserInput", -+ status_code=HTTPStatus.BAD_REQUEST) -+ -+ return None -+ -+ async def _check_unload_lora_adapter_request( -+ self, -+ request: UnloadLoraAdapterRequest) -> Optional[ErrorResponse]: -+ # Check if either 'lora_name' or 'lora_int_id' is provided -+ if not request.lora_name and not request.lora_int_id: -+ return create_error_response( -+ message= -+ "either 'lora_name' and 'lora_int_id' needs to be provided.", -+ err_type="InvalidUserInput", -+ status_code=HTTPStatus.BAD_REQUEST) -+ -+ # Check if the lora adapter with the given name exists -+ if not any(lora_request.lora_name == request.lora_name -+ for lora_request in self.lora_requests): -+ return create_error_response( -+ message= -+ f"The lora adapter '{request.lora_name}' cannot be found.", -+ err_type="InvalidUserInput", -+ status_code=HTTPStatus.BAD_REQUEST) -+ -+ return None -+ -+ -+def create_error_response( -+ message: str, -+ err_type: str = "BadRequestError", -+ status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse: -+ return ErrorResponse(message=message, -+ type=err_type, -+ code=status_code.value) -diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py -index 01852f0df..583032207 100644 ---- a/vllm/entrypoints/openai/serving_pooling.py -+++ b/vllm/entrypoints/openai/serving_pooling.py -@@ -15,7 +15,8 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse, - PoolingChatRequest, - PoolingRequest, PoolingResponse, - PoolingResponseData, UsageInfo) --from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing -+from vllm.entrypoints.openai.serving_engine import OpenAIServing -+from vllm.entrypoints.openai.serving_models import OpenAIServingModels - from vllm.logger import init_logger - from vllm.outputs import PoolingOutput, PoolingRequestOutput - from vllm.utils import merge_async_iterators -@@ -44,7 +45,7 @@ class OpenAIServingPooling(OpenAIServing): - self, - engine_client: EngineClient, - model_config: ModelConfig, -- base_model_paths: List[BaseModelPath], -+ models: OpenAIServingModels, - *, - request_logger: Optional[RequestLogger], - chat_template: Optional[str], -@@ -52,9 +53,7 @@ class OpenAIServingPooling(OpenAIServing): - ) -> None: - super().__init__(engine_client=engine_client, - model_config=model_config, -- base_model_paths=base_model_paths, -- lora_modules=None, -- prompt_adapters=None, -+ models=models, - request_logger=request_logger) - - self.chat_template = chat_template -diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py -index a8a126e69..5d3e7139d 100644 ---- a/vllm/entrypoints/openai/serving_score.py -+++ b/vllm/entrypoints/openai/serving_score.py -@@ -10,7 +10,8 @@ from vllm.entrypoints.logger import RequestLogger - from vllm.entrypoints.openai.protocol import 
(ErrorResponse, ScoreRequest, - ScoreResponse, ScoreResponseData, - UsageInfo) --from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing -+from vllm.entrypoints.openai.serving_engine import OpenAIServing -+from vllm.entrypoints.openai.serving_models import OpenAIServingModels - from vllm.inputs.data import TokensPrompt - from vllm.logger import init_logger - from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput -@@ -50,15 +51,13 @@ class OpenAIServingScores(OpenAIServing): - self, - engine_client: EngineClient, - model_config: ModelConfig, -- base_model_paths: List[BaseModelPath], -+ models: OpenAIServingModels, - *, - request_logger: Optional[RequestLogger], - ) -> None: - super().__init__(engine_client=engine_client, - model_config=model_config, -- base_model_paths=base_model_paths, -- lora_modules=None, -- prompt_adapters=None, -+ models=models, - request_logger=request_logger) - - async def create_score( -diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py -index 2e8493336..b67ecfb01 100644 ---- a/vllm/entrypoints/openai/serving_tokenization.py -+++ b/vllm/entrypoints/openai/serving_tokenization.py -@@ -15,9 +15,8 @@ from vllm.entrypoints.openai.protocol import (DetokenizeRequest, - TokenizeRequest, - TokenizeResponse) - # yapf: enable --from vllm.entrypoints.openai.serving_engine import (BaseModelPath, -- LoRAModulePath, -- OpenAIServing) -+from vllm.entrypoints.openai.serving_engine import OpenAIServing -+from vllm.entrypoints.openai.serving_models import OpenAIServingModels - from vllm.logger import init_logger - - logger = init_logger(__name__) -@@ -29,18 +28,15 @@ class OpenAIServingTokenization(OpenAIServing): - self, - engine_client: EngineClient, - model_config: ModelConfig, -- base_model_paths: List[BaseModelPath], -+ models: OpenAIServingModels, - *, -- lora_modules: Optional[List[LoRAModulePath]], - request_logger: Optional[RequestLogger], - chat_template: Optional[str], - chat_template_content_format: ChatTemplateContentFormatOption, - ) -> None: - super().__init__(engine_client=engine_client, - model_config=model_config, -- base_model_paths=base_model_paths, -- lora_modules=lora_modules, -- prompt_adapters=None, -+ models=models, - request_logger=request_logger) - - self.chat_template = chat_template -diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py -index c4d90f085..bc3282652 100644 ---- a/vllm/executor/multiproc_worker_utils.py -+++ b/vllm/executor/multiproc_worker_utils.py -@@ -1,5 +1,4 @@ - import asyncio --import multiprocessing - import os - import sys - import threading -@@ -13,10 +12,9 @@ from typing import (Any, Callable, Dict, Generic, List, Optional, TextIO, - - import torch - --import vllm.envs as envs - from vllm.logger import init_logger - from vllm.triton_utils.importing import HAS_TRITON --from vllm.utils import cuda_is_initialized -+from vllm.utils import _check_multiproc_method, get_mp_context - - if HAS_TRITON: - from vllm.triton_utils import maybe_set_triton_cache_manager -@@ -274,24 +272,6 @@ def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None: - file.write = write_with_prefix # type: ignore[method-assign] - - --def _check_multiproc_method(): -- if (cuda_is_initialized() -- and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"): -- logger.warning("CUDA was previously initialized. We must use " -- "the `spawn` multiprocessing start method. 
Setting " -- "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. " -- "See https://docs.vllm.ai/en/latest/getting_started/" -- "debugging.html#python-multiprocessing " -- "for more information.") -- os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -- -- --def get_mp_context(): -- _check_multiproc_method() -- mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD -- return multiprocessing.get_context(mp_method) -- -- - def set_multiprocessing_worker_envs(parallel_config): - """ Set up environment variables that should be used when there are workers - in a multiprocessing environment. This should be called by the parent -diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py -index e2c549cbd..7e5d56019 100644 ---- a/vllm/executor/ray_gpu_executor.py -+++ b/vllm/executor/ray_gpu_executor.py -@@ -10,7 +10,7 @@ import vllm.envs as envs - from vllm.executor.distributed_gpu_executor import ( # yapf: disable - DistributedGPUExecutor, DistributedGPUExecutorAsync) - from vllm.executor.msgspec_utils import encode_hook --from vllm.executor.ray_utils import RayWorkerWrapper, ray -+from vllm.executor.ray_utils import ray - from vllm.logger import init_logger - from vllm.model_executor.layers.sampler import SamplerOutput - from vllm.sequence import ExecuteModelRequest -@@ -96,6 +96,7 @@ class RayGPUExecutor(DistributedGPUExecutor): - - def _init_workers_ray(self, placement_group: "PlacementGroup", - **ray_remote_kwargs): -+ from vllm.executor.ray_utils import RayWorkerWrapper - if (self.parallel_config.tensor_parallel_size == 1 - and self.parallel_config.pipeline_parallel_size == 1): - # For single GPU case, we use a ray worker with constrained memory. -@@ -310,6 +311,114 @@ class RayGPUExecutor(DistributedGPUExecutor): + if request.mm_processor_kwargs is not None: +diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py +index 9b0b98731..dd6e6be9d 100644 +--- a/vllm/executor/ray_distributed_executor.py ++++ b/vllm/executor/ray_distributed_executor.py +@@ -72,7 +72,7 @@ class RayDistributedExecutor(DistributedExecutorBase): + + def _init_executor(self) -> None: + self.forward_dag: Optional[ray.dag.CompiledDAG] = None +- if envs.VLLM_USE_V1: ++ if envs.VLLM_USE_V1 and not current_platform.is_xpu(): + # V1 uses SPMD worker and compiled DAG + os.environ["VLLM_USE_RAY_SPMD_WORKER"] = "1" + os.environ["VLLM_USE_RAY_COMPILED_DAG"] = "1" +@@ -124,10 +124,12 @@ class RayDistributedExecutor(DistributedExecutorBase): + self.driver_worker.execute_method) + + def shutdown(self) -> None: ++ ''' + logger.info( + "Shutting down Ray distributed executor. If you see error log " + "from logging.cc regarding SIGTERM received, please ignore because " + "this is the expected termination process in Ray.") ++ ''' + if hasattr(self, "forward_dag") and self.forward_dag is not None: + self.forward_dag.teardown() + import ray +@@ -428,6 +430,7 @@ class RayDistributedExecutor(DistributedExecutorBase): else: self.non_driver_workers.append(worker) -+ def _get_env_vars_to_be_updated(self): -+ # Get the set of GPU IDs used on each node. -+ worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids", -+ use_dummy_driver=True) -+ -+ node_workers = defaultdict(list) -+ node_gpus = defaultdict(list) -+ -+ for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): -+ node_workers[node_id].append(i) -+ # `gpu_ids` can be a list of strings or integers. -+ # convert them to integers for consistency. -+ # NOTE: gpu_ids can be larger than 9 (e.g. 
16 GPUs), -+ # string sorting is not sufficient. -+ # see https://github.com/vllm-project/vllm/issues/5590 -+ gpu_ids = [int(x) for x in gpu_ids] -+ node_gpus[node_id].extend(gpu_ids) -+ for node_id, gpu_ids in node_gpus.items(): -+ node_gpus[node_id] = sorted(gpu_ids) -+ -+ # Set environment variables for the driver and workers. -+ all_args_to_update_env_vars = self._get_env_vars_to_be_updated() -+ -+ self._run_workers("update_environment_variables", -+ all_args=all_args_to_update_env_vars) -+ -+ if len(node_gpus) == 1: -+ # in single node case, we don't need to get the IP address. -+ # the loopback address is sufficient -+ # NOTE: a node may have several IP addresses, one for each -+ # network interface. `get_ip()` might return any of them, -+ # while they might not work for communication inside the node -+ # if the network setup is complicated. Using the loopback address -+ # solves this issue, as it always works for communication inside -+ # the node. -+ driver_ip = "127.0.0.1" -+ distributed_init_method = get_distributed_init_method( -+ driver_ip, get_open_port()) -+ -+ error_on_invalid_device_count_status() -+ -+ # Initialize the actual workers inside worker wrapper. -+ init_worker_all_kwargs = [ -+ self._get_worker_kwargs( -+ local_rank=node_workers[node_id].index(rank), -+ rank=rank, -+ distributed_init_method=distributed_init_method, -+ ) for rank, (node_id, -+ _) in zip(worker_ranks, worker_node_and_gpu_ids) -+ ] -+ self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) -+ -+ self._run_workers("init_device") -+ self._run_workers("load_model", -+ max_concurrent_workers=self.parallel_config. -+ max_parallel_loading_workers) -+ -+ # This is the list of workers that are rank 0 of each TP group EXCEPT -+ # global rank 0. These are the workers that will broadcast to the -+ # rest of the workers. -+ self.tp_driver_workers: List[RayWorkerWrapper] = [] -+ # This is the list of workers that are not drivers and not the first -+ # worker in a TP group. These are the workers that will be -+ # broadcasted to. -+ self.non_driver_workers: List[RayWorkerWrapper] = [] -+ -+ # Enforce rank order for correct rank to return final output. -+ for rank, worker in sorted(zip(worker_ranks[1:], self.workers)): -+ # We need to skip the driver worker, which we -+ # do by skipping worker_ranks[0] which is always 0. -+ if rank % self.parallel_config.tensor_parallel_size == 0: -+ self.tp_driver_workers.append(worker) -+ else: -+ self.non_driver_workers.append(worker) -+ -+ def _get_env_vars_to_be_updated(self): -+ # Get the set of GPU IDs used on each node. -+ worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids", -+ use_dummy_driver=True) -+ -+ node_workers = defaultdict(list) -+ node_gpus = defaultdict(list) -+ -+ for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): -+ node_workers[node_id].append(i) -+ # `gpu_ids` can be a list of strings or integers. -+ # convert them to integers for consistency. -+ # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs), -+ # string sorting is not sufficient. -+ # see https://github.com/vllm-project/vllm/issues/5590 -+ gpu_ids = [int(x) for x in gpu_ids] -+ node_gpus[node_id].extend(gpu_ids) -+ for node_id, gpu_ids in node_gpus.items(): -+ node_gpus[node_id] = sorted(gpu_ids) -+ -+ VLLM_INSTANCE_ID = get_vllm_instance_id() -+ -+ # Set environment variables for the driver and workers. 
-+ all_args_to_update_environment_variables = [({ -+ "CUDA_VISIBLE_DEVICES": -+ ",".join(map(str, node_gpus[node_id])), -+ "VLLM_INSTANCE_ID": -+ VLLM_INSTANCE_ID, -+ "VLLM_TRACE_FUNCTION": -+ str(envs.VLLM_TRACE_FUNCTION), -+ }, ) for (node_id, _) in worker_node_and_gpu_ids] -+ return all_args_to_update_environment_variables + def _driver_execute_model( self, execute_model_req: Optional[ExecuteModelRequest] ) -> Optional[List[SamplerOutput]]: diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py -index 426aa1b5c..8d766bad1 100644 +index 37cc07bfb..e12cf7fba 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py -@@ -8,7 +8,6 @@ import msgspec - from vllm.config import ParallelConfig - from vllm.executor.msgspec_utils import decode_hook, encode_hook - from vllm.logger import init_logger --from vllm.platforms import current_platform - from vllm.sequence import ExecuteModelRequest, IntermediateTensors - from vllm.utils import get_ip - from vllm.worker.worker_base import WorkerWrapperBase -@@ -229,6 +228,7 @@ def initialize_ray_cluster( +@@ -281,6 +281,10 @@ def initialize_ray_cluster( + ray_address: The address of the Ray cluster. If None, uses the default Ray cluster address. """ ++ lowbit = os.getenv("IPEX_LLM_LOWBIT", None) ++ if lowbit is not None: ++ from ipex_llm.vllm.xpu.model_convert import _ipex_llm_convert ++ _ipex_llm_convert(lowbit) assert_ray_available() -+ from vllm.platforms import current_platform - - # Connect to a ray cluster. - if current_platform.is_rocm() or current_platform.is_xpu(): -diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py -index d54cbb5c3..cdaf6dd76 100644 ---- a/vllm/inputs/data.py -+++ b/vllm/inputs/data.py -@@ -250,7 +250,7 @@ class SingletonInputsAdapter: - if inputs["type"] == "token" or inputs["type"] == "multimodal": - return inputs.get("prompt") - -- assert_never(inputs) -+ assert_never(inputs) # type: ignore[arg-type] - - @cached_property - def prompt_token_ids(self) -> List[int]: -@@ -259,7 +259,7 @@ class SingletonInputsAdapter: - if inputs["type"] == "token" or inputs["type"] == "multimodal": - return inputs.get("prompt_token_ids", []) - -- assert_never(inputs) -+ assert_never(inputs) # type: ignore[arg-type] - - @cached_property - def token_type_ids(self) -> List[int]: -@@ -268,7 +268,7 @@ class SingletonInputsAdapter: - if inputs["type"] == "token" or inputs["type"] == "multimodal": - return inputs.get("token_type_ids", []) - -- assert_never(inputs) -+ assert_never(inputs) # type: ignore[arg-type] - - @cached_property - def prompt_embeds(self) -> Optional[torch.Tensor]: -@@ -277,7 +277,7 @@ class SingletonInputsAdapter: - if inputs["type"] == "token" or inputs["type"] == "multimodal": - return None - -- assert_never(inputs) -+ assert_never(inputs) # type: ignore[arg-type] - - @cached_property - def multi_modal_data(self) -> "MultiModalDataDict": -@@ -289,7 +289,7 @@ class SingletonInputsAdapter: - if inputs["type"] == "multimodal": - return inputs.get("mm_kwargs", {}) - -- assert_never(inputs) -+ assert_never(inputs) # type: ignore[arg-type] - - @cached_property - def multi_modal_inputs(self) -> Union[Dict, "MultiModalKwargs"]: -@@ -301,7 +301,7 @@ class SingletonInputsAdapter: - if inputs["type"] == "multimodal": - return inputs.get("mm_kwargs", {}) - -- assert_never(inputs) -+ assert_never(inputs) # type: ignore[arg-type] - - @cached_property - def multi_modal_hashes(self) -> List[str]: -@@ -311,9 +311,10 @@ class SingletonInputsAdapter: - return inputs.get("multi_modal_hashes", []) - - if inputs["type"] 
== "multimodal": -- return inputs.get("mm_hashes", []) -+ # only the case when we use MultiModalInputsV2 -+ return inputs.get("mm_hashes", []) # type: ignore[return-value] + from vllm.platforms import current_platform -- assert_never(inputs) -+ assert_never(inputs) # type: ignore[arg-type] - - @cached_property - def multi_modal_placeholders(self) -> "MultiModalPlaceholderDict": -@@ -325,7 +326,7 @@ class SingletonInputsAdapter: - if inputs["type"] == "multimodal": - return inputs.get("mm_placeholders", {}) - -- assert_never(inputs) -+ assert_never(inputs) # type: ignore[arg-type] - - @cached_property - def mm_processor_kwargs(self) -> Dict[str, Any]: -@@ -337,7 +338,7 @@ class SingletonInputsAdapter: - if inputs["type"] == "multimodal": - return {} - -- assert_never(inputs) -+ assert_never(inputs) # type: ignore[arg-type] - - - ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs] diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py -index 3d606817e..9939b3536 100644 +index 669fb96e6..d28984b4b 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py -@@ -184,10 +184,16 @@ class InputPreprocessor: - corresponding token IDs. - """ - tokenizer = self.get_tokenizer_group() -- -+ add_special_tokens = None -+ if self.model_config.hf_config.model_type == "whisper": -+ # For Whisper, special tokens should be provided by the user based -+ # on the task and language of their request. Also needed to avoid -+ # appending an EOS token to the prompt which disrupts generation. -+ add_special_tokens = False - return tokenizer.encode(request_id=request_id, - prompt=prompt, -- lora_request=lora_request) -+ lora_request=lora_request, -+ add_special_tokens=add_special_tokens) - - async def _tokenize_prompt_async( - self, -@@ -197,10 +203,17 @@ class InputPreprocessor: - ) -> List[int]: - """Async version of :meth:`_tokenize_prompt`.""" - tokenizer = self.get_tokenizer_group() -- -- return await tokenizer.encode_async(request_id=request_id, -- prompt=prompt, -- lora_request=lora_request) -+ add_special_tokens = None -+ if self.model_config.hf_config.model_type == "whisper": -+ # For Whisper, special tokens should be provided by the user based -+ # on the task and language of their request. Also needed to avoid -+ # appending an EOS token to the prompt which disrupts generation. 
-+ add_special_tokens = False -+ return await tokenizer.encode_async( -+ request_id=request_id, -+ prompt=prompt, -+ lora_request=lora_request, -+ add_special_tokens=add_special_tokens) - - def _can_process_multimodal(self) -> bool: - model_config = self.model_config -@@ -305,6 +318,7 @@ class InputPreprocessor: +@@ -339,6 +339,7 @@ class InputPreprocessor: tokens_content = parsed["content"] prompt_token_ids = tokens_content["prompt_token_ids"] @@ -21639,7 +10152,7 @@ index 3d606817e..9939b3536 100644 token_type_ids = tokens_content.get("token_type_ids") multi_modal_data = tokens_content.get("multi_modal_data") mm_processor_kwargs = tokens_content.get("mm_processor_kwargs") -@@ -319,6 +333,7 @@ class InputPreprocessor: +@@ -354,6 +355,7 @@ class InputPreprocessor: return token_inputs( prompt_token_ids=prompt_token_ids, @@ -21647,298 +10160,19 @@ index 3d606817e..9939b3536 100644 token_type_ids=token_type_ids, multi_modal_data=multi_modal_data, mm_processor_kwargs=mm_processor_kwargs, -@@ -436,11 +451,18 @@ class InputPreprocessor: - or encoder_inputs["type"] == "multimodal"): - pass - else: -- assert_never(encoder_inputs) -+ assert_never(encoder_inputs) # type: ignore[arg-type] - - if decoder_inputs is None: -- dec_token_ids = self._prepare_decoder_input_ids_for_generation( -- None) -+ if self.model_config.hf_config.model_type == "whisper": -+ # For Whisper models, the text prompt should go to the decoder. -+ # If no explicit encoder/decoder inputs, then copy the prompt -+ # from the encoder to the decoder. The encoder tokens are later -+ # overridden by the audio features. -+ dec_token_ids = encoder_inputs["prompt_token_ids"].copy() -+ else: -+ dec_token_ids = self._prepare_decoder_input_ids_for_generation( -+ None) - decoder_inputs = token_inputs(dec_token_ids) - elif (decoder_inputs["type"] == "token" - or decoder_inputs["type"] == "multimodal"): -@@ -452,7 +474,7 @@ class InputPreprocessor: - raise ValueError("Multi-modal decoder inputs of encoder-" - "decoder models are not supported yet") - else: -- assert_never(encoder_inputs) -+ assert_never(encoder_inputs) # type: ignore[arg-type] - - return EncoderDecoderInputs( - encoder=encoder_inputs, -@@ -569,7 +591,7 @@ class InputPreprocessor: - prompt_adapter_request=prompt_adapter_request, - ) - else: -- assert_never(prompt_inputs) -+ assert_never(prompt_inputs) # type: ignore[arg-type] - - return prompt_inputs - diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py -index f3ec9d115..2d9d024e0 100644 +index 0579893e5..dfb422ff5 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py -@@ -99,6 +99,9 @@ class InputContext: - - merged_kwargs = {**base_kwargs, **kwargs} - -+ if isinstance(typ, type): -+ merged_kwargs["processor_cls"] = typ -+ - hf_processor = cached_get_processor( - self.model_config.model, - trust_remote_code=self.model_config.trust_remote_code, -@@ -132,10 +135,13 @@ class InputProcessingContext(InputContext): - def call_hf_processor( - self, - hf_processor: ProcessorMixin, -- prompt: str, -- processor_data: Mapping[str, object], -- inference_kwargs: Mapping[str, object], -+ data: Mapping[str, object], -+ kwargs: Mapping[str, object] = {}, - ) -> BatchFeature: -+ """ -+ Call :code:`hf_processor` on the prompt :code:`data` -+ (text, image, audio...) with configurable options :code:`kwargs`. 
-+ """ - assert callable(hf_processor) - - base_kwargs = self.model_config.mm_processor_kwargs -@@ -144,21 +150,15 @@ class InputProcessingContext(InputContext): - - merged_kwargs = resolve_mm_processor_kwargs( - base_kwargs, -- inference_kwargs, -+ kwargs, - hf_processor, - requires_kw_only=False, - allow_var_kwargs=True, - ) - - try: -- return hf_processor( -- text=prompt, -- **processor_data, -- **merged_kwargs, -- return_tensors="pt", -- ) -+ return hf_processor(**data, **merged_kwargs, return_tensors="pt") - except Exception as exc: -- data = dict(text=prompt, **processor_data) - msg = (f"Failed to apply {type(hf_processor).__name__} " - f"on data={data} with kwargs={merged_kwargs}") - -@@ -331,13 +331,7 @@ class InputRegistry: - trust_remote_code=model_config.trust_remote_code, - ) - processor = mm_registry.create_processor(model_config, tokenizer) -- -- mm_counts = mm_registry.get_mm_limits_per_prompt(model_config) -- mm_max_tokens = mm_registry.get_max_tokens_by_modality( -- model_config) -- -- dummy_data = processor.get_dummy_data(seq_len, mm_counts, -- mm_max_tokens) -+ dummy_data = processor.get_dummy_data(seq_len) - else: - model_cls, _ = get_model_architecture(model_config) - if is_encoder_data: -@@ -425,7 +419,7 @@ class InputRegistry: - # Be more strict in V2 - assert "mm_kwargs" in inputs - else: -- assert_never(inputs["type"]) -+ assert_never(inputs["type"]) # type: ignore[arg-type] - - def process_input(self, model_config: "ModelConfig", - inputs: ProcessorInputs) -> ProcessorInputs: -diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py -index 85164c216..a933ccaec 100644 ---- a/vllm/lora/layers.py -+++ b/vllm/lora/layers.py -@@ -405,7 +405,9 @@ class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA): - self.output_size = self.base_layer.output_size - self.n_slices = 1 - -- def forward(self, input_): -+ def forward( -+ self, input_: torch.Tensor -+ ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: - """Forward of ReplicatedLinearWithLoRA - - Args: -@@ -479,7 +481,7 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA): - # ColumnParallelLinear. 
- else: - tensor_model_parallel_rank = get_tensor_model_parallel_rank() -- shard_size = self.output_dim -+ shard_size = self.output_size - start_idx = tensor_model_parallel_rank * shard_size - end_idx = (tensor_model_parallel_rank + 1) * shard_size - lora_b = lora_b[:, start_idx:end_idx] -@@ -490,13 +492,15 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA): - if bias is None: - return bias - tensor_model_parallel_rank = get_tensor_model_parallel_rank() -- shard_size = self.output_dim -+ shard_size = self.output_size - start_idx = tensor_model_parallel_rank * shard_size - end_idx = (tensor_model_parallel_rank + 1) * shard_size - bias = bias[start_idx:end_idx] - return bias - -- def forward(self, input_): -+ def forward( -+ self, input_: torch.Tensor -+ ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: - """Forward of ColumnParallelLinear - - Args: -@@ -833,7 +837,9 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): - def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: - return bias - -- def forward(self, input_): -+ def forward( -+ self, input_: torch.Tensor -+ ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: - """Forward of RowParallelLinear - - Args: -diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py -index dde347b78..93ad4651f 100644 ---- a/vllm/lora/lora.py -+++ b/vllm/lora/lora.py -@@ -67,15 +67,9 @@ class LoRALayerWeights: - peft_helper: PEFTHelper, - embeddings_tensor: Optional[torch.Tensor] = None, - ) -> "LoRALayerWeights": -- return cls( -- module_name, -- peft_helper.r, -- peft_helper.lora_alpha, -- None, -- None, -- None, -- embeddings_tensor, -- ) -+ return cls(module_name, peft_helper.r, peft_helper.lora_alpha, None, -+ None, None, embeddings_tensor, -+ peft_helper.vllm_lora_scaling_factor) - - @classmethod - def create_dummy_lora_weights( -diff --git a/vllm/lora/models.py b/vllm/lora/models.py -index 5c0e4e5cb..7e57d9c85 100644 ---- a/vllm/lora/models.py -+++ b/vllm/lora/models.py -@@ -4,7 +4,7 @@ import math - import os - import re - from dataclasses import dataclass, field --from typing import Any, Callable, Dict, List, Optional, Sequence, Type -+from typing import Any, Callable, Dict, List, Optional, Sequence, Type, Union - - import safetensors.torch - import torch -@@ -173,7 +173,7 @@ class LoRAModel(AdapterModel): - return cls(lora_model_id, - peft_helper.r, - loras, -- scaling_factor=peft_helper.vllm_scaling_factor) -+ scaling_factor=peft_helper.vllm_long_context_scaling_factor) - - @classmethod - def from_local_checkpoint( -@@ -219,6 +219,7 @@ class LoRAModel(AdapterModel): - - config["vllm_max_position_embeddings"] = max_position_embeddings - peft_helper = PEFTHelper.from_dict(config) -+ unexpected_modules: List[Union[list[str], str]] - if os.path.isfile(lora_tensor_path): - tensors: Dict[str, torch.Tensor] = {} - # Find unexpected modules. 
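
The ColumnParallelLinearWithLoRA hunk above changes the per-rank slice width from `output_dim` to `output_size`. Below is a minimal standalone sketch of that column-parallel LoRA-B slicing; it is not part of the diff, and the function and variable names are illustrative rather than vLLM's own:

import torch

def slice_lora_b(lora_b: torch.Tensor, tp_rank: int, output_size: int) -> torch.Tensor:
    # Each tensor-parallel rank keeps one contiguous block of output columns,
    # sized by the layer's per-partition output size (not the full output dim).
    start_idx = tp_rank * output_size
    end_idx = (tp_rank + 1) * output_size
    return lora_b[:, start_idx:end_idx]

# Example: a rank-16 LoRA-B whose full output spans 4 partitions of 1024 columns.
lora_b = torch.randn(16, 4 * 1024)
print(slice_lora_b(lora_b, tp_rank=2, output_size=1024).shape)  # torch.Size([16, 1024])

Each rank multiplies its activation shard only by its own block of LoRA-B columns, which is why the slice width has to match the per-partition output size.
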
-@@ -280,7 +281,8 @@ class LoRAModel(AdapterModel): - new_embeddings_tensor_path) - elif os.path.isfile(new_embeddings_bin_file_path): - embeddings = torch.load(new_embeddings_bin_file_path, -- map_location=device) -+ map_location=device, -+ weights_only=True) - - return cls.from_lora_tensors( - lora_model_id=get_lora_id() -diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py -index edf4ba565..ddd42ae93 100644 ---- a/vllm/lora/peft_helper.py -+++ b/vllm/lora/peft_helper.py -@@ -4,6 +4,8 @@ import math - from dataclasses import MISSING, dataclass, field, fields - from typing import Literal, Optional, Union - -+from vllm.utils import print_info_once -+ - - @dataclass - class PEFTHelper: -@@ -14,21 +16,22 @@ class PEFTHelper: - - bias: Literal["none", "all", "lora_only"] = field(default="none") - modules_to_save: Optional[list[str]] = field(default=None) -+ # True to use Rank-Stabilized LoRA (rsLoRA, see: https://arxiv.org/abs/2312.03732) - use_rslora: bool = field(default=False) -+ # True to use Weight-Decomposed Low-Rank Adaptation (DoRA, see: https://arxiv.org/abs/2402.09353) - use_dora: bool = field(default=False) -- # long lora field -+ # long context lora field - context_length: int = field(default=0) - # Extra vllm field, start with 'vllm_' to avoid conflict -+ vllm_lora_scaling_factor: float = field(default=1.0) - vllm_max_position_embeddings: Optional[int] = field(default=False) -- vllm_scaling_factor: Optional[float] = field(default=None) -+ vllm_long_context_scaling_factor: Optional[float] = field(default=None) - - def _validate_features(self): - error_msg = [] - - if self.modules_to_save: - error_msg.append("vLLM only supports modules_to_save being None.") -- if self.use_rslora: -- error_msg.append("vLLM does not yet support RSLoRA.") - - if self.use_dora: - error_msg.append("vLLM does not yet support DoRA.") -@@ -38,10 +41,15 @@ class PEFTHelper: - - def __post_init__(self): - self._validate_features() -+ if self.use_rslora: -+ print_info_once("Loading LoRA weights trained with rsLoRA.") -+ self.vllm_lora_scaling_factor = self.lora_alpha / math.sqrt(self.r) -+ else: -+ self.vllm_lora_scaling_factor = self.lora_alpha / self.r - if self.context_length: - if self.vllm_max_position_embeddings is None: - self.vllm_max_position_embeddings = self.context_length -- self.vllm_scaling_factor = float( -+ self.vllm_long_context_scaling_factor = float( - math.ceil(self.context_length / - self.vllm_max_position_embeddings)) +@@ -330,7 +330,7 @@ class InputRegistry: + from vllm.multimodal.profiling import MultiModalProfiler + from vllm.sequence import SequenceData +- if mm_registry.has_processor(model_config): ++ if False and mm_registry.has_processor(model_config): + processor = mm_registry.create_processor(model_config, + disable_cache=True) + profiler = MultiModalProfiler(processor) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py new file mode 100644 index 000000000..5f711bfe5 @@ -22114,14958 +10348,4127 @@ index 000000000..5f711bfe5 + base_indices.shape[-1], + sampler_indices.shape[-1], + sampler_indices_padded.shape[-1], -+ embeddings_indices.shape[-1], -+ ] -+ if long_lora_indices_len is not None: -+ indices_len.append(long_lora_indices_len) -+ else: -+ # If long_lora doesn't exist,append None -+ indices_len.append(None) -+ -+ return ( -+ base_indices, -+ sampler_indices, -+ sampler_indices_padded, -+ embeddings_indices, -+ long_lora_indices, -+ indices_len, -+ ) -+ -+ -+class PunicaWrapper: -+ """ -+ PunicaWrapper is designed to manage and provide metadata for the 
punica -+ kernel. The main function is to maintain the state information for -+ Multi-LoRA, and to provide the interface for the punica kernel. -+ """ -+ -+ def __init__(self, max_num_batched_tokens: int, max_batches: int, -+ device: torch.device): -+ self.device = device -+ self._token_lora_indices = torch.empty(max_num_batched_tokens, -+ dtype=torch.long, -+ device=device) -+ self._sampler_indices = torch.empty(max_num_batched_tokens, -+ dtype=torch.long, -+ device=device) -+ self._sampler_indices_padded = torch.empty(max_num_batched_tokens, -+ dtype=torch.long, -+ device=device) -+ self._embeddings_indices = torch.empty(2, -+ max_num_batched_tokens, -+ dtype=torch.long, -+ device=device) -+ self._long_lora_indices = torch.empty(max_num_batched_tokens, -+ dtype=torch.long, -+ device=device) -+ -+ # 5 is the number of indicies tensors. -+ # base_indices, sampler_indices, sampler_indices_padded, -+ # embeddings_indices,long_lora_indices -+ self.indices_len: List[Optional[int]] = [None] * 5 -+ # these attributes are the information required for sgmv kernel -+ self._seq_start_locs = torch.empty(max_batches, -+ dtype=torch.long, -+ device=device) -+ self._seq_lengths = torch.empty(max_batches, -+ dtype=torch.long, -+ device=device) -+ self._lora_indices_per_batch = torch.empty(max_batches, -+ dtype=torch.long, -+ device=device) -+ self.max_length: int = 0 -+ self.batch_size: int = -1 -+ self.is_prefill = False -+ self.no_lora = False -+ -+ def update_metadata( -+ self, -+ mapping: "LoRAMapping", -+ lora_index_to_id: List[Optional[int]], -+ max_loras: int, -+ vocab_size: int, -+ extra_vocab_size: int, -+ long_lora_context: Optional["LongContextLoRAContext"] = None, -+ ): -+ -+ self._update_base_metadata(mapping, lora_index_to_id, max_loras, -+ vocab_size, extra_vocab_size, -+ long_lora_context) -+ if mapping.is_prefill: -+ # Update metadata required for prefill-related operators. -+ self._update_prefill_metada(self.token_lora_indices) -+ self.is_prefill = True -+ else: -+ self.is_prefill = False -+ -+ def _update_base_metadata( -+ self, -+ mapping: "LoRAMapping", -+ lora_index_to_id: List[Optional[int]], -+ max_loras: int, -+ vocab_size: int, -+ extra_vocab_size: int, -+ long_lora_context: Optional["LongContextLoRAContext"] = None, -+ ): -+ ( -+ base_indices, -+ sampler_indices, -+ sampler_indices_padded, -+ embeddings_indices, -+ long_lora_offsets_tensor, -+ indices_len, -+ ) = convert_mapping( -+ mapping, -+ lora_index_to_id, -+ max_loras, -+ vocab_size, -+ extra_vocab_size, -+ long_lora_context, -+ self.device, -+ ) -+ self._token_lora_indices[:base_indices.shape[0]].copy_(base_indices) -+ self._sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices) -+ self._sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_( -+ sampler_indices_padded) -+ self._embeddings_indices[:embeddings_indices. 
-+ shape[0], :embeddings_indices.shape[1]].copy_( -+ embeddings_indices) -+ if long_lora_offsets_tensor is not None: -+ self._long_lora_indices[:long_lora_offsets_tensor.shape[0]].copy_( -+ long_lora_offsets_tensor) -+ else: -+ self._long_lora_indices.zero_() -+ -+ self.indices_len[:] = indices_len -+ -+ def _update_prefill_metada(self, token_lora_tensor: torch.Tensor) -> None: -+ -+ (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor, -+ batch_size, max_length, no_lora) = compute_meta(token_lora_tensor) -+ -+ self._seq_start_locs[:b_seq_start_tensor.shape[0]].copy_( -+ b_seq_start_tensor) -+ self._seq_lengths[:seq_length_tensor.shape[0]].copy_(seq_length_tensor) -+ self._lora_indices_per_batch[:lora_indices_tensor.shape[0]].copy_( -+ lora_indices_tensor) -+ self.batch_size = batch_size -+ self.max_length = max_length -+ self.no_lora = no_lora -+ -+ @property -+ def prefill_metadata( -+ self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int]: -+ """ -+ This property provides a convenient way to access the necessary -+ metadata for prefill-related kernel computations. -+ 1. seq_start_locs: Tensor of sequence start positions -+ 2. seq_lengths: Tensor of sequence lengths -+ 3. lora_indices_per_batch: Tensor of lora indices, and an index of -+ -1 means no lora should be applied. -+ 4. batch_size: batch size after clustering identical lora indices -+ 5. max_length: The maximum sequence length in the batch -+ """ -+ return (self._seq_start_locs[:self.batch_size], -+ self._seq_lengths[:self.batch_size], -+ self._lora_indices_per_batch[:self.batch_size], -+ self.batch_size, self.max_length) -+ -+ @property -+ def token_lora_indices(self) -> torch.Tensor: -+ """ -+ This property provides the lora indices corresponding to each token -+ in the batch. An index of -1 means no lora should be applied. 
-+ """ -+ token_lora_len = self.indices_len[0] -+ return self._token_lora_indices[:token_lora_len] -+ -+ @property -+ def sampler_indices(self) -> torch.Tensor: -+ """ -+ This property is used to access the lora indices specifically for -+ LogitsProcessorWithLoRA -+ """ -+ sampler_indices_len = self.indices_len[1] -+ return self._sampler_indices[:sampler_indices_len] -+ -+ @property -+ def sampler_indices_padded(self) -> torch.Tensor: -+ """ -+ This property provides access to padded sampler indices -+ """ -+ indices_padded_len = self.indices_len[2] -+ return self._sampler_indices_padded[:indices_padded_len] -+ -+ @property -+ def embeddings_indices(self) -> torch.Tensor: -+ """ -+ This property provides access to the indices used for lora embeddings, -+ specifically for VocabParallelEmbeddingWithLoRA -+ """ -+ embeddings_indices_len = self.indices_len[3] -+ return self._embeddings_indices[:, :embeddings_indices_len] -+ -+ @property -+ def long_lora_indices(self) -> torch.Tensor: -+ """ -+ This property provides access to the indices used for long context -+ lora, specifically for LinearScalingRotaryEmbeddingWithLora -+ """ -+ long_lora_len = self.indices_len[4] -+ return self._long_lora_indices[:long_lora_len] -+ -+ def shrink_prefill( -+ self, -+ y: torch.Tensor, -+ x: torch.Tensor, -+ w_t_all: torch.Tensor, -+ scale: float, -+ ): -+ #No LoRA request, so return directly -+ if self.no_lora: -+ return -+ sgmv_shrink( -+ x, -+ w_t_all, -+ y, -+ *self.prefill_metadata, -+ scale, -+ ) -+ -+ def shrink_decode( -+ self, -+ y: torch.Tensor, -+ x: torch.Tensor, -+ w_t_all: torch.Tensor, -+ scale: float, -+ ): -+ bgmv_shrink(x, w_t_all, y, self.token_lora_indices, scale) -+ -+ def expand_prefill( -+ self, -+ y: torch.Tensor, -+ x: torch.Tensor, -+ w_t_all: torch.Tensor, -+ add_input: bool, -+ ): -+ #No LoRA request, so return directly -+ if self.no_lora: -+ return -+ sgmv_expand( -+ x, -+ w_t_all, -+ y, -+ *self.prefill_metadata, -+ add_input, -+ ) -+ -+ def expand_decode( -+ self, -+ y: torch.Tensor, -+ x: torch.Tensor, -+ w_t_all: torch.Tensor, -+ add_input: bool, -+ ): -+ bgmv_expand(x, w_t_all, y, self.token_lora_indices, add_input) -+ -+ def expand_slice_prefill( -+ self, -+ y: torch.Tensor, -+ x: torch.Tensor, -+ w_t_all: torch.Tensor, -+ y_offset: Optional[int], -+ y_slice_size: Optional[int], -+ add_input: bool, -+ ): -+ #No LoRA request, so return directly -+ if self.no_lora: -+ return -+ sgmv_expand_slice( -+ x, -+ w_t_all, -+ y, -+ *self.prefill_metadata, -+ y_offset, -+ y_slice_size, -+ add_input, -+ ) -+ -+ def expand_slice_decode( -+ self, -+ y: torch.Tensor, -+ x: torch.Tensor, -+ w_t_all: torch.Tensor, -+ y_offset: Optional[int], -+ y_slice_size: Optional[int], -+ add_input: bool, -+ ): -+ bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset, -+ y_slice_size, add_input) -+ -+ def add_shrink( -+ self, -+ y: torch.Tensor, -+ x: torch.Tensor, -+ w_t_all: torch.Tensor, -+ scale: float, -+ ): -+ """ -+ Perform the ` y+=x@w_t_all` computation, which is suitable for the -+ GEMM of lora'a. -+ When `is_prefill is` true, it indicates that it is currently the -+ prefill stage, and the `shrink_prefill` function should be called. -+ Otherwise, it is the decode stage, and the shrink_decode function -+ should be called. 
-+ """ -+ shrink_fun: Callable = (self.shrink_prefill -+ if self.is_prefill else self.shrink_decode) -+ shrink_fun(y, x, w_t_all, scale) -+ -+ def add_expand( -+ self, -+ y: torch.Tensor, -+ x: torch.Tensor, -+ w_t_all: torch.Tensor, -+ add_input: bool = True, -+ ): -+ """ -+ Perform the ` y+=x@w_t_all` computation, which is suitable for the -+ GEMM of lora'b. -+ When `is_prefill` is true, it indicates that it is currently the -+ prefill stage, and the `expand_prefill` function should be called. -+ Otherwise, it is the decode stage, and the expand_decode function -+ should be called. -+ """ -+ -+ expand_fun: Callable = (self.expand_prefill -+ if self.is_prefill else self.expand_decode) -+ expand_fun(y, x, w_t_all, add_input) -+ -+ def add_expand_slice(self, -+ y: torch.Tensor, -+ x: torch.Tensor, -+ w_t_all: torch.Tensor, -+ y_offset: Optional[int], -+ y_slice_size: Optional[int], -+ add_input: bool = True): -+ """ -+ Similar to `add_expand` -+ """ -+ -+ expand_slice_fun: Callable = (self.expand_slice_prefill -+ if self.is_prefill else -+ self.expand_slice_decode) -+ expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) -+ -+ def add_lora(self, -+ y: torch.Tensor, -+ x: torch.Tensor, -+ wa_t_all: torch.Tensor, -+ wb_t_all: torch.Tensor, -+ scale: float, -+ y_offset: Optional[int] = None, -+ y_slice_size: Optional[int] = None, -+ *, -+ buffer: Optional[torch.Tensor] = None) -> None: -+ """ -+ Semantics: -+ y[i] += ( -+ x[i].unsqueeze(0) -+ @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) -+ @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) -+ * scale -+ ).squeeze(0) -+ Args: -+ y (torch.Tensor): Output tensor. Will be changed in-place. -+ x (torch.Tensor): Input tensor -+ wa_t_all (torch.Tensor): lora_a's weight -+ wb_t_all (torch.Tensor): lora_b's weight -+ scale (float): Scaling factor. -+ y_offset (Optional[int], optional): Offset to apply to the starting -+ column of y. -+ y_slice_size (Optional[int], optional): Size of the y column slice.. -+ buffer (Optional[torch.Tensor], optional): Defaults to None. -+ """ -+ y_org = y -+ y = y.view(-1, y.shape[-1]) -+ x = x.view(-1, x.shape[-1]) -+ r = wb_t_all.size(-1) -+ if buffer is None: -+ # We set the buffer to be float32 by default ,refer to: -+ # https://github.com/triton-lang/triton/issues/1387 -+ buffer = torch.zeros((x.size(0), r), -+ dtype=torch.float32, -+ device=x.device) -+ -+ self.add_shrink(buffer, x, wa_t_all, scale) -+ if y_offset is None and y_slice_size is None: -+ self.add_expand(y, buffer, wb_t_all, add_input=True) -+ else: -+ self.add_expand_slice(y, -+ buffer, -+ wb_t_all, -+ y_offset, -+ y_slice_size, -+ add_input=True) -+ y = y.view_as(y_org) -+ -+ def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, -+ lora_a_stacked: Tuple[torch.Tensor, -+ torch.Tensor, -+ torch.Tensor], -+ lora_b_stacked: Tuple[torch.Tensor, -+ torch.Tensor, -+ torch.Tensor], -+ scale: float, -+ output_slices: Tuple[int, ...]) -> None: -+ """ -+ Applies lora to each input. Similar to add_lora, This method is -+ used for layers that are composed of multiple sublayers -+ (slices) packed together. 
-+ """ -+ y_org = y -+ x = x.view(-1, x.shape[-1]) -+ y = y.view(-1, y.shape[-1]) -+ offset_left = 0 -+ # TODO fuse these kernels -+ for slice_idx in range(len(output_slices)): -+ self.add_lora(y, x, lora_a_stacked[slice_idx], -+ lora_b_stacked[slice_idx], scale, offset_left, -+ output_slices[slice_idx]) -+ offset_left += output_slices[slice_idx] -+ -+ y = y.view_as(y_org) -+ -+ def add_lora_logits(self, -+ y: torch.Tensor, -+ x: torch.Tensor, -+ wa_t_all: torch.Tensor, -+ wb_t_all: torch.Tensor, -+ scale, -+ *, -+ buffer: Optional[torch.Tensor] = None) -> None: -+ """ -+ LogitsProcessorWithLoRA always using bgmv -+ """ -+ y_org = y -+ y = y.view(-1, y.shape[-1]) -+ x = x.view(-1, x.shape[-1]) -+ r = wb_t_all.size(-1) -+ if buffer is None: -+ # We set the buffer to be float32 by default ,refer to: -+ # https://github.com/triton-lang/triton/issues/1387 -+ buffer = torch.zeros((x.size(0), r), -+ dtype=torch.float32, -+ device=x.device) -+ -+ bgmv_shrink(x, wa_t_all, buffer, self.sampler_indices, scale) -+ bgmv_expand(buffer, wb_t_all, y, self.sampler_indices, add_inputs=True) -+ y = y.view_as(y_org) -diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py -index de378df8b..e33016ea0 100644 ---- a/vllm/lora/punica_wrapper/punica_gpu.py -+++ b/vllm/lora/punica_wrapper/punica_gpu.py -@@ -9,15 +9,24 @@ from typing import Callable, Optional, Tuple, Union, final - - import torch - -+from vllm.platforms import current_platform - from vllm.triton_utils import HAS_TRITON - --if HAS_TRITON: -+if HAS_TRITON and not current_platform.is_xpu(): - from vllm.lora.ops.bgmv_expand import bgmv_expand - from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice - from vllm.lora.ops.bgmv_shrink import bgmv_shrink - from vllm.lora.ops.sgmv_expand import sgmv_expand - from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice - from vllm.lora.ops.sgmv_shrink import sgmv_shrink -+elif current_platform.is_xpu(): -+ from vllm._ipex_ops import ipex_ops -+ bgmv_expand = ipex_ops.bgmv_expand -+ bgmv_expand_slice = ipex_ops.bgmv_expand_slice -+ bgmv_shrink = ipex_ops.bgmv_shrink -+ sgmv_expand = ipex_ops.sgmv_expand -+ sgmv_expand_slice = ipex_ops.sgmv_expand_slice -+ sgmv_shrink = ipex_ops.sgmv_shrink - - from .punica_base import PunicaWrapperBase - -diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py -index cd64878d9..0cc2a6d81 100644 ---- a/vllm/lora/punica_wrapper/punica_selector.py -+++ b/vllm/lora/punica_wrapper/punica_selector.py -@@ -5,7 +5,7 @@ from .punica_base import PunicaWrapperBase - - - def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase: -- if current_platform.is_cuda_alike(): -+ if current_platform.is_cuda_alike() or current_platform.is_xpu(): - # Lazy import to avoid ImportError - from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU - print_info_once("Using PunicaWrapperGPU.") -diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py -index 694c5b68b..18b435a42 100644 ---- a/vllm/model_executor/guided_decoding/__init__.py -+++ b/vllm/model_executor/guided_decoding/__init__.py -@@ -6,7 +6,7 @@ from vllm.logger import init_logger - from vllm.model_executor.guided_decoding.utils import ( - convert_lark_to_gbnf, grammar_is_likely_lark, - has_lmf_unsupported_json_features, has_xgrammar_unsupported_json_features) --from vllm.platforms import CpuArchEnum, current_platform -+from vllm.platforms import CpuArchEnum - - if TYPE_CHECKING: - from 
transformers import PreTrainedTokenizer -@@ -39,6 +39,7 @@ def maybe_backend_fallback( - - if guided_params.backend == "xgrammar": - # xgrammar only has x86 wheels for linux, fallback to outlines -+ from vllm.platforms import current_platform - if current_platform.get_cpu_architecture() is not CpuArchEnum.X86: - logger.warning("xgrammar is only supported on x86 CPUs. " - "Falling back to use outlines instead.") -diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py -index 5e1948977..f10a8fb8e 100644 ---- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py -+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py -@@ -1,6 +1,7 @@ - # noqa: UP007 - from __future__ import annotations - -+import copy - import json - from dataclasses import dataclass, field - from typing import TYPE_CHECKING, Any -@@ -309,3 +310,7 @@ class XGrammarLogitsProcessor: - scores = scores.to(device_type).squeeze() - - return scores -+ -+ def clone(self) -> XGrammarLogitsProcessor: -+ """Deepcopy due to per-sequence state in the matchers""" -+ return copy.deepcopy(self) -diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py -index 34d65ed51..043793547 100644 ---- a/vllm/model_executor/layers/activation.py -+++ b/vllm/model_executor/layers/activation.py -@@ -189,12 +189,12 @@ class QuickGELU(CustomOp): - ops.gelu_quick(out, x) - return out - -- def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: -- from vllm._ipex_ops import ipex_ops as ops -+ # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: -+ # from vllm._ipex_ops import ipex_ops as ops - -- out = torch.empty_like(x) -- ops.gelu_quick(out, x) -- return out -+ # out = torch.empty_like(x) -+ # ops.gelu_quick(out, x) -+ # return out - - # TODO implement forward_xpu for QuickGELU - # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: -diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py -index b108cbd52..7267e62a5 100644 ---- a/vllm/model_executor/layers/fused_moe/layer.py -+++ b/vllm/model_executor/layers/fused_moe/layer.py -@@ -22,6 +22,10 @@ if current_platform.is_tpu(): - from .moe_pallas import fused_moe as fused_moe_pallas - else: - fused_moe_pallas = None # type: ignore -+if current_platform.is_xpu(): -+ from .moe_pallas import fused_moe_xpu -+else: -+ fused_moe_xpu = None # type: ignore - logger = init_logger(__name__) - - -@@ -146,6 +150,37 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): - raise NotImplementedError( - "The CPU backend currently does not support MoE.") - -+ def forward_xpu( -+ self, -+ layer: torch.nn.Module, -+ x: torch.Tensor, -+ use_grouped_topk: bool, -+ top_k: int, -+ router_logits: torch.Tensor, -+ renormalize: bool, -+ topk_group: Optional[int] = None, -+ num_expert_group: Optional[int] = None, -+ custom_routing_function: Optional[Callable] = None, -+ scoring_func: str = "softmax", -+ e_score_correction_bias: Optional[torch.Tensor] = None -+ ) -> torch.Tensor: -+ # assert not use_grouped_topk -+ # assert num_expert_group is None -+ # assert topk_group is None -+ # assert custom_routing_function is None -+ # if scoring_func != "softmax": -+ # raise NotImplementedError( -+ # "Only softmax scoring function is supported for TPU.") -+ # if e_score_correction_bias is not None: -+ # raise NotImplementedError( -+ # "Expert score correction bias is not supported for TPU.") -+ return fused_moe_xpu(hidden_states=x, -+ 
w1=layer.w13_weight, -+ w2=layer.w2_weight, -+ topk=top_k, -+ gating_output=router_logits, -+ renormalize=renormalize) -+ - def forward_tpu( - self, - layer: torch.nn.Module, -@@ -177,7 +212,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): - gating_output=router_logits, - renormalize=renormalize) - -- forward_native = forward_cuda -+ forward_native = forward_xpu - - - class FusedMoE(torch.nn.Module): -diff --git a/vllm/model_executor/layers/fused_moe/moe_pallas.py b/vllm/model_executor/layers/fused_moe/moe_pallas.py -index 563ee18c6..4e84e3edb 100644 ---- a/vllm/model_executor/layers/fused_moe/moe_pallas.py -+++ b/vllm/model_executor/layers/fused_moe/moe_pallas.py -@@ -1,9 +1,67 @@ - import torch - import torch.nn.functional as F --from torch_xla.experimental.custom_kernel import _histogram -+# from torch_xla.experimental.custom_kernel import _histogram - - --def fused_moe( -+# def fused_moe( -+# hidden_states: torch.Tensor, -+# w1: torch.Tensor, -+# w2: torch.Tensor, -+# gating_output: torch.Tensor, -+# topk: int, -+# renormalize: bool, -+# ) -> torch.Tensor: -+# """ -+# Args: -+# hidden_states: [*, hidden_size] -+# w1: [num_experts, intermediate_size * 2, hidden_size] -+# w2: [num_experts, hidden_size, intermediate_size] -+# gating_output: [*, num_experts] -+# """ -+# orig_shape = hidden_states.shape -+# hidden_size = hidden_states.shape[-1] -+# num_tokens = hidden_states.shape[:-1].numel() -+# num_experts = w1.shape[0] -+# intermediate_size = w2.shape[-1] -+# device = hidden_states.device -+# dtype = hidden_states.dtype -+# assert (num_tokens * topk) % 16 == 0, ( -+# "The Pallas GMM kernel requires num_tokens * topk to be a multiple of " -+# f"16 but got {num_tokens * topk}") -+ -+# hidden_states = hidden_states.view(num_tokens, hidden_size) -+# gating_output = gating_output.view(num_tokens, num_experts) -+# topk_weights = gating_output.softmax(dim=-1, dtype=torch.float) -+# topk_weights, topk_indices = topk_weights.topk(topk, dim=-1) -+# if renormalize: -+# topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) -+# topk_weights = topk_weights.to(dtype) -+ -+# topk_indices = topk_indices.flatten() -+# topk_argsort_indices = topk_indices.argsort() -+# topk_argsort_revert_indices = topk_argsort_indices.argsort() -+# token_indices = torch.arange(num_tokens, -+# device=device).repeat_interleave(topk) -+# token_indices = token_indices[topk_argsort_indices] -+# group_sizes = _histogram(topk_indices.to(torch.int32), 0, num_experts - 1) -+ -+# # NOTE(woosuk): The GMM Pallas kernel requires a different weight layout -+# # from HF Transformers. 
-+# w1 = w1.transpose(1, 2) -+# w2 = w2.transpose(1, 2) -+ -+# x = hidden_states[token_indices] -+# x = torch.ops.xla.gmm(x, w1, group_sizes) -+# x = F.silu(x[..., :intermediate_size]) * x[..., intermediate_size:] -+# x = torch.ops.xla.gmm(x, w2, group_sizes) -+# x = x[topk_argsort_revert_indices].reshape(-1, topk, hidden_size) -+ -+# x = x * topk_weights.unsqueeze_(dim=-1) -+# x = x.sum(dim=-2) -+# x = x.reshape(orig_shape) -+# return x -+ -+def fused_moe_xpu( - hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, -@@ -25,14 +83,9 @@ def fused_moe( - intermediate_size = w2.shape[-1] - device = hidden_states.device - dtype = hidden_states.dtype -- assert (num_tokens * topk) % 16 == 0, ( -- "The Pallas GMM kernel requires num_tokens * topk to be a multiple of " -- f"16 but got {num_tokens * topk}") -- - hidden_states = hidden_states.view(num_tokens, hidden_size) - gating_output = gating_output.view(num_tokens, num_experts) -- topk_weights = gating_output.softmax(dim=-1, dtype=torch.float) -- topk_weights, topk_indices = topk_weights.topk(topk, dim=-1) -+ topk_weights, topk_indices = F.softmax(gating_output, dim=-1, dtype=torch.float).topk(topk, dim=-1) - if renormalize: - topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) - topk_weights = topk_weights.to(dtype) -@@ -40,23 +93,40 @@ def fused_moe( - topk_indices = topk_indices.flatten() - topk_argsort_indices = topk_indices.argsort() - topk_argsort_revert_indices = topk_argsort_indices.argsort() -- token_indices = torch.arange(num_tokens, -- device=device).repeat_interleave(topk) -+ token_indices = torch.arange(num_tokens, device=device).repeat_interleave(topk) - token_indices = token_indices[topk_argsort_indices] -- group_sizes = _histogram(topk_indices.to(torch.int32), 0, num_experts - 1) -+ group_sizes = custom_histogram(topk_indices.to(torch.int32), 0, num_experts - 1) - -- # NOTE(woosuk): The GMM Pallas kernel requires a different weight layout -- # from HF Transformers. 
- w1 = w1.transpose(1, 2) - w2 = w2.transpose(1, 2) -- - x = hidden_states[token_indices] -- x = torch.ops.xla.gmm(x, w1, group_sizes) -+ x = custom_gmm(x, w1, group_sizes) - x = F.silu(x[..., :intermediate_size]) * x[..., intermediate_size:] -- x = torch.ops.xla.gmm(x, w2, group_sizes) -+ x = custom_gmm(x, w2, group_sizes) - x = x[topk_argsort_revert_indices].reshape(-1, topk, hidden_size) - - x = x * topk_weights.unsqueeze_(dim=-1) - x = x.sum(dim=-2) - x = x.reshape(orig_shape) - return x -+ -+def custom_histogram(indices, min, max): -+ bin_counts = torch.histc(indices, bins=max - min + 1, min=min, max=max).to(torch.int32) -+ return bin_counts -+ -+ -+def custom_gmm(x, w, group_sizes): -+ result = torch.zeros( -+ (x.shape[0], w.shape[-1]), -+ dtype=x.dtype, -+ device=x.device -+ ) -+ start = 0 -+ i = 0 -+ for end_index in group_sizes.tolist(): -+ if end_index > 0: -+ end = start + end_index -+ result[start:end] = torch.matmul(x[start:end], w[i]) -+ start = end -+ i += 1 -+ return result -diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py -index 43ea4eb5a..1b103365e 100644 ---- a/vllm/model_executor/layers/layernorm.py -+++ b/vllm/model_executor/layers/layernorm.py -@@ -136,11 +136,14 @@ class RMSNorm(CustomOp): - self.variance_epsilon, - ) - return x, residual -- return ops.rms_norm( -+ out = torch.empty_like(x) -+ ops.rms_norm( -+ out, - x, - self.weight.data, - self.variance_epsilon, - ) -+ return out - - def extra_repr(self) -> str: - s = f"hidden_size={self.weight.data.size(0)}" -diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py -index 33b221b99..e22b0c23b 100644 ---- a/vllm/model_executor/layers/linear.py -+++ b/vllm/model_executor/layers/linear.py -@@ -128,8 +128,8 @@ class UnquantizedLinearMethod(LinearMethodBase): - input_size_per_partition, - dtype=params_dtype), - requires_grad=False) -- set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) - layer.register_parameter("weight", weight) -+ set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) - set_weight_attrs(weight, extra_weight_attrs) - - def apply(self, -@@ -238,7 +238,9 @@ class ReplicatedLinear(LinearBase): - assert param.size() == loaded_weight.size() - param.data.copy_(loaded_weight) - -- def forward(self, x: torch.Tensor) -> torch.Tensor: -+ def forward( -+ self, x: torch.Tensor -+ ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: - bias = self.bias if not self.skip_bias_add else None - assert self.quant_method is not None - output = self.quant_method.apply(self, x, bias) -diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py -index 10bec75f4..606c796d5 100644 ---- a/vllm/model_executor/layers/mamba/mamba_mixer.py -+++ b/vllm/model_executor/layers/mamba/mamba_mixer.py -@@ -42,12 +42,14 @@ class MambaMixer(CustomOp): - use_rms_norm: bool, - rms_norm_has_weight: bool = True, - rms_norm_eps: float = 1e-5, -- activation="silu"): -+ activation="silu", -+ is_lora_enabled: bool = False): - super().__init__() - self.time_step_rank = time_step_rank - self.ssm_state_size = ssm_state_size - self.use_rms_norm = use_rms_norm - self.activation = activation -+ self.is_lora_enabled = is_lora_enabled - - self.conv1d = ColumnParallelLinear( - input_size=conv_kernel_size, -@@ -63,6 +65,7 @@ class MambaMixer(CustomOp): - self.in_proj = MergedColumnParallelLinear(hidden_size, - [intermediate_size] * 2, - bias=use_bias) -+ - # selective projection used to make dt, B and C input 
dependent - self.x_proj = RowParallelLinear( - intermediate_size, -@@ -170,7 +173,13 @@ class MambaMixer(CustomOp): - - # 3. State Space Model sequence transformation - # 3.a. input varying initialization of time_step, B and C -- ssm_parameters = self.x_proj(hidden_states.transpose(-2, -1))[0] -+ -+ if self.is_lora_enabled: -+ # lora kernel requires contiguous tensor -+ ssm_parameters = self.x_proj( -+ hidden_states.transpose(-2, -1).contiguous())[0] -+ else: -+ ssm_parameters = self.x_proj(hidden_states.transpose(-2, -1))[0] - - time_step, B, C = torch.split( - ssm_parameters, -@@ -222,6 +231,11 @@ class MambaMixer(CustomOp): - scan_outputs = scan_outputs.transpose(0, 1) - - # 4. Final linear projection -- contextualized_states = self.out_proj(scan_outputs.transpose(-2, -- -1))[0] -+ if self.is_lora_enabled: -+ # lora kernel requires contiguous tensor -+ contextualized_states = self.out_proj( -+ scan_outputs.transpose(-2, -1).contiguous())[0] -+ else: -+ contextualized_states = self.out_proj( -+ scan_outputs.transpose(-2, -1))[0] - return contextualized_states -diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py -index 73cc8ce0d..1d4e4bd52 100644 ---- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py -+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py -@@ -41,10 +41,12 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): - ) - - if current_platform.is_rocm(): -+ input_scale = getattr(layer, 'input_scale', None) -+ - weight, max_w_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz( - weight=weight, - weight_scale=max_w_scale, -- input_scale=layer.input_scale) -+ input_scale=input_scale) - if input_scale is not None: - layer.input_scale = Parameter(input_scale, - requires_grad=False) -@@ -57,11 +59,13 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): - weight = layer.weight - - if current_platform.is_rocm(): -+ input_scale = getattr(layer, 'input_scale', None) -+ - weight, weight_scale, input_scale = \ - normalize_e4m3fn_to_e4m3fnuz( - weight=weight, - weight_scale=layer.weight_scale, -- input_scale=layer.input_scale) -+ input_scale=input_scale) - if input_scale is not None: - layer.input_scale = Parameter(input_scale, - requires_grad=False) -@@ -76,7 +80,7 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): - raise ValueError(f"Unknown quantization strategy {self.strategy}") - - # INPUT SCALE -- if self.is_static_input_scheme: -+ if self.is_static_input_scheme and hasattr(layer, 'input_scale'): - layer.input_scale = Parameter(layer.input_scale.max(), - requires_grad=False) - else: -diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py -index 7f779ac8d..2fe22903a 100644 ---- a/vllm/model_executor/layers/quantization/fp8.py -+++ b/vllm/model_executor/layers/quantization/fp8.py -@@ -15,8 +15,6 @@ from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, - from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig, QuantizeMethodBase) - from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod --from vllm.model_executor.layers.quantization.utils.fp8_utils import ( -- apply_w8a8_block_fp8_linear) - from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( - 
apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin) - from vllm.model_executor.layers.quantization.utils.quant_utils import ( -@@ -337,6 +335,9 @@ class Fp8LinearMethod(LinearMethodBase): - size_k=layer.input_size_per_partition, - bias=bias) - -+ # Note: lazy import to avoid triton import error. -+ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( -+ apply_w8a8_block_fp8_linear) - if self.block_quant: - assert self.quant_config.weight_block_size is not None - return apply_w8a8_block_fp8_linear( -diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py -index abafad0f1..ab4f6ea1e 100644 ---- a/vllm/model_executor/layers/quantization/gptq.py -+++ b/vllm/model_executor/layers/quantization/gptq.py -@@ -217,6 +217,7 @@ class GPTQLinearMethod(LinearMethodBase): - - # exllama needs to shuffle the weight after the weight is loaded - # here we do the shuffle on first forward pass -+ ''' - if layer.exllama_state == ExllamaState.UNINITIALIZED: - if self.quant_config.desc_act: - layer.g_idx.data = torch.argsort(layer.g_idx).to(torch.int) -@@ -227,7 +228,7 @@ class GPTQLinearMethod(LinearMethodBase): - layer.exllama_state = ExllamaState.READY - ops.gptq_shuffle(layer.qweight, layer.g_idx, - self.quant_config.weight_bits) -- -+ ''' - def apply(self, - layer: torch.nn.Module, - x: torch.Tensor, -diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py -index c16a96213..fc5d21f40 100644 ---- a/vllm/model_executor/layers/quantization/ipex_quant.py -+++ b/vllm/model_executor/layers/quantization/ipex_quant.py -@@ -96,13 +96,14 @@ class IPEXConfig(QuantizationConfig): - @classmethod - def override_quantization_method(cls, hf_quant_cfg, - user_quant) -> Optional[str]: -- if not current_platform.is_cpu() and not current_platform.is_xpu(): -- return None -+ # not use IPEXConfig -+ # if not current_platform.is_cpu() and not current_platform.is_xpu(): -+ # return None - -- quant_method = hf_quant_cfg.get("quant_method", "").lower() -+ # quant_method = hf_quant_cfg.get("quant_method", "").lower() - -- if quant_method in ["awq", "gptq"]: -- return cls.get_name() -+ # if quant_method in ["awq", "gptq"]: -+ # return cls.get_name() - - return None - -diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py -index 97a1b0c96..f173cbde0 100644 ---- a/vllm/model_executor/layers/rejection_sampler.py -+++ b/vllm/model_executor/layers/rejection_sampler.py -@@ -39,7 +39,7 @@ class RejectionSampler(SpecDecodeStochasticBaseSampler): - strict_mode: Whether or not to perform shape/device/dtype checks - during sampling. This catches correctness issues but adds - nontrivial latency. -- use_falshinfer: We will use this parameter to determine whether -+ use_flashinfer: We will use this parameter to determine whether - to use the FlashInfer rejection sampling kernel or not. If it's - None, we will use the default value from the environment variable. - This parameter is only used for testing purposes. 
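
The fp8 hunk above moves `apply_w8a8_block_fp8_linear` into the call site precisely to avoid importing triton at module load, and the rejection-sampler hunk that follows guards the FlashInfer kernel in the same spirit. A minimal sketch of that optional-dependency pattern, assuming only the standard library plus an optionally installed `triton` (not taken from vLLM itself):

import importlib.util

def _has_triton() -> bool:
    # find_spec probes for the package without importing it, so loading this
    # module never fails on hosts (e.g. XPU machines) where triton is absent.
    return importlib.util.find_spec("triton") is not None

def scale_rows(rows, scale):
    if _has_triton():
        import triton  # noqa: F401  # imported lazily, only when actually present
        # a triton-backed kernel would be dispatched here
    # portable fallback, analogous to checking
    # `chain_speculative_sampling is not None` before using the FlashInfer path
    return [[value * scale for value in row] for row in rows]

print(scale_rows([[1.0, 2.0]], 0.5))  # [[0.5, 1.0]]
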
-@@ -118,7 +118,7 @@ class RejectionSampler(SpecDecodeStochasticBaseSampler): - - # If use Flashinfer chain_speculative_sampling kernel - # for rejection sampling -- if self.use_flashinfer: -+ if self.use_flashinfer and chain_speculative_sampling is not None: - batch_size, k, _ = draft_probs.shape - uniform_samples = self._create_uniform_samples( - seeded_seqs, batch_size, k, draft_probs.device) -diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py -index aae806f6a..a67713c32 100644 ---- a/vllm/model_executor/layers/resampler.py -+++ b/vllm/model_executor/layers/resampler.py -@@ -27,7 +27,7 @@ - Shared resampler perceiver network used in multimodal models and - related helpers for sincos positional embeddings. - --Example models: Qwen (Qwen-VL), Minicpmv2.0 -+Example models: Qwen (Qwen-VL), MiniCPM-V 2.0 - """ - import math - from functools import partial -@@ -37,7 +37,6 @@ import numpy as np - import torch - import torch.nn.functional as F - from torch import nn --from torch.nn.init import trunc_normal_ - - from vllm.model_executor.layers.linear import ReplicatedLinear - from vllm.model_executor.layers.quantization import QuantizationConfig -@@ -169,8 +168,8 @@ class BaseResampler(nn.Module): - self.embed_dim = embed_dim - self.num_heads = num_heads - -- self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim)) -- trunc_normal_(self.query, std=0.02) -+ self.query = nn.Parameter(torch.empty(self.num_queries, embed_dim)) -+ - if kv_dim is not None and kv_dim != embed_dim: - self.kv_proj = ReplicatedLinear(kv_dim, - embed_dim, -@@ -190,16 +189,7 @@ class BaseResampler(nn.Module): - self.ln_post = norm_layer(embed_dim) if do_post_projection else None - self.proj = nn.Parameter( - (embed_dim**-0.5) * -- torch.randn(embed_dim, embed_dim)) if do_post_projection else None -- -- def _init_weights(self, m: nn.Module) -> None: -- if isinstance(m, nn.Linear): -- trunc_normal_(m.weight, std=0.02) -- if isinstance(m, nn.Linear) and m.bias is not None: -- nn.init.constant_(m.bias, 0) -- elif isinstance(m, nn.LayerNorm): -- nn.init.constant_(m.bias, 0) -- nn.init.constant_(m.weight, 1.0) -+ torch.empty(embed_dim, embed_dim)) if do_post_projection else None - - def _repeat(self, query, N: int): - return query.unsqueeze(1).repeat(1, N, 1) -@@ -240,8 +230,6 @@ class Resampler2(BaseResampler): - self.pos_embed = nn.Parameter( - torch.from_numpy(pos_embed_arr).requires_grad_(False)) - -- self.apply(self._init_weights) -- - def forward( - self, - x: torch.Tensor, -diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py -index 117fe086e..47ff7b661 100644 ---- a/vllm/model_executor/layers/rotary_embedding.py -+++ b/vllm/model_executor/layers/rotary_embedding.py -@@ -541,19 +541,12 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module): - short_cache = self._compute_cos_sin_cache( - original_max_position_embeddings, short_factor, short_mscale) - short_cache = short_cache.to(dtype) -- self.register_buffer("short_cos_sin_cache", -- short_cache, -- persistent=False) - - long_cache = self._compute_cos_sin_cache(max_position_embeddings, - long_factor, long_mscale) - long_cache = long_cache.to(dtype) -- self.register_buffer("long_cos_sin_cache", -- long_cache, -- persistent=False) - -- long_short_cache = torch.cat( -- [self.short_cos_sin_cache, self.long_cos_sin_cache], dim=0) -+ long_short_cache = torch.cat([short_cache, long_cache], dim=0) - self.register_buffer("long_short_cos_sin_cache", - long_short_cache, - 
persistent=False) -@@ -593,8 +586,6 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module): - torch.full_like(positions, k)).long() - idx = (torch.add(positions, long_prompt_offset) - if long_prompt_offset is not None else positions) -- self.long_short_cos_sin_cache: torch.Tensor = ( -- self.long_short_cos_sin_cache.to(idx.device)) - idx = torch.add(idx, offsets) if offsets is not None else idx - cos_sin = torch.index_select(self.long_short_cos_sin_cache, 0, idx) - -@@ -652,7 +643,7 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding): - - def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor: - pos_freqs = self.base**(torch.arange( -- 0, self.rotary_dim, 2, dtype=torch.float, device="cuda") / -+ 0, self.rotary_dim, 2, dtype=torch.float) / - self.rotary_dim) - inv_freq_extrapolation = 1.0 / pos_freqs - inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs) -@@ -671,13 +662,11 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding): - def _compute_cos_sin_cache(self) -> torch.Tensor: - inv_freq = self._compute_inv_freq(self.scaling_factor) - t = torch.arange(self.max_position_embeddings * self.scaling_factor, -- device="cuda", - dtype=torch.float32) - freqs = torch.einsum("i,j -> ij", t, inv_freq) - cos = (freqs.cos() * self.mscale) - sin = (freqs.sin() * self.mscale) - cache = torch.cat((cos, sin), dim=-1) -- print("Cache shape", cache.shape) - return cache - - def forward( -diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py -index 30548e656..a95aeb157 100644 ---- a/vllm/model_executor/layers/vocab_parallel_embedding.py -+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py -@@ -30,8 +30,8 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase): - input_size_per_partition, - dtype=params_dtype), - requires_grad=False) -- set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) - layer.register_parameter("weight", weight) -+ set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) - set_weight_attrs(weight, extra_weight_attrs) - - def apply(self, -diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py -index f2d9293b3..b3d7d6977 100644 ---- a/vllm/model_executor/model_loader/loader.py -+++ b/vllm/model_executor/model_loader/loader.py -@@ -11,7 +11,8 @@ import os - import warnings - from abc import ABC, abstractmethod - from contextlib import contextmanager --from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple, cast -+from typing import (Any, Callable, Dict, Generator, Iterable, List, Optional, -+ Tuple, cast) - - import gguf - import huggingface_hub -@@ -706,6 +707,8 @@ class BitsAndBytesModelLoader(BaseModelLoader): - # Store all module names (from transformers) that support - # BNB quantization. - self.target_modules: List[str] = [] -+ # mapping weight names from transformers to vllm. -+ self.weight_mapper: Callable = lambda name: name - - def _get_weight_files( - self, -@@ -763,9 +766,12 @@ class BitsAndBytesModelLoader(BaseModelLoader): - - def _hf_weight_iter(self, hf_weights_files, use_safetensors: bool): - if use_safetensors: -- return safetensors_weights_iterator(hf_weights_files) -+ iterator = safetensors_weights_iterator(hf_weights_files) - else: -- return pt_weights_iterator(hf_weights_files) -+ iterator = pt_weights_iterator(hf_weights_files) -+ for name, param in iterator: -+ # mapping weight names from transformers to vllm. 
-+ yield self.weight_mapper(name), param - - def _get_quantized_weights_iterator( - self, -@@ -782,12 +788,12 @@ class BitsAndBytesModelLoader(BaseModelLoader): - try: - import bitsandbytes - -- if bitsandbytes.__version__ < "0.44.0": -+ if bitsandbytes.__version__ < "0.45.0": - raise ImportError("bitsandbytes version is wrong. Please " -- "install bitsandbytes>=0.44.0.") -+ "install bitsandbytes>=0.45.0.") - except ImportError as err: -- raise ImportError("Please install bitsandbytes>=0.44.0 via " -- "`pip install bitsandbytes>=0.44.0` to use " -+ raise ImportError("Please install bitsandbytes>=0.45.0 via " -+ "`pip install bitsandbytes>=0.45.0` to use " - "bitsandbytes quantizer.") from err - - hf_weights_files, use_safetensors = self._prepare_weights( -@@ -991,12 +997,15 @@ class BitsAndBytesModelLoader(BaseModelLoader): - if isinstance(module, (LinearBase, )): - last_name = name.split(".")[-1] - if sub_modules := inverse_stacked_mapping.get(last_name, []): -- # Map vllm's names to transformers' names. -+ # Map vllm's names to transformers's names. - for sub_name in sub_modules: - self.target_modules.append( - name.replace(last_name, sub_name)) -- else: -- self.target_modules.append(name) -+ # Add original module name even if the module has stacked map, -+ # in case model has a mixture of disk-merged and disk-splitted -+ # weights with same last name. -+ self.target_modules.append(name) -+ - assert (self.target_modules - ), "vllm currently does not support BNB quantization for" - f" {type(model).__name__}" -@@ -1013,6 +1022,10 @@ class BitsAndBytesModelLoader(BaseModelLoader): - f"Model {type(model).__name__} does not support BitsAndBytes " - "quantization yet.") - -+ # For some models like Molmo, we need to use hf_to_vllm_mapper -+ # to ensure correct loading of weights. -+ if hf_to_vllm_mapper := getattr(model, "hf_to_vllm_mapper", None): -+ self.weight_mapper = lambda name: hf_to_vllm_mapper._map_name(name) - # Modules whose weights might have fused on disk - # we need their output_sizes to make shard in flight correctly with TP - self.maybe_fused_weights_modules: Dict[str, List[int]] = {} -@@ -1338,9 +1351,41 @@ class RunaiModelStreamerLoader(BaseModelLoader): - return model.eval() - - -+class IPEXLLMLowBitLoader(BaseModelLoader): -+ def __init__(self, load_config: LoadConfig): -+ super().__init__(load_config) -+ logger.info("IPEXLLMLowBitLoader get selected. 
Ensure your model is converted before.") -+ if load_config.model_loader_extra_config: -+ raise ValueError(f"Model loader extra config is not supported for " -+ f"load format {load_config.load_format}") -+ -+ def download_model(self, model_config: ModelConfig) -> None: -+ """Download a model so that it can be immediately loaded.""" -+ raise ValueError(f"IPEXLLMLowBitLoader does not support " -+ f"download_model api.") -+ -+ def load_model(self, vllm_config: VllmConfig) -> nn.Module: -+ model_config = vllm_config.model_config -+ -+ from ipex_llm.optimize import low_memory_init, load_low_bit -+ with set_default_torch_dtype(model_config.dtype): -+ # Initialize an empty skeleton of the model -+ with low_memory_init(): -+ model = _initialize_model(vllm_config=vllm_config) -+ # Load the real weights from the config -+ local_rank = os.environ["LOCAL_RANK"] -+ load_path = os.path.join(model_config.low_bit_model_path, -+ str(local_rank)) -+ model = load_low_bit(model, load_path) -+ return model -+ -+ - def get_model_loader(load_config: LoadConfig) -> BaseModelLoader: - """Get a model loader based on the load format.""" - -+ if load_config.use_low_bit_loader: -+ return IPEXLLMLowBitLoader(load_config) -+ - if isinstance(load_config.load_format, type): - return load_config.load_format(load_config) - -diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py -index 8aa0c98df..34c86a23a 100644 ---- a/vllm/model_executor/model_loader/weight_utils.py -+++ b/vllm/model_executor/model_loader/weight_utils.py -@@ -93,7 +93,7 @@ def convert_bin_to_safetensor_file( - pt_filename: str, - sf_filename: str, - ) -> None: -- loaded = torch.load(pt_filename, map_location="cpu") -+ loaded = torch.load(pt_filename, map_location="cpu", weights_only=True) - if "state_dict" in loaded: - loaded = loaded["state_dict"] - shared = _shared_pointers(loaded) -@@ -381,7 +381,9 @@ def np_cache_weights_iterator( - disable=not enable_tqdm, - bar_format=_BAR_FORMAT, - ): -- state = torch.load(bin_file, map_location="cpu") -+ state = torch.load(bin_file, -+ map_location="cpu", -+ weights_only=True) - for name, param in state.items(): - param_path = os.path.join(np_folder, name) - with open(param_path, "wb") as f: -@@ -447,7 +449,7 @@ def pt_weights_iterator( - disable=not enable_tqdm, - bar_format=_BAR_FORMAT, - ): -- state = torch.load(bin_file, map_location="cpu") -+ state = torch.load(bin_file, map_location="cpu", weights_only=True) - yield from state.items() - del state - torch.cuda.empty_cache() -diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py -index 9437ad968..2e649f10c 100644 ---- a/vllm/model_executor/models/aria.py -+++ b/vllm/model_executor/models/aria.py -@@ -1,15 +1,13 @@ --import math --from typing import Iterable, List, Optional, Set, Tuple, TypedDict, Union -+from typing import (Callable, Iterable, List, Mapping, Optional, Set, Tuple, -+ TypedDict, Union) - - import torch - import torch.nn as nn --from torch.nn.init import trunc_normal_ --from transformers import LlamaConfig -+from transformers import BatchFeature, PretrainedConfig - - from vllm.attention import AttentionMetadata - from vllm.config import CacheConfig, QuantizationConfig, VllmConfig - from vllm.distributed import get_tensor_model_parallel_rank --from vllm.inputs import INPUT_REGISTRY, token_inputs - from vllm.model_executor.layers.activation import get_act_fn - from vllm.model_executor.layers.fused_moe import FusedMoE - from vllm.model_executor.layers.linear 
import (ColumnParallelLinear, -@@ -17,30 +15,28 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, - from vllm.model_executor.layers.logits_processor import LogitsProcessor - from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - get_compressed_tensors_cache_scale) --from vllm.model_executor.layers.sampler import (Sampler, SamplerOutput, -- SamplingMetadata) -+from vllm.model_executor.layers.sampler import (SamplerOutput, -+ SamplingMetadata, get_sampler) - from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead - from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, maybe_remap_kv_scale_name) --from vllm.model_executor.models.idefics2_vision_model import ( -- Idefics2VisionTransformer) --from vllm.model_executor.models.interfaces import SupportsMultiModal --from vllm.model_executor.models.llama import (LlamaDecoderLayer, LlamaMLP, -- LlamaModel) --from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, -- is_pp_missing_parameter, -- maybe_prefix, -- merge_multimodal_embeddings) - from vllm.multimodal import MULTIMODAL_REGISTRY --from vllm.multimodal.image import cached_get_image_processor --from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors --from vllm.multimodal.utils import (cached_get_tokenizer, -- repeat_and_pad_placeholder_tokens) -+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, -+ NestedTensors) -+from vllm.multimodal.processing import (BaseMultiModalProcessor, -+ MultiModalDataItems, ProcessingMixin, -+ PromptReplacement) -+from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs - from vllm.sequence import IntermediateTensors - from vllm.transformers_utils.configs.aria import (AriaMoELMConfig, - AriaVisionConfig) - --from .utils import flatten_bn -+from .idefics2_vision_model import Idefics2VisionTransformer -+from .interfaces import SupportsMultiModal -+from .llama import LlamaDecoderLayer, LlamaMLP, LlamaModel -+from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, -+ is_pp_missing_parameter, maybe_prefix, -+ merge_multimodal_embeddings) - - - class AriaImagePixelInputs(TypedDict): -@@ -90,8 +86,8 @@ class AriaVisionModel(nn.Module): - def forward( - self, - pixel_values: torch.Tensor, -- pixel_mask: Optional[torch.BoolTensor] = None, -- ) -> Tuple[torch.Tensor, Optional[torch.BoolTensor]]: -+ pixel_mask: Optional[torch.Tensor] = None, -+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - patch_attention_mask = self._create_patch_attention_mask(pixel_mask) - - vit_oup = self.vision_model( -@@ -103,7 +99,8 @@ class AriaVisionModel(nn.Module): - - return vit_oup, image_atts - -- def _create_patch_attention_mask(self, pixel_mask): -+ def _create_patch_attention_mask( -+ self, pixel_mask: Optional[torch.Tensor]) -> torch.Tensor: - if pixel_mask is None: - return None - -@@ -118,7 +115,8 @@ class AriaVisionModel(nn.Module): - ) - return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() - -- def _create_image_attention_mask(self, patch_attention_mask): -+ def _create_image_attention_mask( -+ self, patch_attention_mask: torch.Tensor) -> torch.Tensor: - if patch_attention_mask is None: - return None - -@@ -128,13 +126,13 @@ class AriaVisionModel(nn.Module): - - class FFN(nn.Module): - -- def __init__(self, embed_dim, ff_dim, output_dim): -+ def __init__(self, embed_dim: int, ff_dim: int, output_dim: int) -> None: - super().__init__() - self.linear_in = ColumnParallelLinear(embed_dim, ff_dim, 
bias=False) - self.linear_out = RowParallelLinear(ff_dim, output_dim, bias=False) - self.act = get_act_fn("gelu_new") - -- def forward(self, hidden_states): -+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states, _ = self.linear_in(hidden_states) - hidden_states = self.act(hidden_states) - hidden_states, _ = self.linear_out(hidden_states) -@@ -143,7 +141,7 @@ class FFN(nn.Module): - - class CrossAttention(nn.Module): - -- def __init__(self, kv_dim, embed_dim, num_heads, drop_out_rate=0): -+ def __init__(self, kv_dim: int, embed_dim: int, num_heads: int) -> None: - super().__init__() - self.num_heads = num_heads - self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False) -@@ -152,12 +150,16 @@ class CrossAttention(nn.Module): - - self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) - self.linear = nn.Linear(embed_dim, embed_dim) -- self.dropout = nn.Dropout(drop_out_rate) - - self.layer_norm = nn.LayerNorm(embed_dim) - self.ln_kv = nn.LayerNorm(kv_dim) - -- def forward(self, x, hidden_states, attn_mask=None, add_residual=False): -+ def forward( -+ self, -+ x: torch.Tensor, -+ hidden_states: torch.Tensor, -+ attn_mask: Optional[torch.Tensor] = None, -+ ) -> torch.Tensor: - normed_hidden_states = self.layer_norm(hidden_states) - query = self.q_proj(normed_hidden_states).permute(1, 0, 2) - -@@ -172,11 +174,7 @@ class CrossAttention(nn.Module): - - attn_output = attn_output.permute(1, 0, 2) - -- if add_residual: -- attn_output = hidden_states + self.dropout( -- self.linear(attn_output)) -- else: -- attn_output = self.dropout(self.linear(attn_output)) -+ attn_output = self.linear(attn_output) - - return attn_output - -@@ -204,30 +202,32 @@ class AriaProjector(nn.Module): - - def __init__( - self, -- patch_to_query_dict, -- embed_dim, -- num_heads, -- kv_dim, -- ff_dim, -- output_dim, -- norm_layer=nn.LayerNorm, -- ): -+ patch_to_query_dict: dict[int, int], -+ embed_dim: int, -+ num_heads: int, -+ kv_dim: int, -+ ff_dim: int, -+ output_dim: int, -+ norm_layer: Callable[[int], nn.Module] = nn.LayerNorm, -+ ) -> None: - super().__init__() - self.patch_to_query_dict = patch_to_query_dict - self.embed_dim = embed_dim - self.num_heads = num_heads - - self.query = nn.Parameter( -- torch.zeros(max(patch_to_query_dict.values()), self.embed_dim)) -- -- trunc_normal_(self.query, std=0.02) -+ torch.empty(max(patch_to_query_dict.values()), self.embed_dim)) - - self.cross_attn = CrossAttention(kv_dim, embed_dim, num_heads) - - self.ln_ffn = norm_layer(embed_dim) - self.ffn = FFN(embed_dim, ff_dim, output_dim) - -- def forward(self, x, attn_mask=None): -+ def forward( -+ self, -+ x: torch.Tensor, -+ attn_mask: Optional[torch.Tensor] = None, -+ ) -> torch.Tensor: - bs = x.shape[0] - queries = self.query.unsqueeze(0).repeat(bs, 1, 1) - -@@ -251,7 +251,7 @@ class AriaProjector(nn.Module): - class AriaFusedMoE(FusedMoE): - - def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, -- shard_id: str) -> Set[str]: -+ shard_id: str) -> None: - # Override the weight_loader to handle the expert weights in the Aria - # model, which are already packed with experts, and merge the gate and - # up weights for each expert. 
-@@ -346,7 +346,7 @@ class MoEDecoderLayer(LlamaDecoderLayer): - - def __init__( - self, -- config: LlamaConfig, -+ config: AriaMoELMConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", -@@ -434,7 +434,7 @@ class AriaMoELMModel(LlamaModel): - return loaded_params - - --def build_mm_projector(config): -+def build_mm_projector(config: PretrainedConfig): - return AriaProjector( - patch_to_query_dict=config.projector_patch_to_query_dict, - embed_dim=config.vision_config.hidden_size, -@@ -445,75 +445,89 @@ def build_mm_projector(config): - ) - - --def get_max_multimodal_tokens(ctx): -- return max(ctx.model_config.hf_config.image_size2tokens.values()) -+class AriaProcessingMixin(ProcessingMixin): - -+ def _get_hf_config(self): -+ return self.ctx.get_hf_config() - --def input_mapper_for_aria(ctx, data): -- return MultiModalKwargs(data) -+ def _get_vision_config(self) -> AriaVisionConfig: -+ return self._get_hf_config().vision_config - -+ def _get_num_image_tokens(self) -> int: -+ hf_config = self._get_hf_config() -+ return max(hf_config.projector_patch_to_query_dict.values()) - --def input_processor(ctx, llm_inputs): -- multi_modal_data = llm_inputs.get("multi_modal_data") -- # if it is pure text input, use it as is -- if multi_modal_data is None or "image" not in multi_modal_data: -- return llm_inputs - -- model_config = ctx.model_config -+class AriaProfilingInfo(AriaProcessingMixin, BaseProfilingInfo): - -- tokenizer = cached_get_tokenizer(model_config.tokenizer) -- image_processor = cached_get_image_processor( -- model_config.model, trust_remote_code=model_config.trust_remote_code) -- hf_config = model_config.hf_config -+ def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: -+ return {"image": None} - -- # prepare image tokens, the max_image_size is used to determine the number -- # of patch_size for every image -- max_image_size = multi_modal_data.pop("max_image_size", 980) -- _split_image = multi_modal_data.pop("split_image", False) -+ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: -+ return {"image": self._get_num_image_tokens()} - -- assert isinstance(max_image_size, -- (int, float)), "max_image_size should be float or int" -- images = (multi_modal_data["image"] if isinstance( -- multi_modal_data["image"], list) else [multi_modal_data["image"]]) -+ def get_dummy_processor_inputs( -+ self, -+ seq_len: int, -+ mm_counts: Mapping[str, int], -+ ) -> ProcessorInputs: -+ vision_config = self._get_vision_config() -+ -+ max_image_size = vision_config.image_size -+ num_images = mm_counts.get("image", 0) -+ -+ mm_data = { -+ "image": -+ self._get_dummy_images(width=max_image_size, -+ height=max_image_size, -+ num_images=num_images) -+ } - -- image_inputs = image_processor.preprocess(images, -- max_image_size=max_image_size, -- split_image=_split_image, -- return_tensors="pt").data -- image_inputs['pixel_values'] = image_inputs['pixel_values'].to( -- ctx.model_config.dtype) -- num_crops = image_inputs.pop("num_crops") -+ hf_processor = self._get_hf_processor() -+ image_token: str = hf_processor.image_token # type: ignore - -- prompt_token_ids = llm_inputs["prompt_token_ids"] -- if num_crops.sum().item() > 0: -- _, prompt_token_ids, _ = repeat_and_pad_placeholder_tokens( -- tokenizer, -- None, -- prompt_token_ids, -- placeholder_token_id=hf_config.image_token_index, -- repeat_count=num_crops, -+ return ProcessorInputs( -+ prompt_text=image_token * num_images, -+ mm_data=mm_data, - ) - -- 
repeat_count = [hf_config.image_size2tokens[max_image_size] -- ] * sum(num_crops).item() -- new_prompt, new_token_ids, _ = repeat_and_pad_placeholder_tokens( -- tokenizer, -- None, -- prompt_token_ids, -- placeholder_token_id=hf_config.image_token_index, -- repeat_count=repeat_count, -- ) - -- return token_inputs( -- prompt_token_ids=new_token_ids, -- prompt=new_prompt, -- multi_modal_data={"image": image_inputs}, -- ) -+class AriaMultiModalProcessor(AriaProcessingMixin, BaseMultiModalProcessor): -+ -+ def _get_profiling_info(self) -> BaseProfilingInfo: -+ return AriaProfilingInfo(self.ctx) -+ -+ def _get_mm_fields_config( -+ self, -+ hf_inputs: BatchFeature, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ ) -> Mapping[str, MultiModalFieldConfig]: -+ return dict( -+ pixel_values=MultiModalFieldConfig.batched("image"), -+ pixel_mask=MultiModalFieldConfig.batched("image"), -+ ) -+ -+ def _get_prompt_replacements( -+ self, -+ mm_items: MultiModalDataItems, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ out_mm_kwargs: MultiModalKwargs, -+ ) -> list[PromptReplacement]: -+ hf_config = self._get_hf_config() -+ image_token_id = hf_config.image_token_index -+ -+ num_image_tokens = self._get_num_image_tokens() -+ -+ return [ -+ PromptReplacement( -+ modality="image", -+ target=[image_token_id], -+ replacement=[image_token_id] * num_image_tokens, -+ ) -+ ] - - --@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_multimodal_tokens) --@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_aria) --@INPUT_REGISTRY.register_input_processor(input_processor) -+@MULTIMODAL_REGISTRY.register_processor(AriaMultiModalProcessor) - class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): - """ - Aria model for conditional generation tasks. -@@ -540,12 +554,6 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - -- # prepare the image_size to tokens mapping for the image preprocess, see -- # input_processor -- config.image_size2tokens = { -- int(math.sqrt(k) * config.vision_config.patch_size): v -- for k, v in config.projector_patch_to_query_dict.items() -- } - self.config = config - self.vision_tower = AriaVisionModel(config.vision_config) - self.multi_modal_projector = build_mm_projector(config) -@@ -566,7 +574,7 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): - logit_scale = getattr(config, "logit_scale", 1.0) - self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, - self.vocab_size, logit_scale) -- self.sampler = Sampler() -+ self.sampler = get_sampler() - - def _validate_image_sizes( - self, images: List[torch.Tensor]) -> List[torch.Tensor]: -@@ -588,7 +596,12 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): - - pixel_values = self._validate_image_sizes(pixel_values) - pixel_values = flatten_bn(pixel_values, concat=True) -+ - if pixel_mask is not None: -+ if not isinstance(pixel_mask, (torch.Tensor, list)): -+ raise ValueError("Incorrect type of pixel mask. 
" -+ f"Got type: {type(pixel_mask)}") -+ - pixel_mask = flatten_bn(pixel_mask, concat=True) - - return AriaImagePixelInputs( -diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py -index 42a239cad..987dfaf44 100644 ---- a/vllm/model_executor/models/blip.py -+++ b/vllm/model_executor/models/blip.py -@@ -4,22 +4,16 @@ from typing import Iterable, Optional, Set, Tuple, Union - - import torch - import torch.nn as nn --from PIL import Image - from transformers import Blip2VisionConfig, BlipVisionConfig - - from vllm.attention.layer import MultiHeadAttention --from vllm.config import ModelConfig - from vllm.distributed import divide, get_tensor_model_parallel_world_size --from vllm.inputs import DecoderOnlyInputs, token_inputs - from vllm.model_executor.layers.activation import get_act_fn - from vllm.model_executor.layers.linear import (ColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) - from vllm.model_executor.layers.quantization import QuantizationConfig - from vllm.model_executor.model_loader.weight_utils import default_weight_loader --from vllm.multimodal.utils import (cached_get_tokenizer, -- repeat_and_pad_placeholder_tokens) --from vllm.sequence import SequenceData - - - def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int: -@@ -33,92 +27,6 @@ def get_blip_num_patches(*, image_size: int, patch_size: int) -> int: - return grid_length * grid_length - - --def get_blip_image_feature_size( -- hf_config: Union[BlipVisionConfig, Blip2VisionConfig]) -> int: -- return get_blip_num_patches(image_size=hf_config.image_size, -- patch_size=hf_config.patch_size) -- -- --def get_max_blip_image_tokens( -- hf_config: Union[BlipVisionConfig, Blip2VisionConfig]) -> int: -- return get_blip_image_feature_size(hf_config) -- -- --def dummy_seq_data_for_blip( -- hf_config: Union[BlipVisionConfig, Blip2VisionConfig], -- seq_len: int, -- num_images: int, -- *, -- image_token_id: int, -- image_feature_size_override: Optional[int] = None, --): -- if image_feature_size_override is None: -- image_feature_size = get_blip_image_feature_size(hf_config) -- else: -- image_feature_size = image_feature_size_override -- -- return SequenceData.from_prompt_token_counts( -- (image_token_id, image_feature_size * num_images), -- (0, seq_len - image_feature_size * num_images), -- ) -- -- --def dummy_image_for_blip( -- hf_config: Union[BlipVisionConfig, Blip2VisionConfig], -- num_images: int, -- *, -- image_width_override: Optional[int] = None, -- image_height_override: Optional[int] = None, --): -- width = height = hf_config.image_size -- if image_width_override is not None: -- width = image_width_override -- if image_height_override is not None: -- height = image_height_override -- -- image = Image.new("RGB", (width, height), color=0) -- return {"image": image if num_images == 1 else [image] * num_images} -- -- --def input_processor_for_blip( -- model_config: ModelConfig, -- hf_config: Union[BlipVisionConfig, Blip2VisionConfig], -- inputs: DecoderOnlyInputs, -- *, -- image_token_id: int, -- image_feature_size_override: Optional[int] = None, --): -- multi_modal_data = inputs.get("multi_modal_data") -- if multi_modal_data is None or "image" not in multi_modal_data: -- return inputs -- -- if "multi_modal_placeholders" in inputs and "image" in inputs[ -- "multi_modal_placeholders"]: -- # The inputs already have placeholders. 
-- return inputs -- -- tokenizer = cached_get_tokenizer(model_config.tokenizer) -- -- if image_feature_size_override is None: -- image_feature_size = get_blip_image_feature_size(hf_config) -- else: -- image_feature_size = image_feature_size_override -- -- new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( -- tokenizer, -- inputs.get("prompt"), -- inputs["prompt_token_ids"], -- placeholder_token_id=image_token_id, -- repeat_count=image_feature_size, -- ) -- -- # NOTE: Create a defensive copy of the original inputs -- return token_inputs(prompt_token_ids=new_token_ids, -- prompt=new_prompt, -- multi_modal_data=multi_modal_data, -- multi_modal_placeholders={"image": ranges}) -- -- - # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/blip/modeling_blip.py#L164 # noqa - class BlipVisionEmbeddings(nn.Module): - -diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py -index 76b8505ee..fd45783f1 100644 ---- a/vllm/model_executor/models/blip2.py -+++ b/vllm/model_executor/models/blip2.py -@@ -4,32 +4,33 @@ from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - - import torch - import torch.nn as nn --from transformers import (Blip2Config, Blip2QFormerConfig, Blip2VisionConfig, -+from transformers import (BatchFeature, Blip2Config, Blip2QFormerConfig, - apply_chunking_to_forward) - - from vllm.attention import AttentionMetadata - from vllm.config import CacheConfig, VllmConfig --from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, -- InputContext, token_inputs) - from vllm.model_executor.layers.activation import get_act_fn - from vllm.model_executor.layers.quantization import QuantizationConfig - from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler - from vllm.model_executor.sampling_metadata import SamplingMetadata - from vllm.multimodal import MULTIMODAL_REGISTRY --from vllm.multimodal.inputs import NestedTensors --from vllm.multimodal.utils import consecutive_placeholder_ranges --from vllm.sequence import IntermediateTensors, SequenceData -- --from .blip import (BlipVisionModel, dummy_image_for_blip, -- get_max_blip_image_tokens) -+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, -+ MultiModalInputsV2, MultiModalKwargs, -+ NestedTensors, PlaceholderRange) -+from vllm.multimodal.processing import (BaseMultiModalProcessor, -+ MultiModalDataItems, ProcessingMixin, -+ PromptReplacement) -+from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs -+from vllm.sequence import IntermediateTensors -+ -+from .blip import BlipVisionModel - from .interfaces import SupportsMultiModal, SupportsPP - from .utils import (AutoWeightsLoader, init_vllm_registered_model, - maybe_prefix, merge_multimodal_embeddings) - - # We use this internally as placeholders since there is no image token - # defined on the HuggingFace repo --BLIP2_IMAGE_TOKEN = "" --BLIP2_IMAGE_TOKEN_ID = 50265 -+_IMAGE_TOKEN_ID = 50265 - - - class Blip2ImagePixelInputs(TypedDict): -@@ -396,92 +397,101 @@ class Blip2QFormerModel(nn.Module): - return sequence_output - - --def get_blip2_image_feature_size(hf_config: Blip2Config) -> int: -- return hf_config.num_query_tokens -- -- --def get_max_blip2_image_tokens(ctx: InputContext): -- hf_config = ctx.get_hf_config(Blip2Config) -- vision_config = hf_config.vision_config -- -- if isinstance(vision_config, Blip2VisionConfig): -- return get_max_blip_image_tokens(vision_config) -- -- msg = f"Unsupported 
vision config: {type(vision_config)}" -- raise NotImplementedError(msg) -- -- --def dummy_seq_data_for_blip2( -- hf_config: Blip2Config, -- seq_len: int, -- num_images: int, -- *, -- image_token_id: int, -- image_feature_size_override: Optional[int] = None, --): -- if image_feature_size_override is None: -- image_feature_size = get_blip2_image_feature_size(hf_config) -- else: -- image_feature_size = image_feature_size_override -- -- return SequenceData.from_prompt_token_counts( -- (image_token_id, image_feature_size * num_images), -- (0, seq_len - image_feature_size * num_images), -- ), { -- "image": -- consecutive_placeholder_ranges(num_items=num_images, -- item_size=image_feature_size) -- } -+class Blip2ProcessingMixin(ProcessingMixin): - -+ def _get_hf_config(self): -+ return self.ctx.get_hf_config(Blip2Config) - --def dummy_data_for_blip2(ctx: InputContext, seq_len: int, -- mm_counts: Mapping[str, int]): -- hf_config = ctx.get_hf_config(Blip2Config) -- vision_config = hf_config.vision_config -- num_images = mm_counts["image"] -+ def _get_num_image_tokens(self) -> int: -+ hf_config = self._get_hf_config() -+ return hf_config.num_query_tokens - -- seq_data, ranges = dummy_seq_data_for_blip2( -- hf_config, -- seq_len, -- num_images, -- image_token_id=BLIP2_IMAGE_TOKEN_ID, -- ) - -- if isinstance(vision_config, Blip2VisionConfig): -- mm_data = dummy_image_for_blip(vision_config, num_images) -+class Blip2ProfilingInfo(Blip2ProcessingMixin, BaseProfilingInfo): - -- return DummyData(seq_data, mm_data, ranges) -+ def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: -+ return {"image": 1} - -- msg = f"Unsupported vision config: {type(vision_config)}" -- raise NotImplementedError(msg) -+ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: -+ return {"image": self._get_num_image_tokens()} - -+ def get_dummy_processor_inputs( -+ self, -+ seq_len: int, -+ mm_counts: Mapping[str, int], -+ ) -> ProcessorInputs: -+ hf_config = self._get_hf_config() -+ vision_config = hf_config.vision_config -+ -+ max_image_size = vision_config.image_size -+ num_images = mm_counts.get("image", 0) -+ -+ mm_data = { -+ "image": -+ self._get_dummy_images(width=max_image_size, -+ height=max_image_size, -+ num_images=num_images) -+ } -+ -+ return ProcessorInputs( -+ prompt_text="", -+ mm_data=mm_data, -+ ) - --def input_processor_for_blip2(ctx: InputContext, inputs: DecoderOnlyInputs): -- multi_modal_data = inputs.get("multi_modal_data") -- if multi_modal_data is None or "image" not in multi_modal_data: -- return inputs -- -- hf_config = ctx.get_hf_config(Blip2Config) -- image_feature_size = get_blip2_image_feature_size(hf_config) - -- # The original model places image tokens at the front -- # https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/blip_2/modeling_blip_2.py#L1514 -- new_token_ids = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size -- new_token_ids += inputs["prompt_token_ids"] -+class Blip2MultiModalProcessor(Blip2ProcessingMixin, BaseMultiModalProcessor): - -- new_prompt = inputs.get("prompt") -- if new_prompt is not None: -- new_prompt = BLIP2_IMAGE_TOKEN * image_feature_size + new_prompt -+ def _get_profiling_info(self) -> BaseProfilingInfo: -+ return Blip2ProfilingInfo(self.ctx) - -- return token_inputs(prompt_token_ids=new_token_ids, -- prompt=new_prompt, -- multi_modal_data=multi_modal_data) -+ def _get_mm_fields_config( -+ self, -+ hf_inputs: BatchFeature, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ ) -> Mapping[str, MultiModalFieldConfig]: 
-+ return dict( -+ pixel_values=MultiModalFieldConfig.batched("image"), -+ image_embeds=MultiModalFieldConfig.batched("image"), -+ ) - -+ def _get_prompt_replacements( -+ self, -+ mm_items: MultiModalDataItems, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ out_mm_kwargs: MultiModalKwargs, -+ ) -> list[PromptReplacement]: -+ num_image_tokens = self._get_num_image_tokens() -+ -+ return [ -+ PromptReplacement( -+ modality="image", -+ target="", -+ replacement="" * num_image_tokens + "", -+ ) -+ ] - --@MULTIMODAL_REGISTRY.register_image_input_mapper() --@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_blip2_image_tokens) --@INPUT_REGISTRY.register_dummy_data(dummy_data_for_blip2) --@INPUT_REGISTRY.register_input_processor(input_processor_for_blip2) -+ def apply( -+ self, -+ prompt_text: str, -+ mm_data: MultiModalDataDict, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ ) -> MultiModalInputsV2: -+ result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) -+ -+ # Only tokens should be considered as placeholders, -+ # so we ignore the trailing bos_token -+ result["mm_placeholders"] = { -+ modality: [ -+ PlaceholderRange(offset=p["offset"], length=p["length"] - 1) -+ for p in ps -+ ] -+ for modality, ps in result["mm_placeholders"].items() -+ } -+ -+ return result -+ -+ -+@MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor) - class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): -@@ -627,7 +637,7 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - if multimodal_embeddings is not None: - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, -- BLIP2_IMAGE_TOKEN_ID) -+ _IMAGE_TOKEN_ID) - return inputs_embeds - - def forward( -diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py -index a40c321ce..73ed73b61 100644 ---- a/vllm/model_executor/models/chameleon.py -+++ b/vllm/model_executor/models/chameleon.py -@@ -3,16 +3,14 @@ from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Set, - Tuple, TypedDict, Union) - - import torch -+import torch.nn as nn - import torch.nn.functional as F --from PIL import Image --from torch import nn --from transformers import ChameleonConfig, ChameleonVQVAEConfig -+from transformers import (BatchFeature, ChameleonConfig, ChameleonProcessor, -+ ChameleonVQVAEConfig) - - from vllm.attention import Attention, AttentionMetadata - from vllm.config import CacheConfig, VllmConfig - from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size --from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, -- InputContext, token_inputs) - from vllm.model_executor.layers.activation import SiluAndMul - from vllm.model_executor.layers.layernorm import RMSNorm - from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, -@@ -29,11 +27,14 @@ from vllm.model_executor.model_loader.weight_utils import ( - from vllm.model_executor.sampling_metadata import SamplingMetadata - from vllm.model_executor.utils import set_weight_attrs - from vllm.multimodal import MULTIMODAL_REGISTRY --from vllm.multimodal.inputs import NestedTensors --from vllm.multimodal.utils import (cached_get_tokenizer, -- consecutive_placeholder_ranges, -- repeat_and_pad_placeholder_tokens) --from vllm.sequence import IntermediateTensors, SequenceData -+from vllm.multimodal.inputs import (MultiModalDataDict, 
MultiModalFieldConfig, -+ MultiModalInputsV2, MultiModalKwargs, -+ NestedTensors, PlaceholderRange) -+from vllm.multimodal.processing import (BaseMultiModalProcessor, -+ MultiModalDataItems, ProcessingMixin, -+ PromptReplacement) -+from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs -+from vllm.sequence import IntermediateTensors - from vllm.utils import print_warning_once - - from .interfaces import SupportsMultiModal, SupportsPP -@@ -41,15 +42,6 @@ from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers, - maybe_prefix, merge_multimodal_embeddings) - --# These configs are not part of the model config but the preprocessor --# and processor files, so we hardcode them in the model file for now. --CHAMELEON_CROP_SIZE_HEIGHT = CHAMELEON_CROP_SIZE_WIDTH = 512 --CHAMELEON_IMAGE_SEQ_LENGTH = 1024 --CHAMELEON_IMAGE_TOKEN_ID = 8711 --CHAMELEON_IMAGE_START_TOKEN_ID = 8197 --CHAMELEON_IMAGE_END_TOKEN_ID = 8196 --CHAMELEON_SEP_TOKEN_ID = 8710 -- - - class ChameleonImagePixelInputs(TypedDict): - type: Literal["pixel_values"] -@@ -57,103 +49,102 @@ class ChameleonImagePixelInputs(TypedDict): - """Shape: `(batch_size * num_images, num_channels, height, width)`""" - - --def get_max_chameleon_image_tokens(ctx: InputContext): -- return CHAMELEON_IMAGE_SEQ_LENGTH -+class ChameleonProcessingMixin(ProcessingMixin): - -+ def _get_hf_config(self): -+ return self.ctx.get_hf_config(ChameleonConfig) - --def dummy_seq_data_for_chameleon( -- seq_len: int, -- num_images: int, -- *, -- image_token_id: int, -- image_feature_size_override: Optional[int] = None, --): -- if image_feature_size_override is None: -- image_feature_size = CHAMELEON_IMAGE_SEQ_LENGTH -- else: -- image_feature_size = image_feature_size_override -+ def _get_hf_processor(self): -+ return self.ctx.get_hf_processor(ChameleonProcessor) - -- return SequenceData.from_prompt_token_counts( -- (image_token_id, image_feature_size * num_images), -- (0, seq_len - image_feature_size * num_images), -- ), { -- "image": -- consecutive_placeholder_ranges(num_items=num_images, -- item_size=image_feature_size) -- } -+ def _get_num_image_tokens(self) -> int: -+ processor = self._get_hf_processor() -+ return processor.image_seq_length - - --def dummy_image_for_chameleon( -- num_images: int, -- *, -- image_width_override: Optional[int] = None, -- image_height_override: Optional[int] = None, --): -- width = CHAMELEON_CROP_SIZE_WIDTH -- height = CHAMELEON_CROP_SIZE_HEIGHT -- if image_width_override is not None: -- width = image_width_override -- if image_height_override is not None: -- height = image_height_override -+class ChameleonProfilingInfo(ChameleonProcessingMixin, BaseProfilingInfo): - -- image = Image.new("RGB", (width, height), color=0) -- return {"image": image if num_images == 1 else [image] * num_images} -+ def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: -+ return {"image": 1} - -+ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: -+ return {"image": self._get_num_image_tokens()} - --def dummy_data_for_chameleon(ctx: InputContext, seq_len: int, -- mm_counts: Mapping[str, int]): -- num_images = mm_counts["image"] -+ def get_dummy_processor_inputs( -+ self, -+ seq_len: int, -+ mm_counts: Mapping[str, int], -+ ) -> ProcessorInputs: -+ config = self._get_hf_config() -+ -+ width = height = config.vq_config.resolution -+ num_images = mm_counts.get("image", 0) -+ -+ mm_data = { -+ "image": -+ self._get_dummy_images(width=width, -+ height=height, -+ 
num_images=num_images) -+ } - -- seq_data, ranges = dummy_seq_data_for_chameleon( -- seq_len, -- num_images, -- image_token_id=CHAMELEON_IMAGE_TOKEN_ID, -- ) -+ return ProcessorInputs( -+ prompt_text="" * num_images, -+ mm_data=mm_data, -+ ) - -- mm_data = dummy_image_for_chameleon(num_images) -- return DummyData(seq_data, mm_data, ranges) - -+class ChameleonMultiModalProcessor(ChameleonProcessingMixin, -+ BaseMultiModalProcessor): - --def input_processor_for_chameleon(ctx: InputContext, -- inputs: DecoderOnlyInputs): -+ def _get_profiling_info(self) -> BaseProfilingInfo: -+ return ChameleonProfilingInfo(self.ctx) - -- """ -- Processing input prompt to insert required tokens for image placeholder. -- -- See https://github.com/huggingface/transformers/blob/0fdea8607d7e01eb0e38a1ebeb7feee30a22f0cf/src/transformers/models/chameleon/processing_chameleon.py#L58 -- """ # noqa -- -- multi_modal_data = inputs.get("multi_modal_data") -- if multi_modal_data is None or "image" not in multi_modal_data: -- return inputs -- -- if "multi_modal_placeholders" in inputs and "image" in inputs[ -- "multi_modal_placeholders"]: -- # The inputs already have placeholders. -- return inputs -- -- model_config = ctx.model_config -- tokenizer = cached_get_tokenizer(model_config.tokenizer) -- new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( -- tokenizer, -- inputs.get("prompt"), -- inputs["prompt_token_ids"], -- placeholder_token_id=CHAMELEON_IMAGE_TOKEN_ID, -- repeat_count=CHAMELEON_IMAGE_SEQ_LENGTH, -- pad_token_left=CHAMELEON_IMAGE_START_TOKEN_ID, -- pad_token_right=CHAMELEON_IMAGE_END_TOKEN_ID, -- ) -- -- # Appending sep token for chat mode to follow default processor -- # behavior -- if new_prompt is not None: -- new_prompt += tokenizer.sep_token -- new_token_ids += [CHAMELEON_SEP_TOKEN_ID] -- -- # NOTE: Create a defensive copy of the original inputs -- return token_inputs(prompt_token_ids=new_token_ids, -- prompt=new_prompt, -- multi_modal_data=multi_modal_data) -+ def _get_mm_fields_config( -+ self, -+ hf_inputs: BatchFeature, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ ) -> Mapping[str, MultiModalFieldConfig]: -+ return dict(pixel_values=MultiModalFieldConfig.batched("image")) -+ -+ def _get_prompt_replacements( -+ self, -+ mm_items: MultiModalDataItems, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ out_mm_kwargs: MultiModalKwargs, -+ ) -> list[PromptReplacement]: -+ processor = self._get_hf_processor(**hf_processor_mm_kwargs) -+ -+ return [ -+ PromptReplacement( -+ modality="image", -+ target="", -+ replacement="".join([ -+ processor.image_start_token, -+ processor.image_token * self._get_num_image_tokens(), -+ processor.image_end_token, -+ ]), -+ ) -+ ] -+ -+ def apply( -+ self, -+ prompt_text: str, -+ mm_data: MultiModalDataDict, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ ) -> MultiModalInputsV2: -+ result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) -+ -+ # Only tokens should be considered as placeholders, -+ # so we ignore the image_start_token and image_end_token -+ result["mm_placeholders"] = { -+ modality: [ -+ PlaceholderRange(offset=p["offset"] + 1, -+ length=p["length"] - 2) for p in ps -+ ] -+ for modality, ps in result["mm_placeholders"].items() -+ } -+ -+ return result - - - class ChameleonLayerNorm(nn.LayerNorm): -@@ -736,7 +727,7 @@ class ChameleonVQVAEEncoder(nn.Module): - for i_level in range(self.num_resolutions): - for i_block in range(self.num_res_blocks): - hidden_state = self.down[i_level].block[i_block]( -- hidden_states[-1], 
) -+ hidden_states[-1]) - if len(self.down[i_level].attn) > 0: - hidden_state = self.down[i_level].attn[i_block]( - hidden_state) -@@ -925,10 +916,7 @@ class ChameleonModel(nn.Module): - return hidden_states - - --@MULTIMODAL_REGISTRY.register_image_input_mapper() --@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_chameleon_image_tokens) --@INPUT_REGISTRY.register_dummy_data(dummy_data_for_chameleon) --@INPUT_REGISTRY.register_input_processor(input_processor_for_chameleon) -+@MULTIMODAL_REGISTRY.register_processor(ChameleonMultiModalProcessor) - class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, - SupportsPP): - -@@ -956,9 +944,8 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, - self.model.make_empty_intermediate_tensors) - - def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: -- -- expected_dims = (3, CHAMELEON_CROP_SIZE_HEIGHT, -- CHAMELEON_CROP_SIZE_WIDTH) -+ vq_config: ChameleonVQVAEConfig = self.config.vq_config -+ expected_dims = (3, vq_config.resolution, vq_config.resolution) - actual_dims = tuple(data.shape[1:]) - - if actual_dims != expected_dims: -diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py -index 6c50882d8..ce3a71e30 100644 ---- a/vllm/model_executor/models/chatglm.py -+++ b/vllm/model_executor/models/chatglm.py -@@ -33,7 +33,7 @@ from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel - from vllm.model_executor.models.module_mapping import MultiModelKeys - from vllm.model_executor.sampling_metadata import SamplingMetadata - from vllm.multimodal import MULTIMODAL_REGISTRY --from vllm.multimodal.inputs import (MultiModalData, MultiModalKwargs, -+from vllm.multimodal.inputs import (ModalityData, MultiModalKwargs, - NestedTensors) - from vllm.multimodal.utils import cached_get_tokenizer - from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, -@@ -54,7 +54,7 @@ def calculate_image_placeholder(vision_config): - - def mm_input_mapper_for_glmv( - ctx: InputContext, -- data: MultiModalData[object], -+ data: ModalityData[object], - ) -> Dict: - model_config = ctx.model_config - tokenizer = cached_get_tokenizer( -@@ -63,11 +63,15 @@ def mm_input_mapper_for_glmv( - if tokenizer is None: - raise RuntimeError("No HuggingFace processor is available " - "to process the image object") -+ if isinstance(data, List): -+ image = data[0] -+ else: -+ image = data - try: - raw_batch_data = tokenizer.apply_chat_template( - conversation=[{ - "role": "user", -- "image": data -+ "image": image - }], - add_generation_prompt=True, - tokenize=True, -@@ -175,11 +179,15 @@ def input_processor_for_glmv(ctx: InputContext, inputs: DecoderOnlyInputs): - ctx.model_config.model, - trust_remote_code=ctx.model_config.trust_remote_code) - -+ if isinstance(multi_modal_data["image"], List): -+ image = multi_modal_data["image"][0] -+ else: -+ image = multi_modal_data["image"] - try: - raw_batch_data = tokenizer.apply_chat_template( - conversation=[{ - "role": "user", -- "image": multi_modal_data["image"], -+ "image": image, - "content": inputs['prompt'], - }], - add_generation_prompt=True, -@@ -322,11 +330,13 @@ class GLMMLP(nn.Module): - def __init__( - self, - config: ChatGLMConfig, -+ layer, - quant_config: Optional[QuantizationConfig] = None, - ): - super().__init__() - - self.add_bias = config.add_bias_linear -+ self.layer = layer - - # Project to 4h. 
- self.dense_h_to_4h = MergedColumnParallelLinear( -@@ -349,7 +359,14 @@ class GLMMLP(nn.Module): - def forward(self, hidden_states): - # [s, b, 4hp] - intermediate_parallel, _ = self.dense_h_to_4h(hidden_states) -- intermediate_parallel = self.activation_func(intermediate_parallel) -+ # IPEX-LLM changes start: workaround fp16 overflow -+ if self.layer >= 38 and intermediate_parallel.device.type == "xpu": -+ d = intermediate_parallel.shape[-1] // 2 -+ intermediate_parallel[..., d:] /= 10 -+ intermediate_parallel = self.activation_func(intermediate_parallel) -+ else: -+ intermediate_parallel = self.activation_func(intermediate_parallel) -+ # IPEX-LLM changes end. - # [s, b, h] - output, _ = self.dense_4h_to_h(intermediate_parallel) - return output -@@ -365,6 +382,7 @@ class GLMBlock(nn.Module): - def __init__( - self, - config: ChatGLMConfig, -+ layer, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", -@@ -392,7 +410,8 @@ class GLMBlock(nn.Module): - config.hidden_size, eps=config.layernorm_epsilon) - - # MLP -- self.mlp = GLMMLP(config, quant_config) -+ self.layer = layer -+ self.mlp = GLMMLP(config, layer, quant_config) - - def forward( - self, -@@ -428,8 +447,13 @@ class GLMBlock(nn.Module): - residual = layernorm_output - else: - residual = layernorm_input -- -- output = self.mlp(layernorm_output) + residual -+ # IPEX-LLM changes start: workaround fp16 overflow -+ if self.layer >= 38 and layernorm_output.device.type == "xpu": -+ output = self.mlp(layernorm_output) * 10 + residual -+ output = torch.nan_to_num(output) -+ else: -+ output = self.mlp(layernorm_output) + residual -+ # ipex-llm changes end - - return output - -@@ -451,12 +475,15 @@ class GLMTransformer(nn.Module): - self.num_layers = config.num_layers - - # Transformer layers. 
-- self.start_layer, self.end_layer, self.layers = make_layers( -- self.num_layers, -- lambda prefix: GLMBlock( -- config, cache_config, quant_config, prefix=prefix), -- prefix=f"{prefix}.layers", -- ) -+ # Not sure if pp is available now -+ from vllm.distributed.utils import get_pp_indices -+ self.start_layer, self.end_layer = get_pp_indices(self.num_layers, -+ get_pp_group().rank_in_group, -+ get_pp_group().world_size) -+ self.layers = nn.ModuleList([ -+ GLMBlock(config, i, cache_config, quant_config, prefix=f"{prefix}.layers.{i}") -+ for i in range(self.start_layer, self.end_layer) -+ ]) - - if self.post_layer_norm: - layer_norm_func = RMSNorm if config.rmsnorm else LayerNorm -@@ -550,8 +577,8 @@ class ChatGLMModel(nn.Module): - image_input = self._parse_and_validate_image_input(**kwargs) - if image_input["pixel_values"] is None: - return None -- pixel_values = image_input["pixel_values"].to( -- dtype=self.config.torch_dtype) -+ dtype = self.embedding.weight.dtype -+ pixel_values = image_input["pixel_values"].to(dtype) - vision_embeddings = self.vision(pixel_values) - return vision_embeddings - -diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py -index a5300dfd9..1bde45cb1 100644 ---- a/vllm/model_executor/models/clip.py -+++ b/vllm/model_executor/models/clip.py -@@ -24,6 +24,8 @@ from vllm.multimodal.utils import (cached_get_tokenizer, - resolve_visual_encoder_outputs) - from vllm.sequence import SequenceData - -+from .vision import VisionEncoderInfo -+ - - def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int: - assert image_size % patch_size == 0 -@@ -149,6 +151,32 @@ def input_processor_for_clip( - multi_modal_placeholders={"image": ranges}) - - -+class CLIPEncoderInfo(VisionEncoderInfo[CLIPVisionConfig]): -+ -+ def get_num_image_tokens( -+ self, -+ *, -+ image_width: int, -+ image_height: int, -+ ) -> int: -+ return get_clip_image_feature_size(self.vision_config) -+ -+ def get_max_image_tokens(self) -> int: -+ return get_max_clip_image_tokens(self.vision_config) -+ -+ def get_image_size(self) -> int: -+ return self.vision_config.image_size -+ -+ def get_patch_size(self) -> int: -+ return self.vision_config.patch_size -+ -+ def get_patch_grid_length(self) -> int: -+ return get_clip_patch_grid_length( -+ image_size=self.vision_config.image_size, -+ patch_size=self.vision_config.patch_size, -+ ) -+ -+ - # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa - class CLIPVisionEmbeddings(nn.Module): - -diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py -index c846e42f1..d22d1f317 100644 ---- a/vllm/model_executor/models/commandr.py -+++ b/vllm/model_executor/models/commandr.py -@@ -172,16 +172,18 @@ class CohereAttention(nn.Module): - is_neox_style=False, - ) - -- sliding_window = getattr(config, "sliding_window", None) -- # Model v2 has sliding windows, v1 does not -- self.v1 = sliding_window is None -+ # Model v2 has interleaved sliding windows, v1 does not -+ interleaved_sliding_window = getattr(config, -+ "interleaved_sliding_window", -+ None) -+ self.v1 = interleaved_sliding_window is None - - layer_idx = extract_layer_index(prefix) - layer_has_sliding_window = ( - getattr(config, "sliding_window_pattern", False) - and (layer_idx + 1) % self.config.sliding_window_pattern != 0) - -- self.sliding_window = (sliding_window -+ self.sliding_window = (interleaved_sliding_window - if layer_has_sliding_window else None) - 
- self.attn = Attention(self.num_heads, -diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py -index 0398f0943..8324a563e 100644 ---- a/vllm/model_executor/models/exaone.py -+++ b/vllm/model_executor/models/exaone.py -@@ -606,8 +606,9 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP): - # which is consistent with the practice of setting - # scaling_factor = tensor_amax / FPtype_max - scaling_factor *= 2 -- if hasattr(layer_self_attn, "kv_scale"): -- layer_self_attn.attn._kv_scale = scaling_factor -+ if hasattr(layer_self_attn.attn, "_k_scale"): -+ layer_self_attn.attn._k_scale = scaling_factor -+ layer_self_attn.attn._v_scale = scaling_factor - else: - raise RuntimeError("Self attention has no KV cache scaling " - "factor attribute!") -diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py -index 6e8690032..c937fcb09 100644 ---- a/vllm/model_executor/models/fuyu.py -+++ b/vllm/model_executor/models/fuyu.py -@@ -15,32 +15,30 @@ - # limitations under the License. - """ PyTorch Fuyu model.""" - import math --from array import array - from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict) - - import torch - import torch.nn as nn --import torch.utils.checkpoint --from PIL import Image --from transformers import FuyuImageProcessor -+from transformers import (BatchFeature, FuyuConfig, FuyuImageProcessor, -+ FuyuProcessor) - - from vllm.attention import AttentionMetadata - from vllm.config import VllmConfig --from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, -- InputContext, token_inputs) - from vllm.model_executor.layers.linear import ColumnParallelLinear - from vllm.model_executor.layers.sampler import SamplerOutput - from vllm.model_executor.models.persimmon import PersimmonForCausalLM - from vllm.model_executor.sampling_metadata import SamplingMetadata --from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs --from vllm.multimodal.image import cached_get_image_processor --from vllm.multimodal.inputs import NestedTensors --from vllm.multimodal.utils import (cached_get_tokenizer, -- consecutive_placeholder_ranges) --from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, -- SequenceData) --from vllm.utils import is_list_of -+from vllm.multimodal import MULTIMODAL_REGISTRY -+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, -+ MultiModalInputsV2, MultiModalKwargs, -+ NestedTensors, PlaceholderRange) -+from vllm.multimodal.parse import ImageProcessorItems, ImageSize -+from vllm.multimodal.processing import (BaseMultiModalProcessor, -+ MultiModalDataItems, ProcessingMixin, -+ PromptReplacement) -+from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs -+from vllm.sequence import IntermediateTensors - - from .interfaces import SupportsMultiModal, SupportsPP - from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, -@@ -50,182 +48,203 @@ from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, - _IMAGE_TOKEN_ID = 71011 - _NEWLINE_TOKEN_ID = 71019 - --MAX_IMAGE_FEATURE_SIZE_HEIGHT = 1080 --MAX_IMAGE_FEATURE_SIZE_WIDTH = 1920 - -- --class FuyuImagePixelInputs(TypedDict): -- type: Literal["pixel_values"] -- data: torch.Tensor -+class FuyuImagePatchInputs(TypedDict): -+ type: Literal["image_patches"] -+ flat_data: torch.Tensor - """ - Shape: -- (batch_size, num_patches, patch_size_x * patch_size_y * num_channels) -+ `(batch_size * num_patches, patch_size_x * patch_size_y * 
num_channels)` - """ - -- --def _calculate_num_image_tokens( -- height: int, -- width: int, --) -> Tuple[int, int]: -+ patches_per_image: List[int] - """ -- calculate number of image tokens needed for a given image size -- The expected Fuyu image prompts is in format: -- (image_token * ncols + newline_token) * nrows -- args: -- image_size: Tuple[int, int] - (width, height) of the image -- returns: -- ncols: int - number of image tokens in x direction -- nrows: int - number of image tokens in y direction -+ List of number of total patches for each image in the batch. -+ This is used to restore the first two dimensions of `flat_data`. - """ -- ncol = math.ceil(width / 30) -- nrow = math.ceil(height / 30) -- return ncol, nrow -- -- --def get_max_fuyu_image_feature_size(): -- -- return _calculate_num_image_tokens( -- height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, -- width=MAX_IMAGE_FEATURE_SIZE_WIDTH, -- ) -- -- --def get_max_fuyu_image_tokens(ctx: InputContext): -- ncol, nrow = get_max_fuyu_image_feature_size() -- return (ncol + 1) * nrow -- -- --def dummy_seq_data_for_fuyu(ctx: InputContext, seq_len: int, num_images: int): -- ncol, nrow = get_max_fuyu_image_feature_size() -- image_feature_size = get_max_fuyu_image_tokens(ctx) -- -- image_token_ids = ( -- array(VLLM_TOKEN_ID_ARRAY_TYPE, [_IMAGE_TOKEN_ID]) * ncol + -- array(VLLM_TOKEN_ID_ARRAY_TYPE, [_NEWLINE_TOKEN_ID])) * nrow -- token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, image_token_ids) * num_images -- token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, -- [0]) * (seq_len - image_feature_size * num_images) -- return SequenceData(token_ids), { -- "image": -- consecutive_placeholder_ranges(num_items=num_images, -- item_size=image_feature_size) -- } -- -- --def dummy_image_for_fuyu( -- num_images: int, -- *, -- image_width: int, -- image_height: int, --): -- image = Image.new("RGB", (image_width, image_height), color=0) -- return {"image": image if num_images == 1 else [image] * num_images} -- -- --def dummy_data_for_fuyu(ctx: InputContext, seq_len: int, -- mm_counts: Mapping[str, int]): -- num_images = mm_counts["image"] -- seq_data, ranges = dummy_seq_data_for_fuyu(ctx, seq_len, num_images) -- mm_data = dummy_image_for_fuyu(num_images, -- image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, -- image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT) -- return DummyData(seq_data, mm_data, ranges) -- -- --def _fuyu_image_preprocess(image_processor: FuyuImageProcessor, -- data: List[Image.Image]): -- image_encoding = image_processor.preprocess(data, return_tensors="pt") -- batch_images = torch.stack([img[0] for img in image_encoding["images"] -- ]).unsqueeze(1) -- image_unpadded_heights = torch.tensor( -- image_encoding["image_unpadded_heights"]) -- image_unpadded_widths = torch.tensor( -- image_encoding["image_unpadded_widths"]) -- -- batch_size = len(image_encoding["images"]) -- image_present = torch.ones(batch_size, 1, 1) -- model_image_input = image_processor.preprocess_with_tokenizer_info( -- image_input=batch_images, -- image_present=image_present, -- image_unpadded_h=image_unpadded_heights, -- image_unpadded_w=image_unpadded_widths, -- image_placeholder_id=_IMAGE_TOKEN_ID, -- image_newline_id=_NEWLINE_TOKEN_ID, -- variable_sized=True, -- ) -- return model_image_input -- -- --def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs): -- multi_modal_data = inputs.get("multi_modal_data") -- if multi_modal_data is None or "image" not in multi_modal_data: -- return inputs -- -- model_config = ctx.model_config -- image_data = multi_modal_data["image"] -- 
new_multi_modal_data = {} -- image_list = image_data if isinstance(image_data, list) else [image_data] -- -- # process image data -- if is_list_of(image_list, Image.Image): -- # Fuyu's image_processor can also finish token padding -- image_processor: FuyuImageProcessor = cached_get_image_processor( -- model_config.model) -- -- model_image_input = _fuyu_image_preprocess(image_processor, image_data) -- image_patches = torch.cat([ -- image_patch[0] -- for image_patch in model_image_input["image_patches"] -- ]) -- new_multi_modal_data["image"] = image_patches -- -- elif is_list_of(image_list, torch.Tensor): -- raise NotImplementedError("Embeddings input is not supported yet") -- else: -- raise TypeError(f"Invalid image type: {type(image_data)}") -- -- # process prompts -- prompt = inputs.get("prompt") -- prompt_token_ids = inputs["prompt_token_ids"] -- tokenizer = cached_get_tokenizer(model_config.model) -- # dim0 is batch_size, dim1 is subseq_size which will always be 1 -- image_input_ids: List[List[ -- torch.Tensor]] = model_image_input["image_input_ids"] -- image_input_ids = image_input_ids[0][0].tolist() -- bos_token = tokenizer.encode("", add_special_tokens=False)[1:] -- boa_token = tokenizer.encode("\x04", add_special_tokens=False)[1:] -- -- new_prompt = prompt + "\x04" -- new_prompt_token_ids = image_input_ids + bos_token + prompt_token_ids[ -- 1:] + boa_token -- -- return token_inputs(prompt=new_prompt, -- prompt_token_ids=new_prompt_token_ids, -- multi_modal_data=new_multi_modal_data) -- -- --def input_mapper_for_fuyu(ctx: InputContext, data: object): -- model_config = ctx.model_config -- data_list = data if isinstance(data, list) else [data] -- if is_list_of(data_list, Image.Image): -- # Fuyu's image_processor can also finish token padding -- image_processor: FuyuImageProcessor = cached_get_image_processor( -- model_config.model) -- -- model_image_input = _fuyu_image_preprocess(image_processor, data_list) -- data = torch.stack([ -- image_patch[0] -- for image_patch in model_image_input["image_patches"] -- ]) -- -- # image has been processed with prompt in input processor -- return MultiModalKwargs({"pixel_values": data}) -- -- --@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_fuyu) --@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_fuyu_image_tokens) --@INPUT_REGISTRY.register_dummy_data(dummy_data_for_fuyu) --@INPUT_REGISTRY.register_input_processor(input_processor_for_fuyu) -+ -+ -+class FuyuProcessingMixin(ProcessingMixin): -+ -+ def _get_hf_config(self): -+ return self.ctx.get_hf_config(FuyuConfig) -+ -+ def _get_hf_processor(self): -+ return self.ctx.get_hf_processor(FuyuProcessor) -+ -+ def _get_image_processor(self) -> FuyuImageProcessor: -+ return self._get_hf_processor().image_processor -+ -+ def _get_image_feature_grid_size( -+ self, -+ *, -+ image_width: int, -+ image_height: int, -+ ) -> tuple[int, int]: -+ image_processor = self._get_image_processor() -+ target_width = image_processor.size["width"] -+ target_height = image_processor.size["height"] -+ -+ if not (image_width <= target_width and image_height <= target_height): -+ height_scale_factor = target_height / image_height -+ width_scale_factor = target_width / image_width -+ optimal_scale_factor = min(height_scale_factor, width_scale_factor) -+ -+ image_height = int(image_height * optimal_scale_factor) -+ image_width = int(image_width * optimal_scale_factor) -+ -+ ncols = math.ceil(image_width / 30) -+ nrows = math.ceil(image_height / 30) -+ return ncols, nrows -+ -+ -+class 
FuyuProfilingInfo(FuyuProcessingMixin, BaseProfilingInfo): -+ -+ def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: -+ return {"image": 1} -+ -+ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: -+ target_width, target_height = self._get_image_size_with_most_features() -+ -+ max_ncols, max_nrows = self._get_image_feature_grid_size( -+ image_width=target_width, -+ image_height=target_height, -+ ) -+ max_image_tokens = (max_ncols + 1) * max_nrows -+ -+ return {"image": max_image_tokens} -+ -+ def _get_image_size_with_most_features(self) -> ImageSize: -+ image_processor = self._get_image_processor() -+ return ImageSize(width=image_processor.size["width"], -+ height=image_processor.size["height"]) -+ -+ def get_dummy_processor_inputs( -+ self, -+ seq_len: int, -+ mm_counts: Mapping[str, int], -+ ) -> ProcessorInputs: -+ target_width, target_height = self._get_image_size_with_most_features() -+ num_images = mm_counts.get("image", 0) -+ -+ mm_data = { -+ "image": -+ self._get_dummy_images(width=target_width, -+ height=target_height, -+ num_images=num_images) -+ } -+ -+ return ProcessorInputs( -+ prompt_text="", -+ mm_data=mm_data, -+ ) -+ -+ -+class FuyuMultiModalProcessor(FuyuProcessingMixin, BaseMultiModalProcessor): -+ -+ def _get_profiling_info(self) -> BaseProfilingInfo: -+ return FuyuProfilingInfo(self.ctx) -+ -+ def _call_hf_processor( -+ self, -+ prompt: str, -+ mm_data: Mapping[str, object], -+ mm_kwargs: Mapping[str, object], -+ ) -> BatchFeature: -+ -+ if not mm_data: -+ # Avoid warning from HF logger for text-only input -+ # Input_ids format: bos_token_id + prompt_token_ids + boa_token_id -+ # Tokenizer won't add boa_token_id by default, we add it manually. -+ tokenizer = self._get_tokenizer() -+ boa_token_id: int = tokenizer.vocab["<0x04>"] # type: ignore -+ prompt_ids = tokenizer.encode(prompt) + [boa_token_id] -+ return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") -+ -+ processed_outputs = super()._call_hf_processor( -+ prompt=prompt, -+ mm_data=mm_data, -+ mm_kwargs=mm_kwargs, -+ ) -+ -+ image_patches = processed_outputs.get("image_patches") -+ if image_patches is not None: -+ images = mm_data["images"] -+ assert isinstance(images, list) -+ -+ # Original output: (1, num_images, Pn, Px * Py * C) -+ # New output: (num_images, Pn, Px * Py * C) -+ assert (isinstance(image_patches, list) -+ and len(image_patches) == 1) -+ assert (isinstance(image_patches[0], torch.Tensor) -+ and len(image_patches[0]) == len(images)) -+ -+ processed_outputs["image_patches"] = image_patches[0] -+ -+ return processed_outputs -+ -+ def _get_mm_fields_config( -+ self, -+ hf_inputs: BatchFeature, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ ) -> Mapping[str, MultiModalFieldConfig]: -+ return dict(image_patches=MultiModalFieldConfig.batched("image")) -+ -+ def _get_prompt_replacements( -+ self, -+ mm_items: MultiModalDataItems, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ out_mm_kwargs: MultiModalKwargs, -+ ) -> list[PromptReplacement]: -+ hf_config = self._get_hf_config() -+ bos_token_id = hf_config.bos_token_id -+ -+ tokenizer = self._get_tokenizer() -+ eot_token_id = tokenizer.bos_token_id -+ assert isinstance(eot_token_id, int) -+ -+ def get_replacement_fuyu(item_idx: int): -+ images = mm_items.get_items("image", ImageProcessorItems) -+ image_size = images.get_image_size(item_idx) -+ -+ ncols, nrows = self._get_image_feature_grid_size( -+ image_width=image_size.width, -+ image_height=image_size.height, -+ ) -+ -+ return 
(([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows + -+ [bos_token_id]) -+ -+ return [ -+ PromptReplacement( -+ modality="image", -+ target=[eot_token_id], -+ replacement=get_replacement_fuyu, -+ ) -+ ] -+ -+ def apply( -+ self, -+ prompt_text: str, -+ mm_data: MultiModalDataDict, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ ) -> MultiModalInputsV2: -+ result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) -+ -+ # Only |SPEAKER| (image) tokens should be considered as placeholders, -+ # so we ignore the trailing bos_token_id -+ result["mm_placeholders"] = { -+ modality: [ -+ PlaceholderRange(offset=p["offset"], length=p["length"] - 1) -+ for p in ps -+ ] -+ for modality, ps in result["mm_placeholders"].items() -+ } -+ -+ return result -+ -+ -+@MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor) - class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): -@@ -280,28 +299,33 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): - return data.to(self.vision_embed_tokens.weight.dtype) - - def _parse_and_validate_image_input( -- self, **kwargs: object) -> Optional[FuyuImagePixelInputs]: -- pixel_values = kwargs.pop("pixel_values", None) -- -- if pixel_values is not None: -- if not isinstance(pixel_values, (torch.Tensor, list)): -+ self, **kwargs: object) -> Optional[FuyuImagePatchInputs]: -+ image_patches = kwargs.pop("image_patches", None) -+ if image_patches is not None: -+ if not isinstance(image_patches, (torch.Tensor, list)): - raise ValueError("Incorrect type of image patches. " -- f"Got type: {type(pixel_values)}") -+ f"Got type: {type(image_patches)}") - -- return FuyuImagePixelInputs( -- type="pixel_values", -- data=self._validate_pixel_values( -- flatten_bn(pixel_values, concat=True)), -+ image_patches_flat = flatten_bn(image_patches) -+ -+ return FuyuImagePatchInputs( -+ type="image_patches", -+ flat_data=self._validate_pixel_values( -+ flatten_bn(image_patches_flat, concat=True)), -+ patches_per_image=[x.size(0) for x in image_patches_flat], - ) - - return None - - def _process_image_input( -- self, image_input: FuyuImagePixelInputs) -> torch.Tensor: -+ self, image_input: FuyuImagePatchInputs) -> NestedTensors: -+ image_patches_flat = image_input["flat_data"] -+ patches_per_image = image_input["patches_per_image"] - - assert self.vision_embed_tokens is not None -- vision_embeddings, _ = self.vision_embed_tokens(image_input["data"]) -- return vision_embeddings -+ vision_embeddings_flat, _ = self.vision_embed_tokens( -+ image_patches_flat) -+ return vision_embeddings_flat.split(patches_per_image, dim=0) - - def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: - image_input = self._parse_and_validate_image_input(**kwargs) -diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py -index 39a5736eb..b44419fbd 100644 ---- a/vllm/model_executor/models/glm4_vision_encoder.py -+++ b/vllm/model_executor/models/glm4_vision_encoder.py -@@ -50,6 +50,60 @@ class PatchEmbedding(nn.Module): - x += self.position_embedding.weight.unsqueeze(0) - return x - -+class GlmSelfAttention(nn.Module): -+ """Multi-headed attention without any cache, used for ViT.""" -+ -+ def __init__( -+ self, -+ num_heads: int, -+ head_size: int, -+ scale: float, -+ num_kv_heads: Optional[int] = None, -+ ): -+ super().__init__() -+ self.num_heads = num_heads -+ self.head_size = head_size -+ self.scale = scale -+ 
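# Illustrative sketch (not part of the diff): GlmSelfAttention.forward below takes the
# XPU-specific xe_addons.sdp_causal path; the commented-out portable fallback in the same
# hunk is plain scaled_dot_product_attention over (batch, heads, seq, head_size) tensors.
# A minimal cache-free version of that fallback, assuming equal query/key-value head counts
# (function name is illustrative, not from the patch):
import torch
import torch.nn.functional as F

def vit_self_attention(query: torch.Tensor,
                       key: torch.Tensor,
                       value: torch.Tensor,
                       num_heads: int,
                       head_size: int) -> torch.Tensor:
    """query/key/value: (batch, seq_len, num_heads * head_size)."""
    bsz, q_len, _ = query.shape
    kv_len = key.shape[1]
    # (b, s, h*d) -> (b, h, s, d)
    q = query.view(bsz, q_len, num_heads, head_size).transpose(1, 2)
    k = key.view(bsz, kv_len, num_heads, head_size).transpose(1, 2)
    v = value.view(bsz, kv_len, num_heads, head_size).transpose(1, 2)
    # no mask and non-causal for a ViT; SDPA's default scale is 1/sqrt(head_size)
    out = F.scaled_dot_product_attention(q, k, v)
    # back to (b, s, h*d); reshape (not view) because the transpose is non-contiguous,
    # matching the reshape the patch uses in its return statement
    return out.transpose(1, 2).reshape(bsz, q_len, -1)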
self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads -+ -+ def forward( -+ self, -+ query: torch.Tensor, -+ key: torch.Tensor, -+ value: torch.Tensor, -+ ) -> torch.Tensor: -+ """Input shape: batch_size x seq_len x hidden_size""" -+ # TODO(Isotr0py): Use existing backend implementations and support FA2 -+ bsz, q_len, _ = query.size() -+ kv_len = key.size(1) -+ -+ query = query.view(bsz, q_len, self.num_heads, self.head_size) -+ key = key.view(bsz, kv_len, self.num_kv_heads, self.head_size) -+ value = value.view(bsz, kv_len, self.num_kv_heads, self.head_size) -+ -+ query, key, value = (x.transpose(1, 2) -+ for x in (query, key, value)) -+ from ipex_llm.transformers.models.utils import use_sdp_causal -+ from vllm.attention.backends.ipex_attn import use_sdp_causal -+ import xe_addons, math -+ mask = None -+ scale = 1 / math.sqrt(self.head_size) if self.scale is None else self.scale -+ from ipex_llm.transformers.models.common import padding_qkv_hd -+ -+ query, key, value, = padding_qkv_hd( -+ query, key, value, -+ self.head_size, 128 -+ ) -+ if use_sdp_causal(query.shape[-1], query, 0): -+ out = xe_addons.sdp_causal(query.contiguous(), key.contiguous(), value.contiguous(), mask, scale)[:, :, :, :self.head_size].transpose(1, 2) -+ # import torch.nn.functional as F -+ # out = F.scaled_dot_product_attention(query, -+ # key, -+ # value, -+ # scale=self.scale) -+ # out = out.transpose(1, 2) -+ #return out.view(bsz, q_len, -1) -+ return out.reshape(bsz, q_len, -1) - - class Attention(nn.Module): - -@@ -78,8 +132,10 @@ class Attention(nn.Module): - quant_config=quant_config, - ) - -- self.attn = MultiHeadAttention(self.num_heads_per_rank, self.head_dim, -- self.scale) -+ # self.attn = MultiHeadAttention(self.num_heads_per_rank, self.head_dim, -+ # self.scale) -+ self.attn = GlmSelfAttention(self.num_heads_per_rank, self.head_dim, -+ self.scale) - self.output_dropout = torch.nn.Dropout(config.dropout_prob) - - def forward(self, x: torch.Tensor) -> torch.Tensor: -diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py -index f9e0443b9..a91ed4158 100644 ---- a/vllm/model_executor/models/granite.py -+++ b/vllm/model_executor/models/granite.py -@@ -545,8 +545,9 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP): - # which is consistent with the practice of setting - # scaling_factor = tensor_amax / FPtype_max - scaling_factor *= 2 -- if hasattr(layer_self_attn, "kv_scale"): -- layer_self_attn.attn._kv_scale = scaling_factor -+ if hasattr(layer_self_attn.attn, "_k_scale"): -+ layer_self_attn.attn._k_scale = scaling_factor -+ layer_self_attn.attn._v_scale = scaling_factor - else: - raise RuntimeError("Self attention has no KV cache scaling " - "factor attribute!") -diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py -index e430a158d..4e42a4b6f 100644 ---- a/vllm/model_executor/models/idefics2_vision_model.py -+++ b/vllm/model_executor/models/idefics2_vision_model.py -@@ -69,7 +69,8 @@ class Idefics2VisionEmbeddings(nn.Module): - patch_attention_mask: torch.BoolTensor, - tgt_sizes: Optional[torch.IntTensor] = None) -> torch.Tensor: - batch_size, _, max_im_h, max_im_w = pixel_values.shape -- patch_embeds = self.patch_embedding(pixel_values) -+ target_dtype = self.patch_embedding.weight.dtype -+ patch_embeds = self.patch_embedding(pixel_values.to(target_dtype)) - embeddings = patch_embeds.flatten(2).transpose(1, 2) - max_nb_patches_h, max_nb_patches_w = ( - max_im_h // 
self.patch_size, -@@ -309,7 +310,8 @@ class Idefics2VisionTransformer(nn.Module): - hidden_states = self.embeddings( - pixel_values=pixel_values, - patch_attention_mask=patch_attention_mask, -- tgt_sizes=tgt_sizes) -+ tgt_sizes=tgt_sizes, -+ ) - encoder_outputs = self.encoder(hidden_states) - last_hidden_state = self.post_layernorm(encoder_outputs) - return last_hidden_state -diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py -index 7ff68bd60..1dba60756 100644 ---- a/vllm/model_executor/models/intern_vit.py -+++ b/vllm/model_executor/models/intern_vit.py -@@ -271,7 +271,8 @@ class InternSdpaAttention(nn.Module): - v = v.transpose(1, 2) - - x = F.scaled_dot_product_attention(q, k, v, scale=self.scale) -- x = x.transpose(1, 2).view(B, N, -1) -+ #x = x.transpose(1, 2).view(B, N, -1) -+ x = x.transpose(1, 2).reshape(B, N, -1) - - x = self.proj(x) - return x -diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py -index 41b9f110d..28c23edd4 100644 ---- a/vllm/model_executor/models/internlm2.py -+++ b/vllm/model_executor/models/internlm2.py -@@ -18,14 +18,16 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) - from vllm.model_executor.layers.logits_processor import LogitsProcessor -+from vllm.model_executor.layers.pooler import Pooler, PoolingType - from vllm.model_executor.layers.quantization import QuantizationConfig - from vllm.model_executor.layers.rotary_embedding import get_rope - from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler - from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, VocabParallelEmbedding) - from vllm.model_executor.model_loader.weight_utils import default_weight_loader -+from vllm.model_executor.pooling_metadata import PoolingMetadata - from vllm.model_executor.sampling_metadata import SamplingMetadata --from vllm.sequence import IntermediateTensors -+from vllm.sequence import IntermediateTensors, PoolerOutput - - from .interfaces import SupportsLoRA, SupportsPP - from .utils import (is_pp_missing_parameter, -@@ -433,3 +435,59 @@ class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params -+ -+ -+class InternLM2ForRewardModel(InternLM2ForCausalLM): -+ -+ def __init__( -+ self, -+ *, -+ vllm_config: VllmConfig, -+ prefix: str = "", -+ model_type: Type[InternLM2Model] = InternLM2Model, -+ ): -+ super().__init__(vllm_config=vllm_config, -+ prefix=prefix, -+ model_type=model_type) -+ -+ for attr in ("output", "logits_processor", "sampler"): -+ delattr(self, attr) -+ -+ config = vllm_config.model_config.hf_config -+ self.v_head = RowParallelLinear( -+ config.hidden_size, -+ 1, -+ bias=False, -+ input_is_parallel=False, -+ prefix=maybe_prefix(prefix, "v_head"), -+ ) -+ -+ pooler_config = vllm_config.model_config.pooler_config -+ self._pooler = Pooler.from_config_with_defaults( -+ pooler_config, -+ pooling_type=PoolingType.ALL, -+ normalize=False, -+ softmax=False, -+ ) -+ -+ def forward( -+ self, -+ input_ids: torch.Tensor, -+ positions: torch.Tensor, -+ kv_caches: List[torch.Tensor], -+ attn_metadata: AttentionMetadata, -+ intermediate_tensors: Optional[IntermediateTensors] = None, -+ inputs_embeds: Optional[torch.Tensor] = None, -+ ) -> Union[torch.Tensor, IntermediateTensors]: -+ hidden_states = self.model(input_ids, positions, kv_caches, -+ attn_metadata, intermediate_tensors, 
-+ inputs_embeds) -+ logits, _ = self.v_head(hidden_states) -+ return logits -+ -+ def pooler( -+ self, -+ hidden_states: torch.Tensor, -+ pooling_metadata: PoolingMetadata, -+ ) -> Optional[PoolerOutput]: -+ return self._pooler(hidden_states, pooling_metadata) -diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py -index 91786db5d..890b5530b 100644 ---- a/vllm/model_executor/models/jamba.py -+++ b/vllm/model_executor/models/jamba.py -@@ -107,9 +107,11 @@ class JambaMambaDecoderLayer(nn.Module): - layer_idx: int, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, -- prefix: str = "") -> None: -+ is_lora_enabled: Optional[bool] = False, -+ **kwargs) -> None: - super().__init__() - self.config = config -+ self.is_lora_enabled = is_lora_enabled - self.mamba = MambaMixer(hidden_size= config.hidden_size, - ssm_state_size = config.mamba_d_state, - conv_kernel_size = config.mamba_d_conv, -@@ -120,7 +122,9 @@ class JambaMambaDecoderLayer(nn.Module): - use_bias = config.mamba_proj_bias, - use_rms_norm=True, - rms_norm_eps=config.rms_norm_eps, -- activation=config.hidden_act) -+ activation=config.hidden_act, -+ is_lora_enabled = self.is_lora_enabled -+ ) - - num_experts = config.layers_num_experts[layer_idx] - ffn_layer_class = JambaMoE if num_experts > 1 else JambaMLP -@@ -156,14 +160,13 @@ class JambaMambaDecoderLayer(nn.Module): - - class JambaAttentionDecoderLayer(nn.Module): - -- def __init__( -- self, -- config: JambaConfig, -- layer_idx: int, -- cache_config: Optional[CacheConfig] = None, -- quant_config: Optional[QuantizationConfig] = None, -- prefix: str = "", -- ) -> None: -+ def __init__(self, -+ config: JambaConfig, -+ layer_idx: int, -+ cache_config: Optional[CacheConfig] = None, -+ quant_config: Optional[QuantizationConfig] = None, -+ prefix: str = "", -+ **kwargs) -> None: - super().__init__() - self.hidden_size = config.hidden_size - tp_size = get_tensor_model_parallel_world_size() -@@ -287,17 +290,18 @@ class JambaModel(nn.Module): - org_num_embeddings=config.vocab_size, - ) - -+ extra_kwargs = {"is_lora_enabled": bool(vllm_config.lora_config)} -+ - def get_layer(prefix: str): - layer_idx = int(prefix.rsplit(".", 1)[1]) - layer_class = ALL_DECODER_LAYER_TYPES[ - config.layers_block_type[layer_idx]] -- return layer_class( -- config, -- layer_idx, -- cache_config, -- quant_config=quant_config, -- prefix=prefix, -- ) -+ return layer_class(config, -+ layer_idx, -+ cache_config, -+ quant_config=quant_config, -+ prefix=prefix, -+ **extra_kwargs) - - self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers") -@@ -371,14 +375,13 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, - "k_proj", - "v_proj", - ], -+ "in_proj": ["in_proj"], - } - - # LoRA specific attributes - supported_lora_modules = [ -- "qkv_proj", -- "o_proj", -- "embed_tokens", -- "lm_head", -+ "qkv_proj", "o_proj", "embed_tokens", "lm_head", "up_proj", -+ "down_proj", "gate_proj", "out_proj", "in_proj", "x_proj" - ] - embedding_modules = { - "embed_tokens": "input_embeddings", -@@ -423,9 +426,9 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors) - if self.scheduler_config is not None and \ -- not self.model_config.enforce_eager: -+ not self.model_config.enforce_eager: - if self.scheduler_config.max_num_seqs > \ -- 
vllm_config.compilation_config.max_capture_size: -+ vllm_config.compilation_config.max_capture_size: - self.max_batch_size = \ - vllm_config.compilation_config.max_capture_size - else: -@@ -446,7 +449,6 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, - inputs_embeds: Optional[torch.Tensor] = None, - **kwargs): - if self.mamba_cache is None: -- - num_mamba_layers = self.model_config.get_num_layers_by_block_type( - self.vllm_config.parallel_config, LayerBlockType.mamba) - self.mamba_cache = MambaCacheManager( -diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py -index 2902e6999..8623da995 100644 ---- a/vllm/model_executor/models/llama.py -+++ b/vllm/model_executor/models/llama.py -@@ -452,8 +452,9 @@ class LlamaModel(nn.Module): - # which is consistent with the practice of setting - # scaling_factor = tensor_amax / FPtype_max - scaling_factor *= 2 -- if hasattr(layer_self_attn, "kv_scale"): -- layer_self_attn.attn._kv_scale = scaling_factor -+ if hasattr(layer_self_attn.attn, "_k_scale"): -+ layer_self_attn.attn._k_scale = scaling_factor -+ layer_self_attn.attn._v_scale = scaling_factor - else: - raise RuntimeError("Self attention has no KV cache scaling " - "factor attribute!") -diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py -index 0662d90e7..4299af8cd 100644 ---- a/vllm/model_executor/models/llava.py -+++ b/vllm/model_executor/models/llava.py -@@ -1,19 +1,19 @@ -+from abc import ABC, abstractmethod - from functools import cached_property --from types import MethodType --from typing import (Iterable, List, Literal, Mapping, Optional, Protocol, Set, -- Tuple, TypedDict, Union) -+from typing import (Final, Iterable, List, Literal, Mapping, Optional, -+ Protocol, Set, Tuple, TypedDict, Union) - - import torch - import torch.nn as nn - from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig, - PixtralVisionConfig, PretrainedConfig, -- ProcessorMixin, SiglipVisionConfig) -+ SiglipVisionConfig) - from transformers.models.llava import LlavaProcessor - from transformers.models.pixtral import PixtralProcessor - - from vllm.attention import AttentionMetadata - from vllm.config import VllmConfig --from vllm.inputs import InputContext -+from vllm.inputs import InputProcessingContext - from vllm.model_executor.layers.activation import get_act_fn - from vllm.model_executor.layers.linear import (ColumnParallelLinear, - RowParallelLinear) -@@ -21,22 +21,25 @@ from vllm.model_executor.layers.quantization import QuantizationConfig - from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler - from vllm.model_executor.sampling_metadata import SamplingMetadata - from vllm.multimodal import MULTIMODAL_REGISTRY --from vllm.multimodal.inputs import NestedTensors -+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, -+ MultiModalInputsV2, MultiModalKwargs, -+ NestedTensors) -+from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, -+ ImageSize) - from vllm.multimodal.processing import (BaseMultiModalProcessor, -- MultiModalDataItems, ProcessorInputs, -- PromptReplacement) -+ MultiModalDataItems, ProcessingCache, -+ ProcessingMixin, PromptReplacement) -+from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs - from vllm.sequence import IntermediateTensors - --from .clip import (CLIPVisionModel, dummy_image_for_clip, -- get_max_clip_image_tokens) -+from .clip import CLIPVisionModel - from .interfaces import 
SupportsMultiModal, SupportsPP --from .pixtral import (PixtralHFVisionModel, dummy_image_for_pixtral_hf, -- get_max_pixtral_hf_image_tokens, -- get_pixtral_hf_image_feature_size) --from .siglip import (SiglipVisionModel, dummy_image_for_siglip, -- get_max_siglip_image_tokens) -+from .pixtral import (PixtralHFVisionModel, -+ get_pixtral_hf_image_feature_grid_size) -+from .siglip import SiglipVisionModel - from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, - maybe_prefix, merge_multimodal_embeddings) -+from .vision import get_vision_encoder_info - - - class LlavaImagePixelInputs(TypedDict): -@@ -91,140 +94,295 @@ class LlavaMultiModalProjector(nn.Module): - return hidden_states - - --def get_max_llava_image_tokens(ctx: InputContext): -- hf_config = ctx.get_hf_config(LlavaConfig) -- vision_config = hf_config.vision_config -+class LlavaLikeConfig(Protocol): -+ vision_config: Final[PretrainedConfig] -+ image_token_index: Final[int] -+ vision_feature_select_strategy: Final[str] -+ vision_feature_layer: Final[Union[int, list[int]]] - -- if isinstance(vision_config, CLIPVisionConfig): -- num_image_tokens = get_max_clip_image_tokens(vision_config) -- elif isinstance(vision_config, SiglipVisionConfig): -- num_image_tokens = get_max_siglip_image_tokens(vision_config) -- elif isinstance(vision_config, PixtralVisionConfig): -- num_image_tokens = get_max_pixtral_hf_image_tokens(vision_config) -- else: -- msg = f"Unsupported vision config: {type(vision_config)}" -+ -+class LlavaLikeProcessor(Protocol): -+ image_token: Final[str] -+ -+ -+class BaseLlavaProcessingMixin(ProcessingMixin, ABC): -+ -+ def _get_hf_config(self) -> LlavaLikeConfig: -+ return self.ctx.get_hf_config(LlavaConfig) -+ -+ def _get_vision_encoder_info(self): -+ return get_vision_encoder_info(self._get_hf_config()) -+ -+ @abstractmethod -+ def _get_hf_processor(self) -> LlavaLikeProcessor: -+ raise NotImplementedError -+ -+ def _get_num_image_tokens( -+ self, -+ *, -+ image_width: int, -+ image_height: int, -+ ) -> int: -+ hf_config = self._get_hf_config() -+ vision_encoder_info = self._get_vision_encoder_info() -+ -+ return self._apply_feature_select_strategy( -+ hf_config.vision_feature_select_strategy, -+ vision_encoder_info.get_num_image_tokens( -+ image_width=image_width, -+ image_height=image_height, -+ ), -+ ) -+ -+ def _apply_feature_select_strategy( -+ self, -+ strategy: str, -+ encoder_num_image_tokens: int, -+ ) -> int: -+ if strategy == "default": -+ return encoder_num_image_tokens - 1 -+ if strategy == "full": -+ return encoder_num_image_tokens -+ -+ msg = f"Unexpected feature select strategy: {strategy!r}" - raise NotImplementedError(msg) - -- strategy = hf_config.vision_feature_select_strategy -- if strategy == "default": -- return num_image_tokens - 1 -- elif strategy == "full": -- return num_image_tokens -- else: -- raise ValueError(f"Unexpected select feature strategy: {strategy}") - -+class BaseLlavaProfilingInfo(BaseLlavaProcessingMixin, BaseProfilingInfo): -+ -+ def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: -+ return {"image": None} -+ -+ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: -+ return {"image": self._get_max_image_tokens()} -+ -+ def _get_image_size_with_most_features(self) -> ImageSize: -+ vision_encoder_info = self._get_vision_encoder_info() -+ width = height = vision_encoder_info.get_image_size() -+ return ImageSize(width=width, height=height) -+ -+ def _get_max_image_tokens(self) -> int: -+ target_width, target_height = 
self._get_image_size_with_most_features() -+ -+ return self._get_num_image_tokens( -+ image_width=target_width, -+ image_height=target_height, -+ ) -+ -+ def get_dummy_processor_inputs( -+ self, -+ seq_len: int, -+ mm_counts: Mapping[str, int], -+ ) -> ProcessorInputs: -+ num_images = mm_counts.get("image", 0) -+ -+ processor = self._get_hf_processor() -+ image_token = processor.image_token -+ target_width, target_height = self._get_image_size_with_most_features() -+ -+ mm_data = { -+ "image": -+ self._get_dummy_images(width=target_width, -+ height=target_height, -+ num_images=num_images) -+ } -+ -+ return ProcessorInputs( -+ prompt_text=image_token * num_images, -+ mm_data=mm_data, -+ ) - --class LlavaMultiModalProcessor(BaseMultiModalProcessor): - -- def _patch_pixtral_processor(self, hf_processor: PixtralProcessor): -- if getattr(hf_processor, "__is_patched__", False): -- return # Already patched -+class LlavaProcessingMixin(BaseLlavaProcessingMixin): - -- image_processor = hf_processor.image_processor # type: ignore -- orig_preprocess = image_processor.preprocess -+ def _get_hf_processor(self): -+ return self.ctx.get_hf_processor(LlavaProcessor) - -- def preprocess(__self, *args, **kwargs): -- hf_inputs = orig_preprocess(*args, **kwargs) -- hf_inputs["is_pixtral"] = torch.tensor(True) -- return hf_inputs - -- image_processor.preprocess = MethodType(preprocess, image_processor) -+class LlavaProfilingInfo(LlavaProcessingMixin, BaseLlavaProfilingInfo): -+ pass - -- hf_processor.__is_patched__ = True # type: ignore - -- def _get_hf_processor(self) -> Union[LlavaProcessor, PixtralProcessor]: -- hf_processor = self.ctx.get_hf_processor( -- (LlavaProcessor, PixtralProcessor)) -+class BaseLlavaMultiModalProcessor(LlavaProcessingMixin, -+ BaseMultiModalProcessor): - -- if isinstance(hf_processor, PixtralProcessor): -- self._patch_pixtral_processor(hf_processor) -+ # Copied from BaseMultiModalProcessor -+ @abstractmethod -+ def _get_profiling_info(self) -> BaseProfilingInfo: -+ raise NotImplementedError - -- return hf_processor -+ # Copied from BaseMultiModalProcessor -+ @abstractmethod -+ def _get_mm_fields_config( -+ self, -+ hf_inputs: BatchFeature, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ ) -> Mapping[str, MultiModalFieldConfig]: -+ raise NotImplementedError - - def _get_prompt_replacements( - self, - mm_items: MultiModalDataItems, -- hf_inputs: BatchFeature, -- mm_processor_kwargs: Mapping[str, object], -+ hf_processor_mm_kwargs: Mapping[str, object], -+ out_mm_kwargs: MultiModalKwargs, - ) -> list[PromptReplacement]: -- hf_config = self.ctx.get_hf_config(LlavaConfig) -+ hf_config = self._get_hf_config() - image_token_id = hf_config.image_token_index - -- processor = self._get_hf_processor() -- if isinstance(processor, PixtralProcessor): -- image_token = processor.image_token -- image_break_token = processor.image_break_token -- image_end_token = processor.image_end_token -- -- vision_config = hf_config.vision_config -- assert isinstance(vision_config, PixtralVisionConfig) -- -- def get_replacement_pixtral(item_idx: int): -- image_size = mm_items.get_image_size(item_idx) -- ( -- num_width_tokens, -- num_height_tokens, -- ) = get_pixtral_hf_image_feature_size( -- vision_config, -+ def get_replacement(item_idx: int): -+ images = mm_items.get_items( -+ "image", (ImageEmbeddingItems, ImageProcessorItems)) -+ -+ if isinstance(images, ImageEmbeddingItems): -+ num_image_tokens = images.get_feature_size(item_idx) -+ else: -+ image_size = images.get_image_size(item_idx) -+ 
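# Illustrative sketch (not part of the diff): _apply_feature_select_strategy above mirrors
# HF LLaVA's vision_feature_select_strategy -- "default" yields one fewer placeholder than
# the encoder emits (the CLS token is dropped), "full" keeps every token. Standalone form
# (function name is illustrative):
def apply_feature_select_strategy(strategy: str, encoder_num_image_tokens: int) -> int:
    if strategy == "default":
        return encoder_num_image_tokens - 1  # drop the CLS token
    if strategy == "full":
        return encoder_num_image_tokens
    raise NotImplementedError(f"Unexpected feature select strategy: {strategy!r}")

# e.g. a 336px CLIP ViT-L/14 encoder produces a 24*24 = 576 patch grid plus CLS = 577 tokens,
# so "default" gives 576 image placeholders and "full" gives 577.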
num_image_tokens = self._get_num_image_tokens( - image_width=image_size.width, - image_height=image_size.height, - ) - -- tokens = ([image_token] * num_width_tokens + -- [image_break_token]) * num_height_tokens -- tokens[-1] = image_end_token -- -- return "".join(tokens) -- -- return [ -- PromptReplacement( -- modality="image", -- target=[image_token_id], -- replacement=get_replacement_pixtral, -- ), -- ] -- -- max_image_tokens = get_max_llava_image_tokens(self.ctx) -+ return [image_token_id] * num_image_tokens - - return [ - PromptReplacement( - modality="image", - target=[image_token_id], -- replacement=[image_token_id] * max_image_tokens, -- ) -+ replacement=get_replacement, -+ ), - ] - -- def _get_dummy_mm_inputs( -+ -+class LlavaMultiModalProcessor(BaseLlavaMultiModalProcessor): -+ -+ def _get_profiling_info(self) -> BaseProfilingInfo: -+ return LlavaProfilingInfo(self.ctx) -+ -+ def _get_mm_fields_config( - self, -- mm_counts: Mapping[str, int], -- ) -> ProcessorInputs: -- hf_config = self.ctx.get_hf_config(LlavaConfig) -- vision_config = hf_config.vision_config -- num_images = mm_counts["image"] -+ hf_inputs: BatchFeature, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ ) -> Mapping[str, MultiModalFieldConfig]: -+ return dict( -+ pixel_values=MultiModalFieldConfig.batched("image"), -+ image_embeds=MultiModalFieldConfig.batched("image"), -+ ) - -- if isinstance(vision_config, CLIPVisionConfig): -- data = dummy_image_for_clip(vision_config, num_images) -- elif isinstance(vision_config, SiglipVisionConfig): -- data = dummy_image_for_siglip(vision_config, num_images) -- elif isinstance(vision_config, PixtralVisionConfig): -- data = dummy_image_for_pixtral_hf(vision_config, num_images) -- else: -- msg = f"Unsupported vision config: {type(vision_config)}" -- raise NotImplementedError(msg) - -- hf_processor = self._get_hf_processor() -- image_token = hf_processor.image_token -+class PixtralHFProcessingMixin(BaseLlavaProcessingMixin): - -- return ProcessorInputs( -- prompt_text=image_token * num_images, -- mm_data=data, -- mm_processor_kwargs={}, -+ def _get_hf_processor(self): -+ return self.ctx.get_hf_processor(PixtralProcessor) -+ -+ -+class PixtralHFProfilingInfo(PixtralHFProcessingMixin, BaseLlavaProfilingInfo): -+ pass -+ -+ -+class PixtralHFMultiModalProcessor(PixtralHFProcessingMixin, -+ BaseMultiModalProcessor): -+ -+ def _get_profiling_info(self) -> BaseProfilingInfo: -+ return PixtralHFProfilingInfo(self.ctx) -+ -+ def _call_hf_processor( -+ self, -+ prompt: str, -+ mm_data: Mapping[str, object], -+ mm_kwargs: Mapping[str, object], -+ ) -> BatchFeature: -+ processed_outputs = super()._call_hf_processor( -+ prompt=prompt, -+ mm_data=mm_data, -+ mm_kwargs=mm_kwargs, - ) - -+ pixel_values = processed_outputs.get("pixel_values") -+ if pixel_values is not None: -+ images = mm_data["images"] -+ assert isinstance(images, list) -+ -+ # Original output: (1, num_images, C, H, W) -+ # New output: (num_images, C, H, W) -+ assert (isinstance(pixel_values, list) and len(pixel_values) == 1) -+ assert (isinstance(pixel_values[0], list) -+ and len(pixel_values[0]) == len(images)) - --class LlavaLikeConfig(Protocol): -- vision_config: PretrainedConfig -- vision_feature_layer: Union[int, List[int]] -+ processed_outputs["pixel_values"] = pixel_values[0] -+ -+ return processed_outputs -+ -+ def _get_mm_fields_config( -+ self, -+ hf_inputs: BatchFeature, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ ) -> Mapping[str, MultiModalFieldConfig]: -+ return dict( -+ 
pixel_values=MultiModalFieldConfig.batched("image"), -+ image_embeds=MultiModalFieldConfig.batched("image"), -+ ) -+ -+ def _get_prompt_replacements( -+ self, -+ mm_items: MultiModalDataItems, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ out_mm_kwargs: MultiModalKwargs, -+ ) -> list[PromptReplacement]: -+ hf_config = self._get_hf_config() -+ image_token_id = hf_config.image_token_index -+ -+ processor = self._get_hf_processor() -+ image_token = processor.image_token -+ image_break_token = processor.image_break_token -+ image_end_token = processor.image_end_token -+ -+ vision_config = hf_config.vision_config -+ assert isinstance(vision_config, PixtralVisionConfig) -+ -+ def get_replacement(item_idx: int): -+ images = mm_items.get_items("image", ImageProcessorItems) -+ image_size = images.get_image_size(item_idx) -+ -+ ncols, nrows = get_pixtral_hf_image_feature_grid_size( -+ vision_config, -+ image_width=image_size.width, -+ image_height=image_size.height, -+ ) -+ -+ tokens = ([image_token] * ncols + [image_break_token]) * nrows -+ tokens[-1] = image_end_token -+ -+ return "".join(tokens) -+ -+ return [ -+ PromptReplacement( -+ modality="image", -+ target=[image_token_id], -+ replacement=get_replacement, -+ ), -+ ] -+ -+ -+def _build_llava_or_pixtral_hf_processor( -+ ctx: InputProcessingContext, -+ *, -+ cache: Optional[ProcessingCache] = None, -+ enable_sanity_checks: bool = True, -+) -> BaseMultiModalProcessor: -+ hf_config = ctx.get_hf_config(LlavaConfig) -+ -+ if isinstance(hf_config.vision_config, PixtralVisionConfig): -+ return PixtralHFMultiModalProcessor( -+ ctx, -+ cache=cache, -+ enable_sanity_checks=enable_sanity_checks, -+ ) -+ -+ return LlavaMultiModalProcessor( -+ ctx, -+ cache=cache, -+ enable_sanity_checks=enable_sanity_checks, -+ ) - - - def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int: -@@ -302,8 +460,7 @@ def init_vision_tower_for_llava( - raise NotImplementedError(msg) - - --@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens) --@MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor) -+@MULTIMODAL_REGISTRY.register_processor(_build_llava_or_pixtral_hf_processor) - class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { -@@ -379,7 +536,6 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def _parse_and_validate_image_input( - self, **kwargs: object) -> Optional[LlavaImageInputs]: - pixel_values = kwargs.pop("pixel_values", None) -- is_pixtral = kwargs.pop("is_pixtral", torch.tensor([False])) - image_embeds = kwargs.pop("image_embeds", None) - - if pixel_values is None and image_embeds is None: -@@ -390,33 +546,6 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - raise ValueError("Incorrect type of pixel values. 
" - f"Got type: {type(pixel_values)}") - -- assert isinstance(is_pixtral, torch.Tensor) -- if is_pixtral.any(): -- images = pixel_values -- -- def flatten_to_3d_tensors(item): -- if isinstance(item, torch.Tensor): -- if item.dim() >= 3: -- return [t for t in item.view(-1, *item.shape[-3:])] -- else: -- raise ValueError( -- f"Unexpected tensor dimension: {item.dim()}") -- elif isinstance(item, list): -- return [ -- t for subitem in item -- for t in flatten_to_3d_tensors(subitem) -- ] -- else: -- raise ValueError(f"Unexpected type: {type(item)}") -- -- # Restructure the batched images into a list of lists of images -- images = flatten_to_3d_tensors(pixel_values) -- -- return LlavaImagePixelInputs( -- type="pixel_values", -- data=images, -- ) -- - return LlavaImagePixelInputs( - type="pixel_values", - data=self._validate_pixel_values( -@@ -586,24 +715,81 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - - class MantisMultiModalProcessor(LlavaMultiModalProcessor): - -- def _get_hf_processor(self) -> ProcessorMixin: -- try: -- from mantis.models.mllava import MLlavaProcessor -- except ModuleNotFoundError as exc: -- raise ModuleNotFoundError( -- "You need to `pip install " -- "git+https://github.com/TIGER-AI-Lab/Mantis.git` " -- "to use this model") from exc -+ def apply( -+ self, -+ prompt_text: str, -+ mm_data: MultiModalDataDict, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ ) -> MultiModalInputsV2: -+ hf_config = self._get_hf_config() -+ image_token_id = hf_config.image_token_index -+ -+ # Assume that it doesn't depend on the image size -+ num_image_tokens = self._get_num_image_tokens( -+ image_width=-1, -+ image_height=-1, -+ ) -+ -+ result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) - -- processor = MLlavaProcessor.from_pretrained( -- self.ctx.model_config.tokenizer) -- assert isinstance(processor, ProcessorMixin) -- return processor -+ mm_items = self._to_mm_items(mm_data) -+ mm_item_counts = mm_items.get_all_counts() -+ mm_kwargs = result["mm_kwargs"] -+ -+ # We reimplement the functionality of MLlavaProcessor from -+ # https://github.com/TIGER-AI-Lab/Mantis.git -+ def get_replacement_mantis(item_idx: int): -+ return "".join([ -+ f"(image {item_idx+1}: ", # 7 tokens -+ "" * num_image_tokens, -+ ")", # 3 tokens -+ ]) -+ -+ mantis_mm_repls = self._bind_and_group_repls([ -+ PromptReplacement( -+ modality="image", -+ target=[image_token_id] * num_image_tokens, -+ replacement=get_replacement_mantis, -+ ) -+ ]) -+ -+ prompt_ids, prompt_text, _ = self._apply_prompt_replacements( -+ result["prompt_token_ids"], -+ mantis_mm_repls, -+ mm_item_counts, -+ ) -+ -+ unbound_orig_repls = self._get_prompt_replacements( -+ mm_items, -+ hf_processor_mm_kwargs, -+ mm_kwargs, -+ ) -+ orig_repls = self._bind_and_group_repls(unbound_orig_repls) -+ -+ mm_placeholders = self._find_mm_placeholders( -+ orig_repls, -+ prompt_ids, -+ mm_item_counts, -+ ) -+ -+ self._validate_mm_placeholders(mm_placeholders, mm_item_counts) -+ -+ mm_placeholder_ranges = { -+ modality: [item.to_range() for item in placeholders] -+ for modality, placeholders in mm_placeholders.items() -+ } -+ -+ return MultiModalInputsV2( -+ type="multimodal", -+ prompt=prompt_text, -+ prompt_token_ids=prompt_ids, -+ mm_kwargs=mm_kwargs, -+ mm_placeholders=mm_placeholder_ranges, -+ ) - - - # To use this model, please use - # `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` --@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens) - 
@MULTIMODAL_REGISTRY.register_processor(MantisMultiModalProcessor) - class MantisForConditionalGeneration(LlavaForConditionalGeneration): - pass -diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py -index a39f2f412..c76ec164a 100644 ---- a/vllm/model_executor/models/llava_next.py -+++ b/vllm/model_executor/models/llava_next.py -@@ -1,34 +1,31 @@ - from functools import cached_property --from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, -- TypedDict, Union) -+from typing import (Final, Iterable, List, Literal, Mapping, Optional, -+ Protocol, Set, Tuple, TypedDict, Union) - -+import numpy as np - import torch - import torch.nn as nn --from PIL import Image --from transformers import CLIPVisionConfig, LlavaNextConfig, SiglipVisionConfig -+from transformers import BatchFeature, LlavaNextConfig, LlavaNextProcessor - from transformers.models.llava_next.modeling_llava_next import ( - get_anyres_image_grid_shape, unpad_image) - from typing_extensions import NotRequired - - from vllm.attention import AttentionMetadata - from vllm.config import VllmConfig --from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, -- InputContext) - from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler - from vllm.model_executor.sampling_metadata import SamplingMetadata - from vllm.multimodal import MULTIMODAL_REGISTRY --from vllm.multimodal.inputs import NestedTensors -+from vllm.multimodal.inputs import MultiModalFieldConfig, NestedTensors -+from vllm.multimodal.parse import ImageSize -+from vllm.multimodal.profiling import BaseProfilingInfo - from vllm.sequence import IntermediateTensors --from vllm.utils import is_list_of - --from .clip import (CLIPVisionModel, dummy_image_for_clip, -- dummy_seq_data_for_clip, get_clip_image_feature_size, -- get_clip_patch_grid_length, input_processor_for_clip) -+from .clip import CLIPVisionModel - from .interfaces import SupportsMultiModal, SupportsPP --from .llava import LlavaMultiModalProjector, init_vision_tower_for_llava --from .siglip import (SiglipVisionModel, dummy_image_for_siglip, -- dummy_seq_data_for_siglip, get_siglip_image_feature_size, -- get_siglip_patch_grid_length, input_processor_for_siglip) -+from .llava import (BaseLlavaMultiModalProcessor, BaseLlavaProcessingMixin, -+ BaseLlavaProfilingInfo, LlavaLikeConfig, -+ LlavaMultiModalProjector, init_vision_tower_for_llava) -+from .siglip import SiglipVisionModel - from .utils import (AutoWeightsLoader, embed_multimodal, flatten_bn, - init_vllm_registered_model, maybe_prefix) - -@@ -65,218 +62,132 @@ LlavaNextImageInputs = Union[LlavaNextImagePixelInputs, - LlavaNextImageEmbeddingInputs] - - --# Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L79 --def _get_llava_next_num_unpadded_features( -- original_height: int, -- original_width: int, -- npatches: int, -- num_patch_height: int, -- num_patch_width: int, --) -> Tuple[int, int]: -- current_height = npatches * num_patch_height -- current_width = npatches * num_patch_width -- -- original_aspect_ratio = original_width / original_height -- current_aspect_ratio = current_width / current_height -- -- if original_aspect_ratio > current_aspect_ratio: -- scale_factor = current_width / original_width -- new_height = int(original_height * scale_factor) -- padding = (current_height - new_height) // 2 -- current_height -= 2 * padding -- else: -- scale_factor = current_height / original_height 
-- new_width = int(original_width * scale_factor) -- padding = (current_width - new_width) // 2 -- current_width -= 2 * padding -- -- unpadded_features = current_height * current_width -- newline_features = current_height -- return (unpadded_features, newline_features) -- -- --# Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L106 --def get_llava_next_image_feature_size( -- hf_config: LlavaNextConfig, -- *, -- input_height: int, -- input_width: int, --) -> int: -- vision_config = hf_config.vision_config -- -- if isinstance(vision_config, CLIPVisionConfig): -- num_patches = get_clip_patch_grid_length( -- image_size=vision_config.image_size, -- patch_size=vision_config.patch_size, -- ) -- base_feature_size = get_clip_image_feature_size(vision_config) -- elif isinstance(vision_config, SiglipVisionConfig): -- num_patches = get_siglip_patch_grid_length( -- image_size=vision_config.image_size, -- patch_size=vision_config.patch_size, -- ) -- base_feature_size = get_siglip_image_feature_size(vision_config) -- else: -- msg = f"Unsupported vision config: {type(vision_config)}" -- raise NotImplementedError(msg) -- -- strategy = hf_config.vision_feature_select_strategy -- if strategy == "default": -- base_feature_size -= 1 -- elif strategy == "full": -- pass -- else: -- raise ValueError(f"Unexpected select feature strategy: {strategy}") -+class LlavaNextLikeConfig(LlavaLikeConfig, Protocol): -+ image_grid_pinpoints: Final[list[list[int]]] - -- num_patch_height, num_patch_width = get_anyres_image_grid_shape( -- image_size=(input_height, input_width), -- grid_pinpoints=hf_config.image_grid_pinpoints, -- patch_size=vision_config.image_size, -- ) -- -- ( -- unpadded_feature_size, -- newline_feature_size, -- ) = _get_llava_next_num_unpadded_features(input_height, input_width, -- num_patches, num_patch_height, -- num_patch_width) -- -- return unpadded_feature_size + newline_feature_size + base_feature_size -- -- --def get_max_llava_next_image_tokens(ctx: InputContext): -- """Compute the max feature size for all possible image grid pinpoints.""" -- return _get_pinpoint_with_largest_features(ctx)[0] -- -- --def _get_pinpoint_with_largest_features( -- ctx: InputContext) -> Tuple[int, Tuple[int, int]]: -- """Get the grid pinpoint with the largest features & its feature size.""" -- hf_config = ctx.get_hf_config(LlavaNextConfig) -- largest_feature_size = 0 -- largest_feature_pinpoint = None -- for (height, width) in hf_config.image_grid_pinpoints: -- feat_size = get_llava_next_image_feature_size( -- hf_config, -- input_height=height, -- input_width=width, -- ) -- if feat_size > largest_feature_size: -- largest_feature_size = feat_size -- largest_feature_pinpoint = (height, width) -- if not largest_feature_size or largest_feature_pinpoint is None: -- raise ValueError("Cannot have a largest feature size of 0!") -- return largest_feature_size, largest_feature_pinpoint -- -- --def dummy_data_for_llava_next(ctx: InputContext, seq_len: int, -- mm_counts: Mapping[str, int]): -- hf_config = ctx.get_hf_config(LlavaNextConfig) -- vision_config = hf_config.vision_config -- num_images = mm_counts["image"] -- -- image_feature_size, pinpoint = _get_pinpoint_with_largest_features(ctx) -- max_feat_height, max_feat_width = pinpoint -- -- if isinstance(vision_config, CLIPVisionConfig): -- seq_data, ranges = dummy_seq_data_for_clip( -- vision_config, -- seq_len, -- num_images, -- image_token_id=hf_config.image_token_index, -- 
image_feature_size_override=image_feature_size, -- ) - -- mm_data = dummy_image_for_clip( -- vision_config, -- num_images, -- image_width_override=max_feat_width, -- image_height_override=max_feat_height, -+class LlavaNextProcessingMixin(BaseLlavaProcessingMixin): -+ -+ def _get_hf_config(self) -> LlavaNextLikeConfig: -+ return self.ctx.get_hf_config(LlavaNextConfig) -+ -+ def _get_hf_processor(self): -+ return self.ctx.get_hf_processor(LlavaNextProcessor) -+ -+ # Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L106 -+ def _get_num_image_tokens( -+ self, -+ *, -+ image_width: int, -+ image_height: int, -+ ) -> int: -+ hf_config = self._get_hf_config() -+ vision_encoder_info = self._get_vision_encoder_info() -+ -+ base_feature_size = self._apply_feature_select_strategy( -+ hf_config.vision_feature_select_strategy, -+ vision_encoder_info.get_num_image_tokens( -+ image_width=image_width, -+ image_height=image_height, -+ ), - ) - -- return DummyData(seq_data, mm_data, ranges) -- elif isinstance(vision_config, SiglipVisionConfig): -- seq_data, ranges = dummy_seq_data_for_siglip( -- vision_config, -- seq_len, -- num_images, -- image_token_id=hf_config.image_token_index, -- image_feature_size_override=image_feature_size, -+ num_patch_height, num_patch_width = get_anyres_image_grid_shape( -+ image_size=(image_height, image_width), -+ grid_pinpoints=hf_config.image_grid_pinpoints, -+ patch_size=vision_encoder_info.get_image_size(), - ) - -- mm_data = dummy_image_for_siglip( -- vision_config, -- num_images, -- image_width_override=max_feat_width, -- image_height_override=max_feat_height, -+ ( -+ unpadded_feature_size, -+ newline_feature_size, -+ ) = self._get_num_unpadded_features( -+ original_height=image_height, -+ original_width=image_width, -+ npatches=vision_encoder_info.get_patch_grid_length(), -+ num_patch_height=num_patch_height, -+ num_patch_width=num_patch_width, - ) - -- return DummyData(seq_data, mm_data, ranges) -+ return unpadded_feature_size + newline_feature_size + base_feature_size - -- msg = f"Unsupported vision config: {type(vision_config)}" -- raise NotImplementedError(msg) -+ # Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L79 -+ def _get_num_unpadded_features( -+ self, -+ *, -+ original_height: int, -+ original_width: int, -+ npatches: int, -+ num_patch_height: int, -+ num_patch_width: int, -+ ) -> tuple[int, int]: -+ current_height = npatches * num_patch_height -+ current_width = npatches * num_patch_width -+ -+ # NOTE: Use float32 to remain consistent with HF output -+ original_aspect_ratio = np.array(original_width / original_height, -+ dtype=np.float32) -+ current_aspect_ratio = np.array(current_width / current_height, -+ dtype=np.float32) -+ -+ if original_aspect_ratio > current_aspect_ratio: -+ scale_factor = np.array(current_width / original_width, -+ dtype=np.float32) -+ new_height = int(original_height * scale_factor) -+ padding = (current_height - new_height) // 2 -+ current_height -= 2 * padding -+ else: -+ scale_factor = np.array(current_height / original_height, -+ dtype=np.float32) -+ new_width = int(original_width * scale_factor) -+ padding = (current_width - new_width) // 2 -+ current_width -= 2 * padding - -+ unpadded_features = current_height * current_width -+ newline_features = current_height - --def input_processor_for_llava_next(ctx: InputContext, -- inputs: DecoderOnlyInputs): -- 
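# Illustrative sketch (not part of the diff): the new _get_num_unpadded_features above
# recomputes how many patch features survive the anyres unpadding step, using float32
# ratios so the result stays consistent with the HF implementation. A standalone version
# under those same assumptions (helper name is illustrative):
import numpy as np

def num_unpadded_features(original_height: int, original_width: int,
                          npatches: int, num_patch_height: int,
                          num_patch_width: int) -> tuple:
    current_height = npatches * num_patch_height
    current_width = npatches * num_patch_width

    original_aspect_ratio = np.float32(original_width) / np.float32(original_height)
    current_aspect_ratio = np.float32(current_width) / np.float32(current_height)

    if original_aspect_ratio > current_aspect_ratio:
        # wider than the padded canvas: rows were padded top and bottom
        scale = np.float32(current_width) / np.float32(original_width)
        new_height = int(original_height * scale)
        padding = (current_height - new_height) // 2
        current_height -= 2 * padding
    else:
        # taller than the padded canvas: columns were padded left and right
        scale = np.float32(current_height) / np.float32(original_height)
        new_width = int(original_width * scale)
        padding = (current_width - new_width) // 2
        current_width -= 2 * padding

    unpadded_features = current_height * current_width
    newline_features = current_height  # one newline feature per remaining row
    return unpadded_features, newline_features

# e.g. num_unpadded_features(768, 1536, npatches=24, num_patch_height=2, num_patch_width=2)
# scales a 48x48 patch canvas down to 24 rows -> (1152, 24)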
multi_modal_data = inputs.get("multi_modal_data") -- if multi_modal_data is None or "image" not in multi_modal_data: -- return inputs -+ return (unpadded_features, newline_features) - -- model_config = ctx.model_config -- hf_config = ctx.get_hf_config(LlavaNextConfig) -- vision_config = hf_config.vision_config - -- image_data = multi_modal_data["image"] -- if isinstance(image_data, Image.Image): -- width, height = image_data.size -+class LlavaNextProfilingInfo(LlavaNextProcessingMixin, BaseLlavaProfilingInfo): - -- image_feature_size = get_llava_next_image_feature_size( -- hf_config, -- input_height=height, -- input_width=width, -- ) -- elif is_list_of(image_data, Image.Image): -- image_feature_size = [ -- get_llava_next_image_feature_size(hf_config, -- input_height=img.height, -- input_width=img.width) -- for img in image_data -- ] -- elif isinstance(image_data, torch.Tensor): -- num_images, image_feature_size, hidden_size = image_data.shape -- elif is_list_of(image_data, torch.Tensor): -- image_feature_size = [item.shape[1] for item in image_data] -- else: -- raise TypeError(f"Invalid image type: {type(image_data)}") -- -- vision_config = hf_config.vision_config -- -- if isinstance(vision_config, CLIPVisionConfig): -- return input_processor_for_clip( -- model_config, -- vision_config, -- inputs, -- image_token_id=hf_config.image_token_index, -- image_feature_size_override=image_feature_size, -- ) -- elif isinstance(vision_config, SiglipVisionConfig): -- return input_processor_for_siglip( -- model_config, -- vision_config, -- inputs, -- image_token_id=hf_config.image_token_index, -- image_feature_size_override=image_feature_size, -- ) -+ def _get_image_size_with_most_features(self) -> ImageSize: -+ hf_config = self._get_hf_config() -+ -+ largest_feature_size, largest_feature_pinpoint = 0, None -+ for (height, width) in hf_config.image_grid_pinpoints: -+ feat_size = self._get_num_image_tokens(image_width=width, -+ image_height=height) -+ if feat_size > largest_feature_size: -+ largest_feature_size = feat_size -+ largest_feature_pinpoint = ImageSize(width=width, -+ height=height) -+ -+ if largest_feature_size == 0 or largest_feature_pinpoint is None: -+ raise ValueError("Cannot have a largest feature size of 0!") - -- msg = f"Unsupported vision config: {type(vision_config)}" -- raise NotImplementedError(msg) -+ return largest_feature_pinpoint - - --@MULTIMODAL_REGISTRY.register_image_input_mapper() --@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_next_image_tokens) --@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_next) --@INPUT_REGISTRY.register_input_processor(input_processor_for_llava_next) -+class LlavaNextMultiModalProcessor(LlavaNextProcessingMixin, -+ BaseLlavaMultiModalProcessor): -+ -+ def _get_profiling_info(self) -> BaseProfilingInfo: -+ return LlavaNextProfilingInfo(self.ctx) -+ -+ def _get_mm_fields_config( -+ self, -+ hf_inputs: BatchFeature, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ ) -> Mapping[str, MultiModalFieldConfig]: -+ return dict( -+ pixel_values=MultiModalFieldConfig.batched("image"), -+ image_sizes=MultiModalFieldConfig.batched("image"), -+ image_embeds=MultiModalFieldConfig.batched("image"), -+ ) -+ -+ -+@MULTIMODAL_REGISTRY.register_processor(LlavaNextMultiModalProcessor) - class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, - SupportsPP): - -@@ -507,7 +418,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, - def _process_image_pixels( - self, - inputs: LlavaNextImagePixelInputs, -- ) -> 
Union[torch.Tensor, List[torch.Tensor]]: -+ ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]: - assert self.vision_tower is not None - - pixel_values = inputs["data"] -@@ -528,10 +439,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, - stacked_image_features = self._image_pixels_to_features( - self.vision_tower, stacked_pixel_values) - -- return [ -- self.multi_modal_projector(image_features) for image_features in -- torch.split(stacked_image_features, num_patches_per_batch) -- ] -+ return torch.split(self.multi_modal_projector(stacked_image_features), -+ num_patches_per_batch) - - def _process_image_input( - self, -diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py -index 0de9d8c5e..6e82cee1c 100644 ---- a/vllm/model_executor/models/llava_next_video.py -+++ b/vllm/model_executor/models/llava_next_video.py -@@ -3,38 +3,35 @@ from functools import cached_property - from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) - --import numpy as np - import torch - import torch.nn as nn --from transformers import (CLIPVisionConfig, LlavaNextVideoConfig, -- SiglipVisionConfig) -+from transformers import (BatchFeature, LlavaNextVideoConfig, -+ LlavaNextVideoProcessor) - - from vllm.attention import AttentionMetadata - from vllm.config import VllmConfig --from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, -- InputContext, token_inputs) - from vllm.model_executor.layers.activation import get_act_fn - from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler - from vllm.model_executor.models.clip import CLIPVisionModel - from vllm.model_executor.sampling_metadata import SamplingMetadata - from vllm.multimodal import MULTIMODAL_REGISTRY --from vllm.multimodal.inputs import NestedTensors --from vllm.multimodal.utils import (cached_get_tokenizer, -- repeat_and_pad_placeholder_tokens) -+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, -+ NestedTensors) -+from vllm.multimodal.parse import (ImageSize, VideoEmbeddingItems, -+ VideoProcessorItems) -+from vllm.multimodal.processing import (BaseMultiModalProcessor, -+ MultiModalDataItems, ProcessingMixin, -+ PromptReplacement) -+from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs - from vllm.sequence import IntermediateTensors - from vllm.utils import is_list_of - --from .clip import dummy_image_for_clip, dummy_seq_data_for_clip - from .interfaces import SupportsMultiModal, SupportsPP - from .llava import init_vision_tower_for_llava --from .siglip import (SiglipVisionModel, dummy_image_for_siglip, -- dummy_seq_data_for_siglip) -+from .siglip import SiglipVisionModel - from .utils import (AutoWeightsLoader, init_vllm_registered_model, - maybe_prefix, merge_multimodal_embeddings) -- --# For profile run --_MAX_FRAMES_PER_VIDEO = 32 --_MAX_NUM_VIDEOS = 1 -+from .vision import get_vision_encoder_info - - - class LlavaNextVideoPixelInputs(TypedDict): -@@ -50,149 +47,175 @@ class LlavaNextVideoPixelInputs(TypedDict): - """ - - --def get_llava_next_video_frame_feature_size( -- hf_config: LlavaNextVideoConfig) -> int: -- # Support both CLIPVisionConfig and SiglipVisionConfig -- image_size = hf_config.vision_config.image_size -- patch_size = hf_config.vision_config.patch_size -- spatial_pool_stride = hf_config.spatial_pool_stride -+class LlavaNextVideoProcessingMixin(ProcessingMixin): - -- return int((image_size / patch_size / spatial_pool_stride)**2) -+ def 
_get_hf_config(self): -+ return self.ctx.get_hf_config(LlavaNextVideoConfig) - -+ def _get_vision_encoder_info(self): -+ return get_vision_encoder_info(self._get_hf_config()) - --def _get_max_llm_tokens(ctx: InputContext) -> int: -- """ -- Calculated from the maximum video frames under the context length -- constraints of the language model. -- """ -- hf_text_config = ctx.model_config.hf_text_config -- model_config = ctx.model_config -- max_tokens = model_config.max_model_len -- rope_scaling = model_config.rope_scaling -- -- if rope_scaling: -- rope_scaling_factor = hf_text_config.rope_scaling["factor"] -- else: -- rope_scaling_factor = 1 -- -- max_tokens *= rope_scaling_factor -- -- return max_tokens -- -- --def get_max_llava_next_video_tokens(ctx: InputContext) -> int: -- # Currently set to 32 frames -- # TODO: max_tokens = _get_max_llm_tokens(ctx) -- hf_config = ctx.get_hf_config(LlavaNextVideoConfig) -- tokens_per_frame = get_llava_next_video_frame_feature_size(hf_config) -- return _MAX_FRAMES_PER_VIDEO * tokens_per_frame -- -- --def dummy_data_for_llava_next_video(ctx: InputContext, seq_len: int, -- mm_counts: Mapping[str, int]): -- hf_config = ctx.get_hf_config(LlavaNextVideoConfig) -- vision_config = hf_config.vision_config -- -- # TODO: support multiple videos -- num_videos = mm_counts["video"] -- if num_videos != _MAX_NUM_VIDEOS: -- raise NotImplementedError( -- f"Only {_MAX_NUM_VIDEOS} videos are supported") -- -- # TODO: support configuring the number of frames -- frames_per_video = _MAX_FRAMES_PER_VIDEO -- # num_images = num_videos * frames_per_video -- -- # fills the sequence with as longer video data as possible -- tokens_per_frame = get_llava_next_video_frame_feature_size(hf_config) -- video_feature_size = frames_per_video * tokens_per_frame -- -- if isinstance(vision_config, CLIPVisionConfig): -- seq_data, ranges = dummy_seq_data_for_clip( -- vision_config, -- seq_len, -- num_videos, -- image_token_id=hf_config.video_token_index, -- image_feature_size_override=video_feature_size, -- mm_key="video", -+ def _get_hf_processor(self): -+ return self.ctx.get_hf_processor(LlavaNextVideoProcessor) -+ -+ def _get_num_frame_tokens( -+ self, -+ *, -+ image_width: int, -+ image_height: int, -+ ) -> int: -+ hf_config = self._get_hf_config() -+ spatial_pool_stride = hf_config.spatial_pool_stride -+ -+ vision_encoder_info = self._get_vision_encoder_info() -+ patch_grid_length = vision_encoder_info.get_patch_grid_length() -+ pooled_grid_length = math.ceil(patch_grid_length / spatial_pool_stride) -+ -+ return pooled_grid_length * pooled_grid_length -+ -+ def _get_num_video_tokens( -+ self, -+ *, -+ image_width: int, -+ image_height: int, -+ num_frames: int, -+ ) -> int: -+ num_frame_tokens = self._get_num_frame_tokens( -+ image_width=image_width, -+ image_height=image_height, - ) - -- pil_frame = dummy_image_for_clip(vision_config, num_images=1) -- np_frame = np.array(pil_frame["image"]) -- mm_data_per_video = np.repeat([np_frame], frames_per_video, axis=0) -- mm_data = {"video": mm_data_per_video} -- return DummyData(seq_data, mm_data, ranges) -- elif isinstance(vision_config, SiglipVisionConfig): -- seq_data, ranges = dummy_seq_data_for_siglip( -- vision_config, -- seq_len, -- num_videos, -- image_token_id=hf_config.video_token_index, -- image_feature_size_override=video_feature_size, -- mm_key="video", -+ return num_frame_tokens * num_frames -+ -+ -+class LlavaNextVideoProfilingInfo(LlavaNextVideoProcessingMixin, -+ BaseProfilingInfo): -+ -+ def get_supported_mm_limits(self) -> 
Mapping[str, Optional[int]]: -+ return {"video": 1} -+ -+ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: -+ target_width, target_height = self._get_image_size_with_most_features() -+ -+ max_video_tokens = self._get_num_video_tokens( -+ image_width=target_width, -+ image_height=target_height, -+ num_frames=self._get_dummy_num_frames(seq_len), - ) - -- pil_frame = dummy_image_for_siglip(vision_config, num_images=1) -- np_frame = np.array(pil_frame["image"]) -- mm_data_per_video = np.repeat([np_frame], frames_per_video, axis=0) -- mm_data = {"video": mm_data_per_video} -- return DummyData(seq_data, mm_data, ranges) -+ return {"video": max_video_tokens} -+ -+ def _get_image_size_with_most_features(self) -> ImageSize: -+ vision_encoder_info = self._get_vision_encoder_info() -+ width = height = vision_encoder_info.get_image_size() -+ return ImageSize(width=width, height=height) - -- msg = f"Unsupported vision config: {type(vision_config)}" -- raise NotImplementedError(msg) -+ def _get_max_video_frames(self, max_tokens: int) -> int: -+ target_width, target_height = self._get_image_size_with_most_features() - -+ num_frames = 0 - --def input_processor_for_llava_next_video(ctx: InputContext, -- inputs: DecoderOnlyInputs): -- multi_modal_data = inputs.get("multi_modal_data") -- if multi_modal_data is None or "video" not in multi_modal_data: -- return inputs -+ while True: -+ next_num_frames = num_frames + 1 -+ next_max_tokens = self._get_num_video_tokens( -+ image_width=target_width, -+ image_height=target_height, -+ num_frames=next_num_frames, -+ ) - -- if "multi_modal_placeholders" in inputs and "video" in inputs[ -- "multi_modal_placeholders"]: -- # The inputs already have placeholders. -- return inputs -+ if next_max_tokens > max_tokens: -+ break - -- video_data = multi_modal_data["video"] -+ num_frames = next_num_frames - -- model_config = ctx.model_config -- hf_config = ctx.get_hf_config(LlavaNextVideoConfig) -- vision_config = hf_config.vision_config -+ return num_frames - -- if isinstance(video_data, np.ndarray): -- # Supports both CLIP and Siglip -- num_frames = video_data.shape[0] -- frame_feature_size = \ -- get_llava_next_video_frame_feature_size(hf_config) -- video_feature_size = num_frames * frame_feature_size -+ def _get_dummy_num_frames(self, seq_len: int) -> int: -+ mm_config = self.ctx.get_mm_config() -+ max_videos = mm_config.limit_per_prompt.get("video", 1) - -- tokenizer = cached_get_tokenizer(model_config.tokenizer) -+ max_total_frames = self._get_max_video_frames(seq_len) - -- new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( -- tokenizer, -- inputs.get("prompt"), -- inputs["prompt_token_ids"], -- placeholder_token_id=hf_config.video_token_index, -- repeat_count=video_feature_size, -+ return max(max_total_frames // max(max_videos, 1), 1) -+ -+ def get_dummy_processor_inputs( -+ self, -+ seq_len: int, -+ mm_counts: Mapping[str, int], -+ ) -> ProcessorInputs: -+ num_videos = mm_counts.get("video", 0) -+ -+ processor = self._get_hf_processor() -+ video_token = processor.video_token -+ target_width, target_height = self._get_image_size_with_most_features() -+ -+ mm_data = { -+ "video": -+ self._get_dummy_videos( -+ width=target_width, -+ height=target_height, -+ num_frames=self._get_dummy_num_frames(seq_len), -+ num_videos=num_videos, -+ ) -+ } -+ -+ return ProcessorInputs( -+ prompt_text=video_token * num_videos, -+ mm_data=mm_data, - ) - -- return token_inputs(prompt_token_ids=new_token_ids, -- prompt=new_prompt, -- 
multi_modal_data=multi_modal_data, -- multi_modal_placeholders={"video": ranges}) - -- elif is_list_of(video_data, np.ndarray): -- raise NotImplementedError( -- "Processing multiple videos is not supported") -+class LlavaNextVideoMultiModalProcessor(LlavaNextVideoProcessingMixin, -+ BaseMultiModalProcessor): -+ -+ def _get_profiling_info(self) -> BaseProfilingInfo: -+ return LlavaNextVideoProfilingInfo(self.ctx) - -- msg = f"Unsupported vision config: {type(vision_config)}" -- raise NotImplementedError(msg) -+ def _get_mm_fields_config( -+ self, -+ hf_inputs: BatchFeature, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ ) -> Mapping[str, MultiModalFieldConfig]: -+ return dict(pixel_values_videos=MultiModalFieldConfig.batched("video")) -+ -+ def _get_prompt_replacements( -+ self, -+ mm_items: MultiModalDataItems, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ out_mm_kwargs: MultiModalKwargs, -+ ) -> list[PromptReplacement]: -+ hf_config = self._get_hf_config() -+ video_token_id = hf_config.video_token_index -+ -+ def get_replacement(item_idx: int): -+ videos = mm_items.get_items( -+ "video", (VideoEmbeddingItems, VideoProcessorItems)) -+ -+ if isinstance(videos, VideoEmbeddingItems): -+ num_video_tokens = videos.get_feature_size(item_idx) -+ else: -+ image_size = videos.get_frame_size(item_idx) -+ num_video_tokens = self._get_num_video_tokens( -+ image_width=image_size.width, -+ image_height=image_size.height, -+ num_frames=videos.get_num_frames(item_idx), -+ ) -+ -+ return [video_token_id] * num_video_tokens -+ -+ return [ -+ PromptReplacement( -+ modality="video", -+ target=[video_token_id], -+ replacement=get_replacement, -+ ), -+ ] - - - # adopted from transformers modeling_llava_next_video.py - class LlavaNextVideoPooler(nn.Module): - -- def __init__(self, config): -+ def __init__(self, config: LlavaNextVideoConfig): - super().__init__() - - mode = config.spatial_pool_mode -@@ -210,7 +233,7 @@ class LlavaNextVideoPooler(nn.Module): - raise ValueError( - f"Unknown pooling mode: {mode}. 
Expected [`average`, `max`]") - -- def forward(self, image_features): -+ def forward(self, image_features: torch.Tensor): - ori_width = int( - math.sqrt(image_features.shape[1] * self.image_size // - self.image_size)) -@@ -246,11 +269,7 @@ class LlavaNextMultiModalProjector(nn.Module): - return hidden_states - - --@MULTIMODAL_REGISTRY.register_input_mapper("video") --@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( -- "video", get_max_llava_next_video_tokens) --@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_next_video) --@INPUT_REGISTRY.register_input_processor(input_processor_for_llava_next_video) -+@MULTIMODAL_REGISTRY.register_processor(LlavaNextVideoMultiModalProcessor) - class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, - SupportsPP): - -diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py -index 0bebc1c74..6dccc1e0d 100644 ---- a/vllm/model_executor/models/llava_onevision.py -+++ b/vllm/model_executor/models/llava_onevision.py -@@ -1,49 +1,40 @@ - import math - from functools import cached_property --from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, -- TypedDict, Union) -+from typing import (Final, Iterable, List, Literal, Mapping, Optional, -+ Protocol, Set, Tuple, TypedDict, Union) - - import numpy as np - import torch - import torch.nn as nn --from PIL import Image --from transformers import (CLIPVisionConfig, LlavaOnevisionConfig, -- SiglipVisionConfig) -+from transformers import (BatchFeature, LlavaOnevisionConfig, -+ LlavaOnevisionProcessor) - from transformers.models.llava_onevision.modeling_llava_onevision import ( - get_anyres_image_grid_shape, unpad_image) - from typing_extensions import NotRequired - - from vllm.attention import AttentionMetadata - from vllm.config import VllmConfig --from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, -- InputContext, token_inputs) - from vllm.model_executor.layers.activation import get_act_fn - from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler - from vllm.model_executor.sampling_metadata import SamplingMetadata - from vllm.multimodal import MULTIMODAL_REGISTRY --from vllm.multimodal.inputs import NestedTensors --from vllm.multimodal.utils import (cached_get_tokenizer, -- repeat_and_pad_placeholder_tokens) -+from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors -+from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems, -+ VideoProcessorItems) -+from vllm.multimodal.processing import MultiModalFieldConfig, PromptReplacement -+from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs - from vllm.sequence import IntermediateTensors - from vllm.utils import is_list_of - --from .clip import (CLIPVisionModel, dummy_seq_data_for_clip, -- dummy_video_for_clip, get_clip_image_feature_size, -- get_clip_patch_grid_length, input_processor_for_clip) -+from .clip import CLIPVisionModel - from .interfaces import SupportsMultiModal, SupportsPP --from .llava import init_vision_tower_for_llava --from .siglip import (SiglipVisionModel, dummy_seq_data_for_siglip, -- dummy_video_for_siglip, get_siglip_image_feature_size, -- get_siglip_patch_grid_length, input_processor_for_siglip) -+from .llava import BaseLlavaProfilingInfo, init_vision_tower_for_llava -+from .llava_next import (LlavaNextLikeConfig, LlavaNextMultiModalProcessor, -+ LlavaNextProcessingMixin) -+from .siglip import SiglipVisionModel - from .utils import (AutoWeightsLoader, flatten_bn, 
init_vllm_registered_model, - maybe_prefix, merge_multimodal_embeddings) - --# Result in the max possible feature size (2x2 grid of 336x336px tiles) --MAX_IMAGE_FEATURE_SIZE_HEIGHT = MAX_IMAGE_FEATURE_SIZE_WIDTH = 448 -- --# For profile run --_MAX_FRAMES_PER_VIDEO = 16 -- - - class LlavaOnevisionVideoPixelInputs(TypedDict): - type: Literal["pixel_values_videos"] -@@ -92,286 +83,278 @@ LlavaOnevisionMultiInputs = Union[LlavaOnevisionImageInputs, - LlavaOnevisionVideoPixelInputs] - - --def _get_llava_onevision_image_unppaded_feature_size(height, width, patches, -- scale_height, -- scale_width): -- current_height = patches * scale_height -- current_width = patches * scale_width -- -- original_aspect_ratio = width / height -- current_aspect_ratio = current_width / current_height -- if original_aspect_ratio > current_aspect_ratio: -- new_height = int(height * (current_width / width)) -- padding = (current_height - new_height) // 2 -- current_height -= padding * 2 -- else: -- new_width = int(width * (current_height / height)) -- padding = (current_width - new_width) // 2 -- current_width -= padding * 2 -- -- unpadded_features = current_height * current_width -- newline_features = current_height -- -- ratio = math.sqrt(current_height * current_width / (9 * patches**2)) -- if ratio > 1.1: -- unpadded_features = int(current_height // ratio) * int( -- current_width // ratio) -- newline_features = int(current_height // ratio) -- -- return (unpadded_features, newline_features) -- -- --def get_llava_onevision_image_feature_size( -- hf_config: LlavaOnevisionConfig, -- *, -- input_height: int, -- input_width: int, --) -> int: -- vision_config = hf_config.vision_config -- -- if isinstance(vision_config, CLIPVisionConfig): -- num_patches = get_clip_patch_grid_length( -- image_size=vision_config.image_size, -- patch_size=vision_config.patch_size, -- ) -- base_feature_size = get_clip_image_feature_size(vision_config) -- elif isinstance(vision_config, SiglipVisionConfig): -- num_patches = get_siglip_patch_grid_length( -- image_size=vision_config.image_size, -- patch_size=vision_config.patch_size, -+class LlavaOnevisionLikeConfig(LlavaNextLikeConfig, Protocol): -+ video_token_index: Final[int] -+ -+ -+class LlavaOnevisionProcessingMixin(LlavaNextProcessingMixin): -+ -+ def _get_hf_config(self) -> LlavaOnevisionLikeConfig: -+ return self.ctx.get_hf_config(LlavaOnevisionConfig) -+ -+ def _get_hf_processor(self): -+ return self.ctx.get_hf_processor(LlavaOnevisionProcessor) -+ -+ def _get_num_unpadded_features( -+ self, -+ *, -+ original_height: int, -+ original_width: int, -+ npatches: int, -+ num_patch_height: int, -+ num_patch_width: int, -+ ) -> tuple[int, int]: -+ current_height = npatches * num_patch_height -+ current_width = npatches * num_patch_width -+ -+ # NOTE: Use float32 to remain consistent with HF output -+ original_aspect_ratio = np.array(original_width / original_height, -+ dtype=np.float32) -+ current_aspect_ratio = np.array(current_width / current_height, -+ dtype=np.float32) -+ -+ if original_aspect_ratio > current_aspect_ratio: -+ scale_factor = np.array(current_width / original_width, -+ dtype=np.float32) -+ new_height = int(original_height * scale_factor) -+ padding = (current_height - new_height) // 2 -+ current_height -= 2 * padding -+ else: -+ scale_factor = np.array(current_height / original_height, -+ dtype=np.float32) -+ new_width = int(original_width * scale_factor) -+ padding = (current_width - new_width) // 2 -+ current_width -= 2 * padding -+ -+ unpadded_features = current_height * 
current_width -+ newline_features = current_height -+ -+ ratio = math.sqrt(current_height * current_width / (9 * npatches**2)) -+ if ratio > 1.1: -+ unpadded_features = int(current_height // ratio) * int( -+ current_width // ratio) -+ newline_features = int(current_height // ratio) -+ -+ return (unpadded_features, newline_features) -+ -+ def _get_num_frame_tokens( -+ self, -+ *, -+ image_width: int, -+ image_height: int, -+ ) -> int: -+ hf_config = self._get_hf_config() -+ spatial_pool_stride = getattr(hf_config, "spatial_pool_stride", 2) -+ -+ vision_encoder_info = self._get_vision_encoder_info() -+ patch_grid_length = vision_encoder_info.get_patch_grid_length() -+ pooled_grid_length = math.ceil(patch_grid_length / spatial_pool_stride) -+ -+ return pooled_grid_length * pooled_grid_length -+ -+ def _get_num_video_tokens( -+ self, -+ *, -+ image_width: int, -+ image_height: int, -+ num_frames: int, -+ ) -> int: -+ num_frame_tokens = self._get_num_frame_tokens( -+ image_width=image_width, -+ image_height=image_height, - ) -- base_feature_size = get_siglip_image_feature_size(vision_config) -- else: -- msg = f"Unsupported vision config: {type(vision_config)}" -- raise NotImplementedError(msg) -- -- strategy = hf_config.vision_feature_select_strategy -- if strategy == "default": -- base_feature_size -= 1 -- elif strategy == "full": -- pass -- else: -- raise ValueError(f"Unexpected select feature strategy: {strategy}") - -- num_patch_height, num_patch_width = get_anyres_image_grid_shape( -- image_size=(input_height, input_width), -- grid_pinpoints=hf_config.image_grid_pinpoints, -- patch_size=vision_config.image_size, -- ) -+ return num_frame_tokens * num_frames + 1 # Newline token -+ -+ -+class LlavaOnevisionProfilingInfo(LlavaOnevisionProcessingMixin, -+ BaseLlavaProfilingInfo): -+ -+ def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: -+ return {"image": None, "video": None} -+ -+ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: -+ return { -+ "image": self._get_max_image_tokens(), -+ "video": self._get_max_video_tokens(seq_len), -+ } -+ -+ def _get_max_video_frames(self, max_tokens: int) -> int: -+ target_width, target_height = self._get_image_size_with_most_features() -+ -+ num_frames = 0 -+ -+ while True: -+ next_num_frames = num_frames + 1 -+ next_max_tokens = self._get_num_video_tokens( -+ image_width=target_width, -+ image_height=target_height, -+ num_frames=next_num_frames, -+ ) - -- ( -- unpadded_feature_size, -- newline_feature_size, -- ) = _get_llava_onevision_image_unppaded_feature_size( -- input_height, input_width, num_patches, num_patch_height, -- num_patch_width) -- -- return unpadded_feature_size + newline_feature_size + base_feature_size -- -- --def get_max_llava_onevision_image_tokens(ctx: InputContext): -- return get_llava_onevision_image_feature_size( -- ctx.get_hf_config(LlavaOnevisionConfig), -- input_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, -- input_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, -- ) -- -- --def get_llava_onevision_video_frame_feature_size( -- hf_config: LlavaOnevisionConfig) -> int: -- # Support both CLIPVisionConfig and SiglipVisionConfig -- image_size = hf_config.vision_config.image_size -- patch_size = hf_config.vision_config.patch_size -- spatial_pool_stride = hf_config.spatial_pool_stride if hasattr( -- hf_config, "spatial_pool_stride") else 2 -- -- height = width = image_size // patch_size -- return math.ceil(height / spatial_pool_stride) * math.ceil( -- width / spatial_pool_stride) -- -- --def 
get_llava_onevision_video_tokens(ctx: InputContext, -- num_frames: int) -> int: -- hf_config = ctx.get_hf_config(LlavaOnevisionConfig) -- -- # TODO: support configuring (not supported by HF right now) -- num_token_image_newline = 1 -- tokens_per_frame = get_llava_onevision_video_frame_feature_size(hf_config) -- video_feature_size = num_frames * tokens_per_frame + num_token_image_newline -- -- return video_feature_size -- -- --def get_max_llava_onevision_video_tokens(ctx: InputContext) -> int: -- return get_llava_onevision_video_tokens(ctx, _MAX_FRAMES_PER_VIDEO) -- -- --def dummy_data_for_llava_onevision(ctx: InputContext, seq_len: int, -- mm_counts: Mapping[str, int]): -- hf_config = ctx.get_hf_config(LlavaOnevisionConfig) -- vision_config = hf_config.vision_config -- -- num_videos = mm_counts["video"] -- -- # TODO: support configuring the number of frames -- num_frames = _MAX_FRAMES_PER_VIDEO -- video_feature_size = get_llava_onevision_video_tokens(ctx, num_frames) -- -- if isinstance(vision_config, CLIPVisionConfig): -- seq_data, ranges = dummy_seq_data_for_clip( -- vision_config, -- seq_len, -- num_videos, -- image_token_id=hf_config.video_token_index, -- image_feature_size_override=video_feature_size, -- mm_key="video") -- -- mm_data = dummy_video_for_clip(vision_config, -- num_frames=num_frames, -- num_videos=num_videos) -- return DummyData(seq_data, mm_data, ranges) -- elif isinstance(vision_config, SiglipVisionConfig): -- seq_data, ranges = dummy_seq_data_for_siglip( -- vision_config, -- seq_len, -- num_videos, -- image_token_id=hf_config.video_token_index, -- image_feature_size_override=video_feature_size, -- mm_key="video") -- -- mm_data = dummy_video_for_siglip(vision_config, -- num_frames=num_frames, -- num_videos=num_videos) -- return DummyData(seq_data, mm_data, ranges) -- -- msg = f"Unsupported vision config: {type(vision_config)}" -- raise NotImplementedError(msg) -- -- --def input_processor_when_multimodal_input_image(ctx: InputContext, -- inputs: DecoderOnlyInputs): -- multi_modal_data = inputs.get("multi_modal_data") -- if multi_modal_data is None or "image" not in multi_modal_data: -- return inputs -- -- model_config = ctx.model_config -- hf_config = ctx.get_hf_config(LlavaOnevisionConfig) -- vision_config = hf_config.vision_config -- -- image_data = multi_modal_data["image"] -- if isinstance(image_data, Image.Image): -- width, height = image_data.size -- -- image_feature_size = get_llava_onevision_image_feature_size( -- hf_config, -- input_height=height, -- input_width=width, -+ if next_max_tokens > max_tokens: -+ break -+ -+ num_frames = next_num_frames -+ -+ return num_frames -+ -+ def _get_dummy_num_frames(self, seq_len: int) -> int: -+ mm_config = self.ctx.get_mm_config() -+ max_images = mm_config.limit_per_prompt.get("image", 1) -+ max_videos = mm_config.limit_per_prompt.get("video", 1) -+ -+ max_image_tokens = self._get_max_image_tokens() * max_images -+ max_total_frames = self._get_max_video_frames(seq_len - -+ max_image_tokens) -+ -+ return max(max_total_frames // max(max_videos, 1), 1) -+ -+ def _get_max_video_tokens(self, seq_len: int) -> int: -+ target_width, target_height = self._get_image_size_with_most_features() -+ -+ return self._get_num_video_tokens( -+ image_width=target_width, -+ image_height=target_height, -+ num_frames=self._get_dummy_num_frames(seq_len), - ) -- elif is_list_of(image_data, Image.Image): -- image_feature_size = [ -- get_llava_onevision_image_feature_size(hf_config, -- input_height=img.height, -- input_width=img.width) -- for img in 
image_data -- ] -- elif isinstance(image_data, torch.Tensor): -- num_images, image_feature_size, hidden_size = image_data.shape -- elif is_list_of(image_data, torch.Tensor): -- image_feature_size = [item.shape[1] for item in image_data] -- else: -- raise TypeError(f"Invalid image type: {type(image_data)}") -- -- vision_config = hf_config.vision_config -- -- if isinstance(vision_config, CLIPVisionConfig): -- return input_processor_for_clip( -- model_config, -- vision_config, -- inputs, -- image_token_id=hf_config.image_token_index, -- image_feature_size_override=image_feature_size, -+ -+ def get_dummy_processor_inputs( -+ self, -+ seq_len: int, -+ mm_counts: Mapping[str, int], -+ ) -> ProcessorInputs: -+ num_images = mm_counts.get("image", 0) -+ num_videos = mm_counts.get("video", 0) -+ -+ processor = self._get_hf_processor() -+ image_token = processor.image_token -+ video_token = processor.video_token -+ target_width, target_height = self._get_image_size_with_most_features() -+ -+ mm_data = { -+ "image": -+ self._get_dummy_images(width=target_width, -+ height=target_height, -+ num_images=num_images), -+ "video": -+ self._get_dummy_videos( -+ width=target_width, -+ height=target_height, -+ num_frames=self._get_dummy_num_frames(seq_len), -+ num_videos=num_videos, -+ ) -+ } -+ -+ return ProcessorInputs( -+ prompt_text=image_token * num_images + video_token * num_videos, -+ mm_data=mm_data, - ) -- elif isinstance(vision_config, SiglipVisionConfig): -- return input_processor_for_siglip( -- model_config, -- vision_config, -- inputs, -- image_token_id=hf_config.image_token_index, -- image_feature_size_override=image_feature_size, -+ -+ -+class LlavaOnevisionMultiModalProcessor(LlavaOnevisionProcessingMixin, -+ LlavaNextMultiModalProcessor): -+ -+ def _get_profiling_info(self) -> BaseProfilingInfo: -+ return LlavaOnevisionProfilingInfo(self.ctx) -+ -+ def _get_mm_fields_config( -+ self, -+ hf_inputs: BatchFeature, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ ) -> Mapping[str, MultiModalFieldConfig]: -+ return dict( -+ pixel_values=MultiModalFieldConfig.batched("image"), -+ image_sizes=MultiModalFieldConfig.batched("image"), -+ image_embeds=MultiModalFieldConfig.batched("image"), -+ pixel_values_videos=MultiModalFieldConfig.batched("video"), - ) - -- msg = f"Unsupported vision config: {type(vision_config)}" -- raise NotImplementedError(msg) -+ def _call_hf_processor( -+ self, -+ prompt: str, -+ mm_data: Mapping[str, object], -+ mm_kwargs: Mapping[str, object], -+ ) -> BatchFeature: -+ mm_data = dict(mm_data) -+ videos = mm_data.pop("videos", []) -+ assert isinstance(videos, list) -+ -+ if not videos: -+ return super()._call_hf_processor( -+ prompt=prompt, -+ mm_data=mm_data, -+ mm_kwargs=mm_kwargs, -+ ) -+ -+ processor = self._get_hf_processor() -+ video_token = processor.video_token - -+ # LLaVA-OneVision processor doesn't support multiple videos -+ # with different sizes when converting back to tensors -+ text_image_outputs = super()._call_hf_processor( -+ prompt=prompt, -+ mm_data=mm_data, -+ mm_kwargs=mm_kwargs, -+ ) - --def input_processor_when_multimodal_input_video(ctx: InputContext, -- inputs: DecoderOnlyInputs): -- multi_modal_data = inputs.get("multi_modal_data") -- if multi_modal_data is None or "video" not in multi_modal_data: -- return inputs -- video_data = multi_modal_data["video"] -+ pixel_values_videos = [] -+ for video in videos: -+ item_processor_data = dict(prompt=video_token, videos=video) - -- model_config = ctx.model_config -- hf_config = 
ctx.get_hf_config(LlavaOnevisionConfig) -+ item_outputs = super()._call_hf_processor( -+ prompt=prompt, -+ mm_data=item_processor_data, -+ mm_kwargs=mm_kwargs, -+ ) - -- if isinstance(video_data, np.ndarray): -- # Supports both CLIP and Siglip -- num_frames = video_data.shape[0] -- video_feature_size = get_llava_onevision_video_tokens(ctx, num_frames) -- tokenizer = cached_get_tokenizer(model_config.tokenizer) -+ pixel_values_videos.append( -+ item_outputs.pop("pixel_values_videos")[0]) - -- new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( -- tokenizer, -- inputs.get("prompt"), -- inputs["prompt_token_ids"], -- placeholder_token_id=hf_config.video_token_index, -- repeat_count=video_feature_size, -+ combined_outputs = dict( -+ **text_image_outputs, -+ pixel_values_videos=pixel_values_videos, - ) -+ return BatchFeature(combined_outputs) - -- return token_inputs(prompt_token_ids=new_token_ids, -- prompt=new_prompt, -- multi_modal_data=multi_modal_data, -- multi_modal_placeholders={"video": ranges}) -- -- elif is_list_of(video_data, np.ndarray): -- video_feature_size = [] -- for video in video_data: -- num_frames = video.shape[0] -- video_feature_size.append( -- get_llava_onevision_video_tokens(ctx, num_frames)) -- -- tokenizer = cached_get_tokenizer(model_config.tokenizer) -- new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( -- tokenizer, -- inputs.get("prompt"), -- inputs["prompt_token_ids"], -- placeholder_token_id=hf_config.video_token_index, -- repeat_count=video_feature_size, -+ def _get_prompt_replacements( -+ self, -+ mm_items: MultiModalDataItems, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ out_mm_kwargs: MultiModalKwargs, -+ ) -> list[PromptReplacement]: -+ image_repls = super()._get_prompt_replacements( -+ mm_items=mm_items, -+ hf_processor_mm_kwargs=hf_processor_mm_kwargs, -+ out_mm_kwargs=out_mm_kwargs, - ) -- return token_inputs(prompt_token_ids=new_token_ids, -- prompt=new_prompt, -- multi_modal_data=multi_modal_data, -- multi_modal_placeholders={"video": ranges}) -- else: -- raise TypeError(f"Invalid video type: {type(video_data)}") - -- msg = f"Unsupported video type: {type(video_data)}" -- raise NotImplementedError(msg) -+ hf_config = self._get_hf_config() -+ video_token_id = hf_config.video_token_index - -+ def get_video_replacement(item_idx: int): -+ videos = mm_items.get_items( -+ "video", (VideoEmbeddingItems, VideoProcessorItems)) - --def input_processor_for_llava_onevision(ctx: InputContext, -- inputs: DecoderOnlyInputs): -- multi_modal_data = inputs.get("multi_modal_data") -- if multi_modal_data is None or ("video" not in multi_modal_data -- and "image" not in multi_modal_data): -- return inputs -- if "image" in multi_modal_data: -- return input_processor_when_multimodal_input_image(ctx, inputs) -- if "video" in multi_modal_data: -- return input_processor_when_multimodal_input_video(ctx, inputs) -+ if isinstance(videos, VideoEmbeddingItems): -+ num_video_tokens = videos.get_feature_size(item_idx) -+ else: -+ image_size = videos.get_frame_size(item_idx) -+ num_video_tokens = self._get_num_video_tokens( -+ image_width=image_size.width, -+ image_height=image_size.height, -+ num_frames=videos.get_num_frames(item_idx), -+ ) -+ -+ return [video_token_id] * num_video_tokens - -- msg = "Unsupported multi data type" -- raise NotImplementedError(msg) -+ return image_repls + [ -+ PromptReplacement( -+ modality="video", -+ target=[video_token_id], -+ replacement=get_video_replacement, -+ ), -+ ] - - - class 
LlavaOnevisionMultiModalProjector(nn.Module): -@@ -394,14 +377,7 @@ class LlavaOnevisionMultiModalProjector(nn.Module): - return hidden_states - - --@MULTIMODAL_REGISTRY.register_image_input_mapper() --@MULTIMODAL_REGISTRY.register_input_mapper("video") --@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( -- "image", get_max_llava_onevision_image_tokens) --@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( -- "video", get_max_llava_onevision_video_tokens) --@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_onevision) --@INPUT_REGISTRY.register_input_processor(input_processor_for_llava_onevision) -+@MULTIMODAL_REGISTRY.register_processor(LlavaOnevisionMultiModalProcessor) - class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, - SupportsPP): - -diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py -index 06c8d9723..553bc9c28 100644 ---- a/vllm/model_executor/models/mamba.py -+++ b/vllm/model_executor/models/mamba.py -@@ -38,10 +38,12 @@ class MambaDecoderLayer(nn.Module): - def __init__(self, - config: MambaConfig, - cache_config: Optional[CacheConfig] = None, -- quant_config: Optional[QuantizationConfig] = None) -> None: -+ quant_config: Optional[QuantizationConfig] = None, -+ is_lora_enabled: Optional[bool] = False) -> None: - super().__init__() - self.config = config - self.is_falcon_mamba = config.model_type == "falcon_mamba" -+ self.is_lora_enabled = is_lora_enabled - mixer_rms_eps = config.mixer_rms_eps if self.is_falcon_mamba else None - self.mixer = MambaMixer(hidden_size=config.hidden_size, - ssm_state_size=config.state_size, -@@ -53,7 +55,8 @@ class MambaDecoderLayer(nn.Module): - use_rms_norm=self.is_falcon_mamba, - rms_norm_has_weight=not self.is_falcon_mamba, - rms_norm_eps=mixer_rms_eps, -- activation=config.hidden_act) -+ activation=config.hidden_act, -+ is_lora_enabled=self.is_lora_enabled) - - self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) - -@@ -85,6 +88,7 @@ class MambaModel(nn.Module): - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config -+ is_lora_enabled = bool(lora_config) - - self.config = config - self.padding_idx = config.pad_token_id -@@ -101,8 +105,10 @@ class MambaModel(nn.Module): - - self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, -- lambda prefix: MambaDecoderLayer( -- config, cache_config=cache_config, quant_config=quant_config), -+ lambda prefix: MambaDecoderLayer(config, -+ cache_config=cache_config, -+ quant_config=quant_config, -+ is_lora_enabled=is_lora_enabled), - prefix=f"{prefix}.layers") - - self.norm_f = RMSNorm(config.hidden_size, -diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py -index 1e8f9bd4c..d72e55089 100644 ---- a/vllm/model_executor/models/minicpmv.py -+++ b/vllm/model_executor/models/minicpmv.py -@@ -141,7 +141,7 @@ class Resampler2_5(BaseResampler): - self.max_size = max_size - self._set_2d_pos_cache(self.max_size) - -- self.apply(self._init_weights) -+ #self.apply(self._init_weights) - - def _set_2d_pos_cache(self, - max_size: Tuple[int, int], -@@ -487,6 +487,12 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): - image_embeds = kwargs.pop("image_embeds", None) - - if image_embeds is not None: -+ if not isinstance(image_embeds, (torch.Tensor, list)): -+ raise ValueError(f"Incorrect type of image embeds. 
" -+ f"Got type: {type(image_embeds)}") -+ if isinstance(image_embeds, list): -+ image_embeds = torch.concat(image_embeds) -+ - return MiniCPMVImageEmbeddingInputs( - image_bounds=self._get_image_bounds(input_ids, im_start_id, - im_end_id, slice_start_id, -@@ -698,7 +704,7 @@ class MiniCPMV2_0(MiniCPMVBaseModel): - quant_config=quant_config, - prefix=prefix) - -- return resampler.to(device="cuda", dtype=torch.get_default_dtype()) -+ return resampler - - def get_vision_embedding( - self, -@@ -807,7 +813,7 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA): - quant_config=quant_config, - prefix=prefix) - -- return resampler.to(device="cuda", dtype=torch.get_default_dtype()) -+ return resampler - - def get_vision_embedding( - self, -@@ -846,11 +852,15 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA): - for i in range(B): - patch_attn_mask[i, :tgt_sizes[i][0] * tgt_sizes[i][1]] = True - -- return self.get_vision_embedding(all_pixel_values.type(dtype), -- patch_attn_mask, tgt_sizes) -+ return self.get_vision_embedding(all_pixel_values.type(dtype).to(device), -+ patch_attn_mask, tgt_sizes.to(device)) -+ -+ def is_default_weight_loading(self, name: str) -> bool: -+ return "resampler" in name - - --class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA): -+class MiniCPMV2_6(MiniCPMVBaseModel): -+ - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", -@@ -928,7 +938,7 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA): - quant_config=quant_config, - prefix=prefix) - -- return resampler.to(device="cuda", dtype=torch.get_default_dtype()) -+ return resampler - - def get_vision_embedding( - self, -@@ -969,9 +979,9 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA): - for i in range(B): - patch_attn_mask[i, 0, :tgt_sizes[i][0] * tgt_sizes[i][1]] = True - vision_embedding = self.vpm( -- all_pixel_values.type(dtype), -+ all_pixel_values.type(dtype).to(device), - patch_attention_mask=patch_attn_mask, -- tgt_sizes=tgt_sizes, -+ tgt_sizes=tgt_sizes.to(device), - ) - - return self.resampler(vision_embedding, tgt_sizes) -diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py -index 8938f62d0..cc25be9f5 100644 ---- a/vllm/model_executor/models/molmo.py -+++ b/vllm/model_executor/models/molmo.py -@@ -36,6 +36,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler - from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, VocabParallelEmbedding) - from vllm.model_executor.model_loader.weight_utils import default_weight_loader -+from vllm.model_executor.models.module_mapping import MultiModelKeys - from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs - from vllm.multimodal.inputs import NestedTensors, PlaceholderRange - from vllm.multimodal.utils import cached_get_tokenizer -@@ -43,7 +44,7 @@ from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, - SequenceData) - from vllm.transformers_utils.processor import get_processor - --from .interfaces import SupportsMultiModal, SupportsPP -+from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP - from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers, - maybe_prefix, merge_multimodal_embeddings) -@@ -461,30 +462,71 @@ class MolmoAttention(nn.Module): - return output - - --class MolmoMLP(nn.Module): -+class SwiGLU(nn.Module): -+ -+ def forward(self, x: torch.Tensor) -> torch.Tensor: -+ x, gate = x.chunk(2, dim=-1) -+ # Note that the order is reversed 
compared to -+ # SiluAndMul. -+ return x * F.silu(gate) -+ -+ -+class LanuageModelMLP(nn.Module): - """Molmo's LLM mlp.""" - - def __init__(self, - config: PretrainedConfig, - input_dim: Optional[int] = None, -- quant_config: Optional[QuantizationConfig] = None, -- proj_name: str = "gate_up_proj") -> None: -+ quant_config: Optional[QuantizationConfig] = None) -> None: - super().__init__() - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size // 2 - -- # Molmo's LLM proj weights are already merged into the disk, while -- # image_projector proj is separate. If the same proj_name were used, it -- # would create ambiguity and make it difficult to support BNB and LoRA. -- self.proj_name = proj_name -- setattr( -- self, proj_name, -- MergedColumnParallelLinear( -- input_dim or self.hidden_size, -- [self.intermediate_size] * 2, -- bias=False, -- quant_config=quant_config, -- )) -+ self.gate_up_proj = MergedColumnParallelLinear( -+ input_dim or self.hidden_size, -+ [self.intermediate_size] * 2, -+ bias=False, -+ quant_config=quant_config, -+ ) -+ # Activation function. -+ self.act_fn = SwiGLU() -+ # Feed-forward output projection. -+ self.down_proj = RowParallelLinear( -+ self.intermediate_size, -+ self.hidden_size, -+ bias=False, -+ quant_config=quant_config, -+ ) -+ -+ def forward( -+ self, -+ x: torch.Tensor, -+ ) -> torch.Tensor: -+ gate_up, _ = self.gate_up_proj(x) -+ x = self.act_fn(gate_up) -+ x, _ = self.down_proj(x) -+ return x -+ -+ -+class ImageProjectorMLP(nn.Module): -+ """Molmo's image_projector mlp.""" -+ -+ def __init__( -+ self, -+ config: PretrainedConfig, -+ input_dim: Optional[int] = None, -+ quant_config: Optional[QuantizationConfig] = None, -+ ) -> None: -+ super().__init__() -+ self.hidden_size = config.hidden_size -+ self.intermediate_size = config.intermediate_size // 2 -+ -+ self.merged_linear = MergedColumnParallelLinear( -+ input_dim or self.hidden_size, -+ [self.intermediate_size] * 2, -+ bias=False, -+ quant_config=quant_config, -+ ) - # Activation function. - self.act_fn = SiluAndMul() - -@@ -500,7 +542,7 @@ class MolmoMLP(nn.Module): - self, - x: torch.Tensor, - ) -> torch.Tensor: -- gate_up, _ = getattr(self, self.proj_name)(x) -+ gate_up, _ = self.merged_linear(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x -@@ -523,9 +565,7 @@ class MolmoDecoderLayer(nn.Module): - prefix=f"{prefix}.self_attn") - - # MLP block. 
-- self.mlp = MolmoMLP(config, -- quant_config=quant_config, -- proj_name="gate_up_proj") -+ self.mlp = LanuageModelMLP(config, quant_config=quant_config) - - # LayerNorm - assert config.layer_norm_type == "rms" -@@ -617,11 +657,10 @@ class MolmoVisionBackbone(nn.Module): - vision_config, - nlayers=len(self.vit_layers), - quant_config=quant_config) -- self.image_projector = MolmoMLP( -+ self.image_projector = ImageProjectorMLP( - config, - input_dim=vision_config.image_emb_dim, - quant_config=quant_config, -- proj_name="merged_linear", - ) - - image_dim = vision_config.image_emb_dim * len(self.vit_layers) -@@ -842,10 +881,6 @@ class MolmoModel(nn.Module): - loaded_params: Set[str] = set() - - for name, loaded_weight in weights: -- if "gate_up_proj" in name: -- up_proj, gate_proj = loaded_weight.chunk(2, dim=0) -- loaded_weight = torch.cat([gate_proj, up_proj], dim=0) -- - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): -@@ -1127,8 +1162,8 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs): - @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_molmo_image_tokens) - @INPUT_REGISTRY.register_dummy_data(dummy_data_for_molmo) - @INPUT_REGISTRY.register_input_processor(input_processor_for_molmo) --class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): -- -+class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, -+ SupportsLoRA): - hf_to_vllm_mapper = WeightsMapper( - orig_to_new_substr={ - # vision backbone mapping -@@ -1157,13 +1192,47 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): - }, - ) - -+ packed_modules_mapping = { -+ "qkv_proj": ["qkv_proj"], -+ "gate_up_proj": ["gate_up_proj"], # language model -+ "merged_linear": ["gate_proj", "up_proj"] # image_projector -+ } -+ -+ # LoRA specific attributes -+ supported_lora_modules = [ -+ # language model -+ "qkv_proj", -+ "o_proj", -+ "gate_up_proj", -+ "down_proj", # same name with image_projector -+ # vision tower -+ "wq", -+ "wk", -+ "wv", -+ "wo", -+ "w1", -+ "w2", -+ # image_projector -+ "merged_linear", -+ ] -+ embedding_modules = {} -+ embedding_padding_modules = [] -+ -+ # BitandBytes specific attributes -+ bitsandbytes_stacked_params_mapping = { -+ "gate_proj": ("merged_linear", 0), -+ "up_proj": ("merged_linear", 1), -+ } -+ - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - multimodal_config = vllm_config.model_config.multimodal_config -+ lora_config = vllm_config.lora_config - self.config = config - self.multimodal_config = multimodal_config -+ self.lora_config = lora_config - - vision_config = VisionBackboneConfig() - self.vision_backbone = MolmoVisionBackbone(config, vision_config, -@@ -1337,6 +1406,16 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): - weights = _get_weights_with_merged_embedding(weights) - return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) - -+ def get_mm_mapping(self) -> MultiModelKeys: -+ """ -+ Get the module prefix in multimodal models -+ """ -+ return MultiModelKeys.from_string_field( -+ language_model="model", -+ connector="vision_backbone.image_projector", -+ tower_model="vision_backbone", -+ ) -+ - - def _get_weights_with_merged_embedding( - weights: Iterable[Tuple[str, torch.Tensor]] -diff --git a/vllm/model_executor/models/na_vit.py b/vllm/model_executor/models/na_vit.py -new file mode 100644 -index 
000000000..d96085f46 ---- /dev/null -+++ b/vllm/model_executor/models/na_vit.py -@@ -0,0 +1,831 @@ -+import logging -+import math -+import os -+import warnings -+from typing import Optional, Tuple, Union -+ -+import numpy as np -+import torch -+import torch.nn.functional as F -+from torch import nn -+from torch.nn.init import _calculate_fan_in_and_fan_out -+from transformers.activations import ACT2FN -+from transformers.configuration_utils import PretrainedConfig -+from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask -+from transformers.modeling_outputs import (BaseModelOutput, -+ BaseModelOutputWithPooling) -+from transformers.modeling_utils import PreTrainedModel -+from transformers.utils import (ModelOutput, is_flash_attn_2_available, -+ replace_return_docstrings) -+ -+logger = logging.getLogger("vllm") -+ -+ -+# For Siglip: copied from -+# HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit and add tgt_sizes -+# Remove hints as there's little possibility to change these code. -+class SiglipVisionConfig(PretrainedConfig): -+ -+ model_type = "siglip_vision_model" -+ -+ def __init__( -+ self, -+ hidden_size=768, -+ intermediate_size=3072, -+ num_hidden_layers=12, -+ num_attention_heads=12, -+ num_channels=3, -+ image_size=224, -+ patch_size=16, -+ hidden_act="gelu_pytorch_tanh", -+ layer_norm_eps=1e-6, -+ attention_dropout=0.0, -+ **kwargs, -+ ): -+ super().__init__(**kwargs) -+ -+ self.hidden_size = hidden_size -+ self.intermediate_size = intermediate_size -+ self.num_hidden_layers = num_hidden_layers -+ self.num_attention_heads = num_attention_heads -+ self.num_channels = num_channels -+ self.patch_size = patch_size -+ self.image_size = image_size -+ self.attention_dropout = attention_dropout -+ self.layer_norm_eps = layer_norm_eps -+ self.hidden_act = hidden_act -+ -+ @classmethod -+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, -+ os.PathLike], -+ **kwargs) -> "PretrainedConfig": -+ cls._set_token_in_kwargs(kwargs) -+ -+ config_dict, kwargs = cls.get_config_dict( -+ pretrained_model_name_or_path, **kwargs) -+ -+ # get the vision config dict if we are loading from SiglipConfig -+ if config_dict.get("model_type") == "siglip": -+ config_dict = config_dict["vision_config"] -+ -+ if "model_type" in config_dict and hasattr( -+ cls, -+ "model_type") and config_dict["model_type"] != cls.model_type: -+ logger.warning( -+ "You are using a model of type %s to " -+ "instantiate a model of type %s. 
" -+ "This is not supported for all configurations" -+ "of models and can yield errors.", config_dict['model_type'], -+ cls.model_type) -+ -+ return cls.from_dict(config_dict, **kwargs) -+ -+ -+_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224" -+ -+SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ -+ "google/siglip-base-patch16-224", -+ # See all SigLIP models at https://huggingface.co/models?filter=siglip -+] -+ -+if is_flash_attn_2_available(): -+ from flash_attn import flash_attn_func, flash_attn_varlen_func -+ from flash_attn.bert_padding import pad_input # noqa -+ from flash_attn.bert_padding import index_first_axis, unpad_input -+ -+ -+# Copied from transformers.models.llama.modeling_llama._get_unpad_data -+def _get_unpad_data(attention_mask): -+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) -+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() -+ max_seqlen_in_batch = seqlens_in_batch.max().item() -+ cu_seqlens = F.pad( -+ torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) -+ return ( -+ indices, -+ cu_seqlens, -+ max_seqlen_in_batch, -+ ) -+ -+ -+def _trunc_normal_(tensor, mean, std, a, b): -+ -+ def norm_cdf(x): -+ # Computes standard normal cumulative distribution function -+ return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 -+ -+ if (mean < a - 2 * std) or (mean > b + 2 * std): -+ warnings.warn( -+ "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " -+ "The distribution of values may be incorrect.", -+ stacklevel=2, -+ ) -+ -+ # Values are generated by using a truncated uniform distribution and -+ # then using the inverse CDF for the normal distribution. -+ # Get upper and lower cdf values -+ l_ = norm_cdf((a - mean) / std) -+ u = norm_cdf((b - mean) / std) -+ -+ # Uniformly fill tensor with values from [l, u], then translate to -+ # [2l-1, 2u-1]. -+ tensor.uniform_(2 * l_ - 1, 2 * u - 1) -+ -+ # Use inverse cdf transform for normal distribution to get truncated -+ # standard normal -+ if tensor.dtype in [torch.float16, torch.bfloat16]: -+ # The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu -+ og_dtype = tensor.dtype -+ tensor = tensor.to(torch.float32) -+ tensor.erfinv_() -+ tensor = tensor.to(og_dtype) -+ else: -+ tensor.erfinv_() -+ -+ # Transform to proper mean, std -+ tensor.mul_(std * math.sqrt(2.0)) -+ tensor.add_(mean) -+ -+ # Clamp to ensure it's in the proper range -+ if tensor.dtype == torch.float16: -+ # The `clamp_` op is not (yet?) 
defined in float16+cpu -+ tensor = tensor.to(torch.float32) -+ tensor.clamp_(min=a, max=b) -+ tensor = tensor.to(torch.float16) -+ else: -+ tensor.clamp_(min=a, max=b) -+ -+ -+def trunc_normal_tf_(tensor: torch.Tensor, -+ mean: float = 0.0, -+ std: float = 1.0, -+ a: float = -2.0, -+ b: float = 2.0) -> torch.Tensor: -+ with torch.no_grad(): -+ _trunc_normal_(tensor, 0, 1.0, a, b) -+ tensor.mul_(std).add_(mean) -+ -+ -+def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"): -+ fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) -+ if mode == "fan_in": -+ denom = fan_in -+ elif mode == "fan_out": -+ denom = fan_out -+ elif mode == "fan_avg": -+ denom = (fan_in + fan_out) / 2 -+ -+ variance = scale / denom -+ -+ if distribution == "truncated_normal": -+ # constant is stddev of standard normal truncated to (-2, 2) -+ trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978) -+ elif distribution == "normal": -+ with torch.no_grad(): -+ tensor.normal_(std=math.sqrt(variance)) -+ elif distribution == "uniform": -+ bound = math.sqrt(3 * variance) -+ with torch.no_grad(): -+ tensor.uniform_(-bound, bound) -+ else: -+ raise ValueError(f"invalid distribution {distribution}") -+ -+ -+def lecun_normal_(tensor): -+ variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal") -+ -+ -+def default_flax_embed_init(tensor): -+ variance_scaling_(tensor, mode="fan_in", distribution="normal") -+ -+ -+class SiglipVisionModelOutput(ModelOutput): -+ image_embeds: Optional[torch.FloatTensor] = None -+ last_hidden_state: torch.FloatTensor = None -+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None -+ attentions: Optional[Tuple[torch.FloatTensor]] = None -+ -+ -+class SiglipVisionEmbeddings(nn.Module): -+ -+ def __init__(self, config: SiglipVisionConfig): -+ super().__init__() -+ self.config = config -+ self.embed_dim = config.hidden_size -+ self.image_size = config.image_size -+ self.patch_size = config.patch_size -+ -+ self.patch_embedding = nn.Conv2d( -+ in_channels=config.num_channels, -+ out_channels=self.embed_dim, -+ kernel_size=self.patch_size, -+ stride=self.patch_size, -+ padding="valid", -+ ) -+ -+ self.num_patches_per_side = self.image_size // self.patch_size -+ self.num_patches = self.num_patches_per_side**2 -+ self.num_positions = self.num_patches -+ self.position_embedding = nn.Embedding(self.num_positions, -+ self.embed_dim) -+ -+ def forward(self, -+ pixel_values: torch.FloatTensor, -+ patch_attention_mask: torch.BoolTensor, -+ tgt_sizes: Optional[torch.IntTensor] = None) -> torch.Tensor: -+ batch_size = pixel_values.size(0) -+ -+ patch_embeds = self.patch_embedding(pixel_values) -+ embeddings = patch_embeds.flatten(2).transpose(1, 2) -+ -+ max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3) -+ max_nb_patches_h, max_nb_patches_w = (max_im_h // self.patch_size, -+ max_im_w // self.patch_size) -+ boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, -+ 1 / self.num_patches_per_side) -+ position_ids = torch.full( -+ size=( -+ batch_size, -+ max_nb_patches_h * max_nb_patches_w, -+ ), -+ fill_value=0, -+ ) -+ -+ for batch_idx, p_attn_mask in enumerate(patch_attention_mask): -+ if tgt_sizes is not None: -+ nb_patches_h = tgt_sizes[batch_idx][0] -+ nb_patches_w = tgt_sizes[batch_idx][1] -+ else: -+ nb_patches_h = p_attn_mask[:, 0].sum() -+ nb_patches_w = p_attn_mask[0].sum() -+ -+ fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h) -+ fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w) -+ -+ 
bucket_coords_h = torch.bucketize(fractional_coords_h, -+ boundaries, -+ right=True) -+ bucket_coords_w = torch.bucketize(fractional_coords_w, -+ boundaries, -+ right=True) -+ -+ pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + -+ bucket_coords_w).flatten() -+ position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids -+ -+ position_ids = position_ids.to(self.position_embedding.weight.device) -+ -+ embeddings = embeddings + self.position_embedding(position_ids) -+ return embeddings -+ -+ -+def attention_softmax(attn_weights: torch.Tensor, training: bool): -+ if attn_weights.is_contiguous() and attn_weights.device.type == "xpu" and not training: -+ import xe_addons -+ xe_addons.attn_softmax_inplaced(attn_weights) -+ else: -+ attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, -+ dtype=torch.float32).to(attn_weights.dtype) -+ return attn_weights -+ -+ -+class SiglipAttention(nn.Module): -+ """Multi-headed attention from 'Attention Is All You Need' paper""" -+ -+ # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__ -+ def __init__(self, config): -+ super().__init__() -+ self.config = config -+ self.embed_dim = config.hidden_size -+ self.num_heads = config.num_attention_heads -+ self.head_dim = self.embed_dim // self.num_heads -+ if self.head_dim * self.num_heads != self.embed_dim: -+ raise ValueError( -+ "embed_dim must be divisible by num_heads (got `embed_dim`: " -+ f"{self.embed_dim} and `num_heads`:" -+ f" {self.num_heads}).") -+ self.scale = self.head_dim**-0.5 -+ self.dropout = config.attention_dropout -+ -+ self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) -+ self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) -+ self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) -+ self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) -+ -+ def forward( -+ self, -+ hidden_states: torch.Tensor, -+ attention_mask: Optional[torch.Tensor] = None, -+ output_attentions: Optional[bool] = False, -+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], -+ Optional[Tuple[torch.Tensor]]]: -+ """Input shape: Batch x Time x Channel""" -+ -+ batch_size, q_len, _ = hidden_states.size() -+ -+ # query_states = self.q_proj(hidden_states) -+ # key_states = self.k_proj(hidden_states) -+ # value_states = self.v_proj(hidden_states) -+ -+ # query_states = query_states.view(batch_size, q_len, self.num_heads, -+ # self.head_dim).transpose(1, 2) -+ # key_states = key_states.view(batch_size, q_len, self.num_heads, -+ # self.head_dim).transpose(1, 2) -+ # value_states = value_states.view(batch_size, q_len, self.num_heads, -+ # self.head_dim).transpose(1, 2) -+ -+ qkv = self.qkv_proj(hidden_states) -+ qkv = qkv.view(batch_size, q_len, self.num_heads * 3, self.head_dim) -+ qkv = qkv.transpose(1, 2) -+ query_states, key_states, value_states = qkv.chunk(3, dim=1) -+ -+ from ipex_llm.transformers.models.common import padding_qkv_hd -+ query_states, key_states, value_states = padding_qkv_hd( -+ query_states, key_states, value_states, -+ 72, 80 -+ ) -+ from ipex_llm.transformers.models.utils import use_sdp_non_causal -+ if use_sdp_non_causal(self.head_dim, query_states.device, query_states.dtype): -+ import xe_addons -+ attn_weights = None -+ attn_output = xe_addons.sdp_non_causal(query_states, key_states.contiguous(), value_states.contiguous(), attention_mask) -+ else: -+ k_v_seq_len = key_states.shape[-2] -+ attn_weights = torch.matmul(query_states, key_states.transpose( -+ 2, 3)) * self.scale -+ -+ if attn_weights.size() != (batch_size, self.num_heads, q_len, -+ 
k_v_seq_len): -+ raise ValueError( -+ "Attention weights should be of size " -+ f"{(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is" -+ f" {attn_weights.size()}") -+ -+ if attention_mask is not None: -+ if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len): -+ raise ValueError( -+ "Attention mask should be of size " -+ f"{(batch_size, 1, q_len, k_v_seq_len)}", -+ f"but is {attention_mask.size()}") -+ attn_weights = attn_weights + attention_mask -+ -+ # upcast attention to fp32 -+ # attn_weights = nn.functional.softmax(attn_weights, -+ # dim=-1, -+ # dtype=torch.float32).to( -+ # query_states.dtype) -+ attn_weights = attention_softmax(attn_weights, self.training) -+ attn_weights = nn.functional.dropout(attn_weights, -+ p=self.dropout, -+ training=self.training) -+ attn_output = torch.matmul(attn_weights, value_states) -+ -+ if attn_output.size() != (batch_size, self.num_heads, q_len, -+ self.head_dim): -+ raise ValueError( -+ "`attn_output` should be of size " -+ f"{(batch_size, self.num_heads, q_len, self.head_dim)}, " -+ "but is" -+ f" {attn_output.size()}") -+ attn_output = attn_output[:, :, :, :self.head_dim] -+ attn_output = attn_output.transpose(1, 2).contiguous() -+ attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) -+ -+ attn_output = self.out_proj(attn_output) -+ -+ return attn_output, attn_weights -+ -+ -+class SiglipFlashAttention2(SiglipAttention): -+ -+ def __init__(self, *args, **kwargs): -+ super().__init__(*args, **kwargs) -+ self.is_causal = False # Hack to make sure we don't use a causal mask -+ -+ def forward( -+ self, -+ hidden_states: torch.Tensor, -+ attention_mask: Optional[torch.LongTensor] = None, -+ position_ids: Optional[torch.LongTensor] = None, -+ past_key_value: Optional[Tuple[torch.Tensor]] = None, -+ output_attentions: bool = False, -+ use_cache: bool = False, -+ **kwargs, -+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], -+ Optional[Tuple[torch.Tensor]]]: -+ output_attentions = False -+ -+ bsz, q_len, _ = hidden_states.size() -+ -+ query_states = self.q_proj(hidden_states) -+ key_states = self.k_proj(hidden_states) -+ value_states = self.v_proj(hidden_states) -+ -+ query_states = query_states.view(bsz, q_len, self.num_heads, -+ self.head_dim).transpose(1, 2) -+ key_states = key_states.view(bsz, q_len, self.num_heads, -+ self.head_dim).transpose(1, 2) -+ value_states = value_states.view(bsz, q_len, self.num_heads, -+ self.head_dim).transpose(1, 2) -+ -+ kv_seq_len = key_states.shape[-2] -+ if past_key_value is not None: -+ kv_seq_len += past_key_value.get_usable_length( -+ kv_seq_len, self.layer_idx) -+ -+ query_states = query_states.transpose(1, 2) -+ key_states = key_states.transpose(1, 2) -+ value_states = value_states.transpose(1, 2) -+ -+ dropout_rate = self.dropout if self.training else 0.0 -+ -+ input_dtype = query_states.dtype -+ if input_dtype == torch.float32: -+ if torch.is_autocast_enabled(): -+ target_dtype = torch.get_autocast_gpu_dtype() -+ # Handle the case where the model is quantized -+ elif hasattr(self.config, "_pre_quantization_dtype"): -+ target_dtype = self.config._pre_quantization_dtype -+ else: -+ target_dtype = self.q_proj.weight.dtype -+ -+ logger.warning( -+ "The input hidden states seems to be " -+ "silently casted in float32, " -+ "this might be related to the fact " -+ "you have upcasted embedding or layer norm layers in float32. 
" -+ "We will cast back the input in" -+ " %s.", target_dtype) -+ -+ query_states = query_states.to(target_dtype) -+ key_states = key_states.to(target_dtype) -+ value_states = value_states.to(target_dtype) -+ -+ attn_output = self._flash_attention_forward(query_states, -+ key_states, -+ value_states, -+ attention_mask, -+ q_len, -+ dropout=dropout_rate) -+ -+ attn_output = attn_output.reshape(bsz, q_len, -+ self.embed_dim).contiguous() -+ attn_output = self.out_proj(attn_output) -+ -+ if not output_attentions: -+ attn_weights = None -+ -+ return attn_output, attn_weights -+ -+ def _flash_attention_forward(self, -+ query_states, -+ key_states, -+ value_states, -+ attention_mask, -+ query_length, -+ dropout=0.0, -+ softmax_scale=None): -+ causal = self.is_causal and query_length != 1 -+ -+ # Contains at least one padding token in the sequence -+ if attention_mask is not None: -+ batch_size = query_states.shape[0] -+ (query_states, key_states, value_states, indices_q, cu_seq_lens, -+ max_seq_lens) = self._upad_input(query_states, key_states, -+ value_states, attention_mask, -+ query_length) -+ -+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens -+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens -+ -+ attn_output_unpad = flash_attn_varlen_func( -+ query_states, -+ key_states, -+ value_states, -+ cu_seqlens_q=cu_seqlens_q, -+ cu_seqlens_k=cu_seqlens_k, -+ max_seqlen_q=max_seqlen_in_batch_q, -+ max_seqlen_k=max_seqlen_in_batch_k, -+ dropout_p=dropout, -+ softmax_scale=softmax_scale, -+ causal=causal, -+ ) -+ -+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, -+ query_length) -+ else: -+ attn_output = flash_attn_func(query_states, -+ key_states, -+ value_states, -+ dropout, -+ softmax_scale=softmax_scale, -+ causal=causal) -+ -+ return attn_output -+ -+ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, -+ query_length): -+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data( -+ attention_mask) -+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape -+ -+ key_layer = index_first_axis( -+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, -+ head_dim), indices_k) -+ value_layer = index_first_axis( -+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, -+ head_dim), indices_k) -+ if query_length == kv_seq_len: -+ query_layer = index_first_axis( -+ query_layer.reshape(batch_size * kv_seq_len, self.num_heads, -+ head_dim), indices_k) -+ cu_seqlens_q = cu_seqlens_k -+ max_seqlen_in_batch_q = max_seqlen_in_batch_k -+ indices_q = indices_k -+ elif query_length == 1: -+ max_seqlen_in_batch_q = 1 -+ cu_seqlens_q = torch.arange( -+ batch_size + 1, dtype=torch.int32, device=query_layer.device -+ ) # There is a memcpy here, that is very bad. -+ indices_q = cu_seqlens_q[:-1] -+ query_layer = query_layer.squeeze(1) -+ else: -+ # The -q_len: slice assumes left padding. 
-+ attention_mask = attention_mask[:, -query_length:] -+ (query_layer, indices_q, cu_seqlens_q, -+ max_seqlen_in_batch_q) = unpad_input(query_layer, attention_mask) -+ -+ return ( -+ query_layer, -+ key_layer, -+ value_layer, -+ indices_q, -+ (cu_seqlens_q, cu_seqlens_k), -+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k), -+ ) -+ -+ -+# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip -+class SiglipMLP(nn.Module): -+ -+ def __init__(self, config): -+ super().__init__() -+ self.config = config -+ self.activation_fn = ACT2FN[config.hidden_act] -+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) -+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) -+ -+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: -+ hidden_states = self.fc1(hidden_states) -+ hidden_states = self.activation_fn(hidden_states) -+ hidden_states = self.fc2(hidden_states) -+ return hidden_states -+ -+ -+# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer -+# with CLIP->Siglip -+class SiglipEncoderLayer(nn.Module): -+ -+ def __init__(self, config: SiglipVisionConfig): -+ super().__init__() -+ self.embed_dim = config.hidden_size -+ self._use_flash_attention_2 = ( -+ config._attn_implementation == "flash_attention_2") -+ self.self_attn = (SiglipAttention(config) -+ if not self._use_flash_attention_2 else -+ SiglipFlashAttention2(config)) -+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, -+ eps=config.layer_norm_eps) -+ self.mlp = SiglipMLP(config) -+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, -+ eps=config.layer_norm_eps) -+ -+ def forward( -+ self, -+ hidden_states: torch.Tensor, -+ attention_mask: torch.Tensor, -+ output_attentions: Optional[bool] = False, -+ ) -> Tuple[torch.FloatTensor]: -+ residual = hidden_states -+ -+ hidden_states = self.layer_norm1(hidden_states) -+ hidden_states, attn_weights = self.self_attn( -+ hidden_states=hidden_states, -+ attention_mask=attention_mask, -+ output_attentions=output_attentions, -+ ) -+ hidden_states = residual + hidden_states -+ -+ residual = hidden_states -+ hidden_states = self.layer_norm2(hidden_states) -+ hidden_states = self.mlp(hidden_states) -+ hidden_states = residual + hidden_states -+ -+ outputs = (hidden_states, ) -+ -+ if output_attentions: -+ outputs += (attn_weights, ) -+ -+ return outputs -+ -+ -+class SiglipPreTrainedModel(PreTrainedModel): -+ config_class = SiglipVisionConfig -+ base_model_prefix = "siglip" -+ supports_gradient_checkpointing = True -+ -+ def _init_weights(self, module): -+ """Initialize the weights""" -+ -+ if isinstance(module, SiglipVisionEmbeddings): -+ width = self.config.hidden_size -+ nn.init.normal_(module.position_embedding.weight, -+ std=1 / np.sqrt(width)) -+ elif isinstance(module, nn.Embedding): -+ default_flax_embed_init(module.weight) -+ elif isinstance(module, SiglipAttention): -+ nn.init.normal_(module.q_proj.weight) -+ nn.init.normal_(module.k_proj.weight) -+ nn.init.normal_(module.v_proj.weight) -+ nn.init.normal_(module.out_proj.weight) -+ nn.init.zeros_(module.q_proj.bias) -+ nn.init.zeros_(module.k_proj.bias) -+ nn.init.zeros_(module.v_proj.bias) -+ nn.init.zeros_(module.out_proj.bias) -+ elif isinstance(module, SiglipMLP): -+ nn.init.normal_(module.fc1.weight) -+ nn.init.normal_(module.fc2.weight) -+ nn.init.normal_(module.fc1.bias, std=1e-6) -+ nn.init.normal_(module.fc2.bias, std=1e-6) -+ elif isinstance(module, (nn.Linear, nn.Conv2d)): -+ lecun_normal_(module.weight) -+ if module.bias is not None: -+ 
nn.init.zeros_(module.bias) -+ elif isinstance(module, nn.LayerNorm): -+ module.bias.data.zero_() -+ module.weight.data.fill_(1.0) -+ -+ -+# Copied from transformers.models.clip.modeling_clip.CLIPEncoder -+# with CLIP->Siglip -+class SiglipEncoder(nn.Module): -+ -+ def __init__(self, config: SiglipVisionConfig): -+ super().__init__() -+ self.config = config -+ self.layers = nn.ModuleList([ -+ SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers) -+ ]) -+ self.gradient_checkpointing = False -+ -+ # Ignore copy -+ def forward( -+ self, -+ inputs_embeds, -+ attention_mask: Optional[torch.Tensor] = None, -+ output_attentions: Optional[bool] = None, -+ output_hidden_states: Optional[bool] = None, -+ return_dict: Optional[bool] = None, -+ ) -> Union[Tuple, BaseModelOutput]: -+ output_attentions = output_attentions if output_attentions is not None \ -+ else self.config.output_attentions -+ output_hidden_states = (output_hidden_states -+ if output_hidden_states is not None else -+ self.config.output_hidden_states) -+ return_dict = return_dict if return_dict is not None \ -+ else self.config.use_return_dict -+ -+ encoder_states = () if output_hidden_states else None -+ all_attentions = () if output_attentions else None -+ -+ hidden_states = inputs_embeds -+ for encoder_layer in self.layers: -+ if output_hidden_states: -+ encoder_states = encoder_states + (hidden_states, ) -+ if self.gradient_checkpointing and self.training: -+ layer_outputs = self._gradient_checkpointing_func( -+ encoder_layer.__call__, -+ hidden_states, -+ attention_mask, -+ output_attentions, -+ ) -+ else: -+ layer_outputs = encoder_layer( -+ hidden_states, -+ attention_mask, -+ output_attentions=output_attentions, -+ ) -+ -+ hidden_states = layer_outputs[0] -+ -+ if output_attentions: -+ all_attentions = all_attentions + (layer_outputs[1], ) -+ -+ if output_hidden_states: -+ encoder_states = encoder_states + (hidden_states, ) -+ -+ if not return_dict: -+ return tuple( -+ v for v in [hidden_states, encoder_states, all_attentions] -+ if v is not None) -+ return BaseModelOutput(last_hidden_state=hidden_states, -+ hidden_states=encoder_states, -+ attentions=all_attentions) -+ -+ -+class SiglipVisionTransformer(SiglipPreTrainedModel): -+ config_class = SiglipVisionConfig -+ main_input_name = "pixel_values" -+ _supports_flash_attn_2 = True -+ -+ def __init__(self, config: SiglipVisionConfig): -+ super().__init__(config) -+ self.config = config -+ embed_dim = config.hidden_size -+ -+ self.embeddings = SiglipVisionEmbeddings(config) -+ self.encoder = SiglipEncoder(config) -+ self.post_layernorm = nn.LayerNorm(embed_dim, -+ eps=config.layer_norm_eps) -+ self._use_flash_attention_2 = ( -+ config._attn_implementation == "flash_attention_2") -+ -+ # Initialize weights and apply final processing -+ self.post_init() -+ -+ def get_input_embeddings(self) -> nn.Module: -+ return self.embeddings.patch_embedding -+ -+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, -+ config_class=SiglipVisionConfig) -+ def forward( -+ self, -+ pixel_values, -+ patch_attention_mask: Optional[torch.BoolTensor] = None, -+ tgt_sizes: Optional[torch.IntTensor] = None, -+ output_attentions: Optional[bool] = None, -+ output_hidden_states: Optional[bool] = None, -+ return_dict: Optional[bool] = None, -+ ) -> Union[Tuple, BaseModelOutputWithPooling]: -+ r""" -+ Returns: -+ """ -+ output_attentions = output_attentions if output_attentions is not None \ -+ else self.config.output_attentions -+ output_hidden_states = (output_hidden_states -+ 
if output_hidden_states is not None else -+ self.config.output_hidden_states) -+ return_dict = return_dict if return_dict is not None \ -+ else self.config.use_return_dict -+ -+ batch_size = pixel_values.size(0) -+ if patch_attention_mask is None: -+ patch_attention_mask = torch.ones( -+ size=( -+ batch_size, -+ pixel_values.size(2) // self.config.patch_size, -+ pixel_values.size(3) // self.config.patch_size, -+ ), -+ dtype=torch.bool, -+ device=pixel_values.device, -+ ) -+ -+ hidden_states = self.embeddings( -+ pixel_values=pixel_values, -+ patch_attention_mask=patch_attention_mask, -+ tgt_sizes=tgt_sizes) -+ -+ patch_attention_mask = patch_attention_mask.view(batch_size, -1) -+ # The call to `_upad_input` in `_flash_attention_forward` is expensive -+ # So when the `patch_attention_mask` is full of 1s -+ # (i.e. attending to the whole sequence), -+ # avoiding passing the attention_mask, -+ # which is equivalent to attending to the full sequence -+ if not torch.any(~patch_attention_mask): -+ attention_mask = None -+ else: -+ attention_mask = (_prepare_4d_attention_mask( -+ patch_attention_mask, hidden_states.dtype) -+ if not self._use_flash_attention_2 else -+ patch_attention_mask) -+ -+ encoder_outputs = self.encoder( -+ inputs_embeds=hidden_states, -+ attention_mask=attention_mask, -+ output_attentions=output_attentions, -+ output_hidden_states=output_hidden_states, -+ return_dict=return_dict, -+ ) -+ -+ last_hidden_state = encoder_outputs[0] -+ last_hidden_state = self.post_layernorm(last_hidden_state) -+ -+ if not return_dict: -+ return (last_hidden_state, None) + encoder_outputs[1:] -+ -+ return BaseModelOutputWithPooling( -+ last_hidden_state=last_hidden_state, -+ pooler_output=None, -+ hidden_states=encoder_outputs.hidden_states, -+ attentions=encoder_outputs.attentions, -+ ) -diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py -index 4e2e7f576..c8418c14e 100644 ---- a/vllm/model_executor/models/phi3v.py -+++ b/vllm/model_executor/models/phi3v.py -@@ -12,9 +12,9 @@ - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. 
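For context on the Siglip vision tower above: when every patch is valid, the mask is dropped instead of being expanded into a 4-D bias, which skips the costly unpad/pad round-trip on the flash-attention path. A minimal sketch of that check, for illustration only; the batch size, image size and patch size are assumptions, not values taken from the patch:

    import torch

    batch_size, image_size, patch_size = 2, 448, 14

    # Default mask: every patch attends (all True), shape [B, H/ps, W/ps].
    patch_attention_mask = torch.ones(
        (batch_size, image_size // patch_size, image_size // patch_size),
        dtype=torch.bool,
    )
    patch_attention_mask = patch_attention_mask.view(batch_size, -1)

    # If no patch is masked out, attending with the mask is equivalent to
    # attending to the full sequence, so the mask can simply be omitted.
    attention_mask = (None if not torch.any(~patch_attention_mask)
                      else patch_attention_mask)
    assert attention_mask is None
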
-+from collections.abc import Iterable, Mapping, Sequence - from functools import cached_property --from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, -- TypedDict, Union) -+from typing import Any, List, Literal, Optional, Set, Tuple, TypedDict, Union - - import torch - import torch.nn as nn -@@ -23,23 +23,28 @@ from transformers import (BatchFeature, CLIPVisionConfig, PretrainedConfig, - - from vllm.attention import AttentionMetadata - from vllm.config import VllmConfig --from vllm.inputs import InputContext - from vllm.logger import init_logger - from vllm.model_executor.layers.quantization import QuantizationConfig - from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler - from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding) --from vllm.model_executor.models.clip import CLIPVisionModel - from vllm.model_executor.sampling_metadata import SamplingMetadata - from vllm.multimodal import MULTIMODAL_REGISTRY --from vllm.multimodal.inputs import NestedTensors -+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, -+ MultiModalInputsV2, MultiModalKwargs, -+ NestedTensors, PlaceholderRange) -+from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, -+ ImageSize) - from vllm.multimodal.processing import (BaseMultiModalProcessor, -- MultiModalDataItems, ProcessorInputs, -- PromptReplacement) -+ MultiModalDataItems, ProcessingMixin, -+ PromptReplacement, -+ _BoundPromptReplacement, -+ _PlaceholderInfo) -+from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs - from vllm.sequence import IntermediateTensors - from vllm.utils import is_list_of - --from .clip import dummy_image_for_clip -+from .clip import CLIPVisionModel - from .interfaces import SupportsMultiModal, SupportsPP - from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, - init_vllm_registered_model, maybe_prefix, -@@ -50,10 +55,6 @@ logger = init_logger(__name__) - # Cannot find the following 2 numbers from hf config. 
- _IMAGE_TOKEN_ID = 32044 - --# Result in the max possible feature size (h:w = 16:1) --MAX_IMAGE_FEATURE_SIZE_HEIGHT = 8000 --MAX_IMAGE_FEATURE_SIZE_WIDTH = 50 -- - CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(dropout=0.0, - hidden_act="quick_gelu", - hidden_size=1024, -@@ -301,24 +302,7 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase): - return image_features_hd_newline - - --def get_max_phi3v_image_tokens( -- ctx: InputContext, -- *, -- num_crops: Optional[int] = None, --) -> int: -- mm_processor_kwargs = {} -- if num_crops: -- mm_processor_kwargs["num_crops"] = num_crops -- -- processor = ctx.get_hf_processor(**mm_processor_kwargs) -- -- return processor.calc_num_image_tokens_from_image_size( -- width=MAX_IMAGE_FEATURE_SIZE_WIDTH, -- height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, -- ) -- -- --class Phi3VMultiModalProcessor(BaseMultiModalProcessor): -+class Phi3VProcessingMixin(ProcessingMixin): - - def _get_hf_processor( - self, -@@ -327,85 +311,194 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor): - ) -> ProcessorMixin: - if num_crops is not None: - return self.ctx.get_hf_processor(num_crops=num_crops) -+ - return self.ctx.get_hf_processor() - -+ def _get_num_image_tokens( -+ self, -+ *, -+ image_width: int, -+ image_height: int, -+ ) -> int: -+ processor = self._get_hf_processor() -+ -+ return processor.calc_num_image_tokens_from_image_size( # type: ignore -+ width=image_width, -+ height=image_height, -+ ) -+ -+ -+class Phi3VProfilingInfo(Phi3VProcessingMixin, BaseProfilingInfo): -+ -+ def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: -+ return {"image": None} -+ -+ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: -+ target_width, target_height = self._get_image_size_with_most_features() -+ -+ max_image_tokens = self._get_num_image_tokens( -+ image_width=target_width, -+ image_height=target_height, -+ ) -+ -+ return {"image": max_image_tokens} -+ -+ def _get_image_size_with_most_features(self) -> ImageSize: -+ # Result in the max possible feature size (h:w = 16:1) -+ return ImageSize(height=8000, width=50) -+ -+ def get_dummy_processor_inputs( -+ self, -+ seq_len: int, -+ mm_counts: Mapping[str, int], -+ ) -> ProcessorInputs: -+ num_images = mm_counts.get("image", 0) -+ -+ target_width, target_height = self._get_image_size_with_most_features() -+ -+ mm_data = { -+ "image": -+ self._get_dummy_images(width=target_width, -+ height=target_height, -+ num_images=num_images) -+ } -+ -+ hf_processor = self._get_hf_processor() -+ image_tokens: list[str] = hf_processor.img_tokens # type: ignore -+ -+ return ProcessorInputs( -+ prompt_text="".join(image_tokens[:num_images]), -+ mm_data=mm_data, -+ ) -+ -+ -+class Phi3VMultiModalProcessor(Phi3VProcessingMixin, BaseMultiModalProcessor): -+ -+ def _get_profiling_info(self) -> BaseProfilingInfo: -+ return Phi3VProfilingInfo(self.ctx) -+ - def _call_hf_processor( - self, -- hf_processor: ProcessorMixin, - prompt: str, -- processor_data: Mapping[str, object], -- mm_processor_kwargs: Mapping[str, object], -+ mm_data: Mapping[str, object], -+ mm_kwargs: Mapping[str, object], - ) -> BatchFeature: - processed_outputs = super()._call_hf_processor( -- hf_processor, - prompt=prompt, -- processor_data=processor_data, -- mm_processor_kwargs=mm_processor_kwargs, -+ mm_data=mm_data, -+ mm_kwargs=mm_kwargs, - ) - -+ input_ids = processed_outputs["input_ids"] -+ assert isinstance(input_ids, torch.Tensor) -+ - # Phi3v processor has inserted -1, -2 etc as placeholder in prompt_ids, - # which will cause 
OverflowError when decoding the prompt_ids. - # Therefore, we need to do an early replacement here -- token_ids = processed_outputs['input_ids'] -- token_ids[token_ids < 0] = _IMAGE_TOKEN_ID -- processed_outputs['input_ids'] = token_ids -+ input_ids.masked_fill_(input_ids < 0, _IMAGE_TOKEN_ID) - - return processed_outputs - -+ def _get_mm_fields_config( -+ self, -+ hf_inputs: BatchFeature, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ ) -> Mapping[str, MultiModalFieldConfig]: -+ return dict( -+ pixel_values=MultiModalFieldConfig.batched("image"), -+ image_sizes=MultiModalFieldConfig.batched("image"), -+ image_embeds=MultiModalFieldConfig.batched("image"), -+ ) -+ - def _get_prompt_replacements( - self, - mm_items: MultiModalDataItems, -- hf_inputs: BatchFeature, -- mm_processor_kwargs: Mapping[str, object], -+ hf_processor_mm_kwargs: Mapping[str, Any], -+ out_mm_kwargs: MultiModalKwargs, - ) -> list[PromptReplacement]: -- hf_processor = self._get_hf_processor() -+ hf_processor = self._get_hf_processor(**hf_processor_mm_kwargs) - image_tokens: list[str] = hf_processor.img_tokens # type: ignore -- image_processor = hf_processor.image_processor # type: ignore - -- mm_config = self.ctx.get_mm_config() -- max_images = mm_config.limit_per_prompt.get("image", 1) -+ tokenizer = self._get_tokenizer() -+ bos_token_id = tokenizer.bos_token_id -+ assert isinstance(bos_token_id, int) - - def get_replacement_phi3v(item_idx: int): -- image_size = mm_items.get_image_size(item_idx) -- num_tokens = image_processor.calc_num_image_tokens_from_image_size( -- width=image_size.width, -- height=image_size.height, -- ) -+ images = mm_items.get_items( -+ "image", (ImageEmbeddingItems, ImageProcessorItems)) -+ -+ if isinstance(images, ImageEmbeddingItems): -+ num_image_tokens = images.get_feature_size(item_idx) -+ else: -+ image_size = images.get_image_size(item_idx) -+ num_image_tokens = self._get_num_image_tokens( -+ image_width=image_size.width, -+ image_height=image_size.height, -+ ) - -- return [_IMAGE_TOKEN_ID] * num_tokens -+ return [_IMAGE_TOKEN_ID] * num_image_tokens + [bos_token_id] -+ -+ num_images = mm_items.get_count("image", strict=False) - - return [ - PromptReplacement( - modality="image", - target=image_token, - replacement=get_replacement_phi3v, -- ) for image_token in image_tokens[:max_images] -+ ) for image_token in image_tokens[:num_images] - ] - -- def _get_dummy_mm_inputs( -+ def _apply_prompt_replacements( - self, -- mm_counts: Mapping[str, int], -- ) -> ProcessorInputs: -- num_images = mm_counts["image"] -- -- data = dummy_image_for_clip( -- CLIP_VIT_LARGE_PATCH14_336_CONFIG, -- num_images, -- image_width_override=MAX_IMAGE_FEATURE_SIZE_WIDTH, -- image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT, -+ token_ids: list[int], -+ mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]], -+ mm_item_counts: Mapping[str, int], -+ ) -> tuple[list[int], str, Mapping[str, list[_PlaceholderInfo]]]: -+ token_ids, text, placeholders = super()._apply_prompt_replacements( -+ token_ids=token_ids, -+ mm_prompt_repls=mm_prompt_repls, -+ mm_item_counts=mm_item_counts, - ) - -- hf_processor = self._get_hf_processor() -- image_tokens: list[str] = hf_processor.img_tokens # type: ignore -+ # Keep the behavior in line with HF processor -+ if text.startswith(" <|image|>"): -+ text = text.replace(" <|image|>", "<|image|>", 1) -+ token_ids = [token_ids[0], *token_ids[2:]] -+ placeholders = { -+ modality: [ -+ _PlaceholderInfo( -+ modality=p.modality, -+ item_idx=p.item_idx, -+ start_idx=p.start_idx - 1, 
-+ replacement=p.replacement, -+ ) for p in ps -+ ] -+ for modality, ps in placeholders.items() -+ } -+ -+ return token_ids, text, placeholders -+ -+ def apply( -+ self, -+ prompt_text: str, -+ mm_data: MultiModalDataDict, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ ) -> MultiModalInputsV2: -+ result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) - -- return ProcessorInputs( -- prompt_text="".join(image_tokens[:num_images]), -- mm_data=data, -- mm_processor_kwargs={}, -- ) -+ # Only <|image|> tokens should be considered as placeholders, -+ # so we ignore the trailing bos_token_id -+ result["mm_placeholders"] = { -+ modality: [ -+ PlaceholderRange(offset=p["offset"], length=p["length"] - 1) -+ for p in ps -+ ] -+ for modality, ps in result["mm_placeholders"].items() -+ } -+ -+ return result - - --@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_phi3v_image_tokens) - @MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor) - class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): - hf_to_vllm_mapper = WeightsMapper( -diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py -index f3d66c231..9e1d38512 100644 ---- a/vllm/model_executor/models/pixtral.py -+++ b/vllm/model_executor/models/pixtral.py -@@ -1,8 +1,8 @@ -+import math - from dataclasses import dataclass, fields - from functools import cached_property - from typing import Iterable, List, Mapping, Optional, Set, Tuple, Union - --import numpy - import torch - import torch.nn as nn - import torch.nn.functional as F -@@ -38,6 +38,7 @@ from vllm.sequence import IntermediateTensors, SequenceData - from .interfaces import SupportsMultiModal, SupportsPP - from .utils import (init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) -+from .vision import VisionEncoderInfo - - try: - from xformers import ops as xops -@@ -45,13 +46,6 @@ try: - except ImportError: - USE_XFORMERS_OPS = False - --# These token ids cannot be retrieved from model config --# so we hardcode them here. --PIXTRAL_12B_IMAGE_BREAK_ID = 12 --PIXTRAL_12B_IMAGE_END_ID = 13 --PIXTRAL_LARGE_IMAGE_BREAK_ID = 14 --PIXTRAL_LARGE_IMAGE_END_ID = 15 -- - - def get_max_pixtral_image_tokens(ctx: InputContext): - tokenizer = cached_get_tokenizer( -@@ -201,6 +195,13 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, - if key in dataclass_fields - } - -+ if not ("image_break_token_id" in vision_args -+ and "image_end_token_id" in vision_args): -+ raise ValueError( -+ "'image_break_token_id' and 'image_end_token_id' not found " -+ "in the vision_encoder arguments. Please download the latest " -+ "version of 'params.json' from the model repository.") -+ - self.vision_args = VisionEncoderArgs(**vision_args) - - # init MistralForCausalLM -@@ -240,9 +241,8 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, - - # NOTE: Image embeddings are split into separate tensors for each image - # by the indices of `[IMG_END]` token. 
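The hunk below replaces the hardcoded Pixtral-12B/Large break and end ids with the ids read from the vision encoder config, while keeping the split-at-`[IMG_END]` logic. A minimal sketch of that splitting, for illustration only; the token ids and hidden size are made up for the example:

    import torch

    IMAGE_TOKEN_ID, IMAGE_END_TOKEN_ID = 10, 13                  # illustrative ids only
    image_tokens = torch.tensor([10, 10, 13, 10, 10, 10, 13])    # two images
    image_embeds = torch.randn(image_tokens.numel(), 8)          # [num_tokens, hidden]

    image_end_mask = image_tokens == IMAGE_END_TOKEN_ID
    split_indices = torch.where(image_end_mask)[0] + 1

    if len(split_indices) <= 1:
        # Single image: no split needed.
        chunks = [image_embeds]
    else:
        # Split before each boundary, dropping the final one (end of tensor).
        chunks = torch.tensor_split(image_embeds, split_indices[:-1].tolist())

    print([tuple(c.shape) for c in chunks])   # [(3, 8), (4, 8)]
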
-- image_end_condition = (image_tokens == PIXTRAL_12B_IMAGE_END_ID) | ( -- image_tokens == PIXTRAL_LARGE_IMAGE_END_ID) -- split_indices = torch.where(image_end_condition)[0] + 1 -+ image_end_mask = image_tokens == self.vision_args.image_end_token_id -+ split_indices = torch.where(image_end_mask)[0] + 1 - if len(split_indices) <= 1: - # Do not split, return as tensor of shape [1, fs, hs] - return image_embeds.unsqueeze(0) -@@ -265,10 +265,8 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, [ - self.vision_args.image_token_id, -- PIXTRAL_12B_IMAGE_END_ID, -- PIXTRAL_12B_IMAGE_BREAK_ID, -- PIXTRAL_LARGE_IMAGE_BREAK_ID, -- PIXTRAL_LARGE_IMAGE_END_ID, -+ self.vision_args.image_break_token_id, -+ self.vision_args.image_end_token_id, - ]) - return inputs_embeds - -@@ -309,7 +307,7 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, - images: Optional[Union[List[List[torch.Tensor]], List[torch.Tensor], - torch.Tensor]] = None, - image_tokens: Optional[torch.Tensor] = None, -- ) -> Optional[List[torch.Tensor]]: -+ ) -> Tuple[Optional[List[torch.Tensor]], Optional[torch.Tensor]]: - if images is None: - return None, None - -@@ -409,6 +407,8 @@ class VisionEncoderArgs: - num_attention_heads: int - rope_theta: float # for rope-2D - image_token_id: int -+ image_break_token_id: int -+ image_end_token_id: int - adapter_bias: bool = True - - -@@ -605,11 +605,11 @@ class VisionTransformer(nn.Module): - return self.args.image_size // self.args.patch_size - - @property -- def device(self) -> torch.device: -+ def device(self) -> torch.types.Device: - return next(self.parameters()).device - - @property -- def dtype(self) -> torch.device: -+ def dtype(self) -> torch.dtype: - return next(self.parameters()).dtype - - @property -@@ -698,10 +698,18 @@ def get_pixtral_hf_patch_grid_length(*, image_size: int, - return image_size // patch_size - - --def get_pixtral_hf_num_patches(*, image_size: int, patch_size: int) -> int: -- grid_length = get_pixtral_hf_patch_grid_length(image_size=image_size, -- patch_size=patch_size) -- return grid_length * grid_length -+def get_pixtral_hf_image_feature_size( -+ *, -+ image_size: int, -+ patch_size: int, -+) -> int: -+ grid_length = get_pixtral_hf_patch_grid_length( -+ image_size=image_size, -+ patch_size=patch_size, -+ ) -+ -+ # Consider the image_break_token -+ return (grid_length + 1) * grid_length - - - def get_max_pixtral_hf_image_tokens(hf_config: PixtralVisionConfig) -> int: -@@ -731,26 +739,58 @@ def dummy_image_for_pixtral_hf( - return {"image": image if num_images == 1 else [image] * num_images} - - --def get_pixtral_hf_image_feature_size(hf_config: PixtralVisionConfig, -- image_width: int, -- image_height: int) -> Tuple[int, int]: -- # Adapted from transformers.models.pixtral.image_processing_pixtral.get_resize_output_image_size # noqa: E501 -- # https://github.com/huggingface/transformers/blob/2bd4d5897dc73e8b172832070a6f9e567a0df017/src/transformers/models/pixtral/image_processing_pixtral.py#L180 # noqa: E501 -- max_width, max_height = hf_config.image_size, hf_config.image_size -- patch_width, patch_height = hf_config.patch_size, hf_config.patch_size -+# Adapted from transformers.models.pixtral.image_processing_pixtral.get_resize_output_image_size # noqa: E501 -+# https://github.com/huggingface/transformers/blob/2bd4d5897dc73e8b172832070a6f9e567a0df017/src/transformers/models/pixtral/image_processing_pixtral.py#L180 -+def 
get_pixtral_hf_image_feature_grid_size( -+ hf_config: PixtralVisionConfig, -+ *, -+ image_width: int, -+ image_height: int, -+) -> tuple[int, int]: -+ max_width = max_height = hf_config.image_size -+ patch_width = patch_height = hf_config.patch_size - - ratio = max(image_width / max_width, image_height / max_height) - - if ratio > 1: -- image_width = int(numpy.ceil(image_width / ratio)) -- image_height = int(numpy.ceil(image_height / ratio)) -+ image_width = int(math.ceil(image_width / ratio)) -+ image_height = int(math.ceil(image_height / ratio)) - -- num_height_tokens, num_width_tokens = _get_pixtral_hf_num_image_tokens( -+ nrows, ncols = _get_pixtral_hf_num_image_tokens( - (image_height, image_width), - (patch_height, patch_width), -- ) -+ ) # type: ignore - -- return num_width_tokens, num_height_tokens -+ return ncols, nrows -+ -+ -+class PixtralHFEncoderInfo(VisionEncoderInfo[PixtralVisionConfig]): -+ -+ def get_num_image_tokens( -+ self, -+ *, -+ image_width: int, -+ image_height: int, -+ ) -> int: -+ return get_pixtral_hf_image_feature_size( -+ image_size=self.vision_config.image_size, -+ patch_size=self.get_image_size(), -+ ) -+ -+ def get_max_image_tokens(self) -> int: -+ return get_max_pixtral_hf_image_tokens(self.vision_config) -+ -+ def get_image_size(self) -> int: -+ return self.vision_config.image_size -+ -+ def get_patch_size(self) -> int: -+ return self.vision_config.patch_size -+ -+ def get_patch_grid_length(self) -> int: -+ return get_pixtral_hf_patch_grid_length( -+ image_size=self.vision_config.image_size, -+ patch_size=self.vision_config.patch_size, -+ ) - - - class PixtralHFMLP(nn.Module): -diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py -index 63d1374ab..baf955f6b 100644 ---- a/vllm/model_executor/models/qwen.py -+++ b/vllm/model_executor/models/qwen.py -@@ -225,7 +225,7 @@ class VisualAttentionBlock(nn.Module): - d_model: int, - n_head: int, - mlp_ratio: float = 4.0, -- norm_layer: Callable = nn.LayerNorm, -+ norm_layer: Callable[[int], nn.Module] = nn.LayerNorm, - quant_config: Optional[QuantizationConfig] = None, - ): - super().__init__() -@@ -266,7 +266,7 @@ class TransformerBlock(nn.Module): - layers: int, - heads: int, - mlp_ratio: float = 4.0, -- norm_layer: Callable = nn.LayerNorm, -+ norm_layer: Callable[[int], nn.Module] = nn.LayerNorm, - quant_config: Optional[QuantizationConfig] = None, - ): - super().__init__() -diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py -index 6259166a7..a7bb3425e 100644 ---- a/vllm/model_executor/models/qwen2_audio.py -+++ b/vllm/model_executor/models/qwen2_audio.py -@@ -23,10 +23,9 @@ from functools import cached_property - from typing import (Any, Iterable, List, Mapping, Optional, Set, Tuple, - TypedDict, Union) - --import numpy as np - import torch - import torch.nn as nn --from transformers import BatchFeature, ProcessorMixin -+from transformers import BatchFeature - from transformers.models.qwen2_audio import (Qwen2AudioConfig, - Qwen2AudioEncoder, - Qwen2AudioProcessor) -@@ -34,14 +33,16 @@ from transformers.models.whisper import WhisperFeatureExtractor - - from vllm.attention import AttentionMetadata - from vllm.config import VllmConfig --from vllm.inputs import InputContext - from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler - from vllm.model_executor.sampling_metadata import SamplingMetadata - from vllm.multimodal import MULTIMODAL_REGISTRY --from vllm.multimodal.inputs import NestedTensors -+from 
vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, -+ NestedTensors) -+from vllm.multimodal.parse import AudioProcessorItems, MultiModalDataParser - from vllm.multimodal.processing import (BaseMultiModalProcessor, -- MultiModalDataItems, ProcessorInputs, -+ MultiModalDataItems, ProcessingMixin, - PromptReplacement) -+from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs - from vllm.sequence import IntermediateTensors - - from .interfaces import SupportsMultiModal, SupportsPP -@@ -73,84 +74,149 @@ class Qwen2AudioMultiModalProjector(nn.Module): - - - # From Qwen2AudioEncoder._get_feat_extract_output_lengths --def _get_feat_extract_output_lengths(input_lengths: torch.LongTensor): -+def _get_feat_extract_output_lengths(input_lengths: torch.Tensor): - feat_lengths = (input_lengths - 1) // 2 + 1 - output_lengths = (feat_lengths - 2) // 2 + 1 - return feat_lengths, output_lengths - - --def get_max_qwen2_audio_audio_tokens(ctx: InputContext) -> int: -- hf_config = ctx.get_hf_config(Qwen2AudioConfig) -- max_source_position = hf_config.audio_config.max_source_positions -- output_lengths = (max_source_position - 2) // 2 + 1 -- return output_lengths -+class Qwen2AudioProcessingMixin(ProcessingMixin): - -+ def _get_hf_config(self): -+ return self.ctx.get_hf_config(Qwen2AudioConfig) - --class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor): -- -- def _get_hf_processor(self) -> Qwen2AudioProcessor: -+ def _get_hf_processor( -+ self, -+ *, -+ # Ignored in initialization -+ sampling_rate: Optional[int] = None, -+ ) -> Qwen2AudioProcessor: - return self.ctx.get_hf_processor(Qwen2AudioProcessor) - -- def _get_feature_extractor(self) -> WhisperFeatureExtractor: -- return self._get_hf_processor().feature_extractor # type: ignore -+ def _get_feature_extractor( -+ self, -+ *, -+ # Ignored in initialization -+ sampling_rate: Optional[int] = None, -+ ) -> WhisperFeatureExtractor: -+ hf_processor = self._get_hf_processor(sampling_rate=sampling_rate) -+ feature_extractor = hf_processor.feature_extractor # type: ignore -+ assert isinstance(feature_extractor, WhisperFeatureExtractor) -+ return feature_extractor -+ -+ -+class Qwen2AudioProfilingInfo(Qwen2AudioProcessingMixin, BaseProfilingInfo): - -- def _get_processor_data( -+ def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: -+ return {"audio": None} -+ -+ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: -+ hf_config = self._get_hf_config() -+ max_source_positions = hf_config.audio_config.max_source_positions -+ max_output_lengths = (max_source_positions - 2) // 2 + 1 -+ -+ return {"audio": max_output_lengths} -+ -+ def get_dummy_processor_inputs( - self, -- mm_items: MultiModalDataItems, -- ) -> tuple[dict[str, Any], dict[str, Any]]: -- # resample audio to the model's sampling rate -+ seq_len: int, -+ mm_counts: Mapping[str, int], -+ ) -> ProcessorInputs: - feature_extractor = self._get_feature_extractor() -- mm_items.resample_audios(feature_extractor.sampling_rate) - -- return super()._get_processor_data(mm_items) -+ sampling_rate = feature_extractor.sampling_rate -+ audio_len = feature_extractor.chunk_length * sampling_rate -+ num_audios = mm_counts.get("audio", 0) -+ -+ mm_data = { -+ "audio": -+ self._get_dummy_audios(length=audio_len, num_audios=num_audios) -+ } -+ -+ return ProcessorInputs( -+ prompt_text="<|AUDIO|>" * num_audios, -+ mm_data=mm_data, -+ ) -+ -+ -+class Qwen2AudioMultiModalProcessor(Qwen2AudioProcessingMixin, -+ BaseMultiModalProcessor): -+ -+ def 
_get_profiling_info(self) -> BaseProfilingInfo: -+ return Qwen2AudioProfilingInfo(self.ctx) -+ -+ def _get_data_parser(self) -> MultiModalDataParser: -+ feature_extractor = self._get_feature_extractor() -+ return MultiModalDataParser(target_sr=feature_extractor.sampling_rate) - - def _call_hf_processor( - self, -- hf_processor: ProcessorMixin, - prompt: str, -- processor_data: Mapping[str, object], -- mm_processor_kwargs: Mapping[str, object], -+ mm_data: Mapping[str, object], -+ mm_kwargs: Mapping[str, Any], - ) -> BatchFeature: -- processor_data = dict(processor_data) -- audios = processor_data.pop("audios", []) -+ mm_data = dict(mm_data) -+ audios = mm_data.pop("audios", []) - - if audios: -- processor_data["audios"] = audios -+ mm_data["audios"] = audios - -- feature_extractor = self._get_feature_extractor() -- mm_processor_kwargs = dict( -- **mm_processor_kwargs, -+ feature_extractor = self._get_feature_extractor(**mm_kwargs) -+ mm_kwargs = dict( -+ **mm_kwargs, - sampling_rate=feature_extractor.sampling_rate, - ) - else: - # NOTE: WhisperFeatureExtractor cannot handle empty list of audios - pass - -- return super()._call_hf_processor( -- hf_processor, -+ processed_outputs = super()._call_hf_processor( - prompt=prompt, -- processor_data=processor_data, -- mm_processor_kwargs=mm_processor_kwargs, -+ mm_data=mm_data, -+ mm_kwargs=mm_kwargs, -+ ) -+ -+ return processed_outputs -+ -+ def _get_mm_fields_config( -+ self, -+ hf_inputs: BatchFeature, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ ) -> Mapping[str, MultiModalFieldConfig]: -+ return dict( -+ input_features=MultiModalFieldConfig.batched("audio"), -+ feature_attention_mask=MultiModalFieldConfig.batched("audio"), - ) - - def _get_prompt_replacements( - self, - mm_items: MultiModalDataItems, -- hf_inputs: BatchFeature, -- mm_processor_kwargs: Mapping[str, object], -+ hf_processor_mm_kwargs: Mapping[str, object], -+ out_mm_kwargs: MultiModalKwargs, - ) -> list[PromptReplacement]: -- hf_config = self.ctx.get_hf_config(Qwen2AudioConfig) -+ hf_config = self._get_hf_config() - placeholder = hf_config.audio_token_index - -- feature_attention_mask = hf_inputs.get("feature_attention_mask") -+ feature_attention_mask = out_mm_kwargs.get("feature_attention_mask") - if feature_attention_mask is None: - audio_output_lengths = [] - else: -- _, audio_output_lengths = _get_feat_extract_output_lengths( -+ assert isinstance(feature_attention_mask, torch.Tensor) -+ _, audio_output_lens = _get_feat_extract_output_lengths( - feature_attention_mask.sum(-1)) - -+ audio_output_lengths = audio_output_lens.tolist() -+ - def get_replacement_qwen2_audio(item_idx: int): -- return [placeholder] * audio_output_lengths[item_idx] -+ num_placeholders = audio_output_lengths[item_idx] -+ if num_placeholders == 0: -+ audios = mm_items.get_items("audio", AudioProcessorItems) -+ audio = audios.get(item_idx) -+ raise ValueError( -+ f"The audio {audio} (len={len(audio)}) is too short " -+ "to be represented inside the model") -+ -+ return [placeholder] * num_placeholders - - return [ - PromptReplacement( -@@ -160,27 +226,15 @@ class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor): - ) - ] - -- def _get_dummy_mm_inputs( -- self, -- mm_counts: Mapping[str, int], -- ) -> ProcessorInputs: -- feature_extractor = self._get_feature_extractor() -- sampling_rate = feature_extractor.sampling_rate -- audio_len = feature_extractor.chunk_length * sampling_rate -- -- audio_count = mm_counts["audio"] -- audio = np.zeros(audio_len) -- data = {"audio": [audio] * audio_count} 
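As a sanity check on the audio token accounting above, a worked example for illustration only, assuming the standard Whisper feature extractor (30 s chunks at 100 mel frames per second) and an assumed encoder limit of max_source_positions = 1500:

    def get_feat_extract_output_lengths(input_lengths: int) -> tuple[int, int]:
        # Two stride-2 stages: mel frames -> encoder features -> pooled audio tokens.
        feat_lengths = (input_lengths - 1) // 2 + 1
        output_lengths = (feat_lengths - 2) // 2 + 1
        return feat_lengths, output_lengths

    # A full 30 s chunk -> 3000 mel frames -> 1500 features -> 750 placeholder tokens.
    print(get_feat_extract_output_lengths(3000))    # (1500, 750)

    # The profiling bound above computes the same ceiling directly from the config.
    max_source_positions = 1500                      # assumed encoder limit
    print((max_source_positions - 2) // 2 + 1)       # 750
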
-- -- return ProcessorInputs( -- prompt_text="<|AUDIO|>" * audio_count, -- mm_data=data, -- mm_processor_kwargs={}, -- ) -+ def _always_apply_prompt_replacements(self) -> bool: -+ # HF never applies prompt replacements, so we have to do it ourselves. -+ # NOTE: `_find_placeholders_by_modality` may incorrectly think that HF -+ # has already performed processing for multi-audio input when the input -+ # audios are short (the corresponding placeholders may take up fewer -+ # tokens than the number of audio items) -+ return True - - --@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( -- "audio", get_max_qwen2_audio_audio_tokens) - @MULTIMODAL_REGISTRY.register_processor(Qwen2AudioMultiModalProcessor) - class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, - SupportsPP): -diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py -index fb97eb191..2a9935cca 100644 ---- a/vllm/model_executor/models/qwen2_vl.py -+++ b/vllm/model_executor/models/qwen2_vl.py -@@ -22,14 +22,13 @@ - # limitations under the License. - """Inference-only Qwen2-VL model compatible with HuggingFace weights.""" - from functools import cached_property, partial --from typing import (Any, Iterable, List, Literal, Mapping, Optional, Set, -- Tuple, Type, TypedDict, Union) -+from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional, -+ Set, Tuple, Type, TypedDict, Union) - - import torch - import torch.nn as nn - import torch.nn.functional as F - from einops import rearrange, repeat --from PIL import Image - from transformers import BatchFeature - from transformers.models.qwen2_vl import (Qwen2VLImageProcessor, - Qwen2VLProcessor) -@@ -39,9 +38,8 @@ from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize - - from vllm.attention import AttentionMetadata - from vllm.config import VllmConfig --from vllm.distributed import parallel_state -+from vllm.distributed import parallel_state, tensor_model_parallel_all_gather - from vllm.distributed import utils as dist_utils --from vllm.inputs import InputContext - from vllm.logger import init_logger - from vllm.model_executor import SamplingMetadata - from vllm.model_executor.layers.activation import QuickGELU -@@ -53,15 +51,20 @@ from vllm.model_executor.layers.quantization.gptq_marlin import ( - GPTQMarlinConfig) - from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler - from vllm.model_executor.model_loader.weight_utils import default_weight_loader -+from vllm.model_executor.models.module_mapping import MultiModelKeys - from vllm.multimodal import MULTIMODAL_REGISTRY --from vllm.multimodal.inputs import MultiModalDataDict, NestedTensors -+from vllm.multimodal.inputs import (ImageItem, ModalityData, -+ MultiModalFieldConfig, MultiModalKwargs, -+ NestedTensors, VideoItem) -+from vllm.multimodal.parse import (ImageSize, ModalityDataItems, -+ MultiModalDataParser) - from vllm.multimodal.processing import (BaseMultiModalProcessor, -- MultiModalDataItems, ProcessorInputs, -+ MultiModalDataItems, ProcessingMixin, - PromptReplacement) -+from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs - from vllm.platforms import _Backend - from vllm.sequence import IntermediateTensors - from vllm.transformers_utils.config import uses_mrope --from vllm.utils import is_list_of - - from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP - from .utils import (AutoWeightsLoader, WeightsMapper, get_vit_attn_backend, -@@ -229,15 +232,17 @@ class 
Qwen2VisionAttention(nn.Module): - - def __init__( - self, -- embed_dim: Optional[int] = None, -- num_heads: Optional[int] = None, -- projection_size: Optional[int] = None, -+ embed_dim: int, -+ num_heads: int, -+ projection_size: int, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__() - # Per attention head and per partition values. - world_size = parallel_state.get_tensor_model_parallel_world_size() -+ self.tp_size = world_size -+ self.tp_rank = parallel_state.get_tensor_model_parallel_rank() - self.hidden_size_per_attention_head = dist_utils.divide( - projection_size, num_heads) - self.num_attention_heads_per_partition = dist_utils.divide( -@@ -253,31 +258,73 @@ class Qwen2VisionAttention(nn.Module): - prefix=f"{prefix}.proj") - - # Detect attention implementation. -- self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) -- if self.attn_backend not in { -- _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS -- }: -- raise RuntimeError( -- f"Qwen2-VL does not support {self.attn_backend} backend now.") -+ # selected_backend: Optional[_Backend] = get_global_forced_attn_backend() -+ # if selected_backend is None: -+ # backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND -+ # if backend_by_env_var is not None: -+ # selected_backend = backend_name_to_enum(backend_by_env_var) -+ # if selected_backend is None: -+ # # For Volta and Turing GPUs, use xformers instead. -+ # device_available = current_platform.get_device_capability()[0] >= 8 -+ # if device_available: -+ # from transformers.utils import is_flash_attn_2_available -+ -+ # if is_flash_attn_2_available(): -+ # self._use_flash_attn = True -+ # else: -+ # logger.warning( -+ # "Current Qwen2-VL implementation has a bug with " -+ # "`vllm-flash-attn` inside vision module, so we use " -+ # "xformers backend instead. You can run `pip install " -+ # "flash-attn to use flash-attention backend.") -+ # self._use_flash_attn = False -+ # else: -+ # self._use_flash_attn = False -+ # else: -+ # if selected_backend == _Backend.FLASH_ATTN: -+ # self._use_flash_attn = True -+ # elif selected_backend == _Backend.XFORMERS: -+ # self._use_flash_attn = False -+ # else: -+ # raise RuntimeError( -+ # f"Qwen2-VL does not support {selected_backend} backend now." 
-+ # ) -+ -+ def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: -+ # [s, b, 3 * head * head_dim] -+ seq_len, bs, _ = qkv.shape -+ if self.tp_size > 1: -+ qkv = tensor_model_parallel_all_gather(qkv) -+ -+ # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim] -+ q, k, v = qkv.chunk(3, dim=2) -+ -+ # 3 * [s, b, head * head_dim] -+ if self.tp_size > 1: -+ splitter = partial(dist_utils.split_tensor_along_last_dim, -+ num_partitions=self.tp_size) -+ q = splitter(q)[self.tp_rank] -+ k = splitter(k)[self.tp_rank] -+ v = splitter(v)[self.tp_rank] -+ -+ # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim] -+ new_shape = (seq_len, bs, self.num_attention_heads_per_partition, -+ self.hidden_size_per_attention_head) -+ q, k, v = (x.view(*new_shape) for x in (q, k, v)) -+ return q, k, v - - def forward( - self, - x: torch.Tensor, - cu_seqlens: torch.Tensor, -- rotary_pos_emb: torch.Tensor = None, -+ rotary_pos_emb: torch.Tensor, - ) -> torch.Tensor: -+ seq_length = x.shape[0] - # [s, b, c] --> [s, b, head * 3 * head_dim] - x, _ = self.qkv(x) - -- # [s, b, head * 3 * head_dim] --> [s, b, head, 3 * head_dim] -- new_x_shape = x.size()[:-1] + ( -- self.num_attention_heads_per_partition, -- 3 * self.hidden_size_per_attention_head, -- ) -- x = x.view(*new_x_shape) -- -- # [s, b, head, 3 * head_dim] --> 3 [s, b, head, head_dim] -- q, k, v = dist_utils.split_tensor_along_last_dim(x, 3) -+ # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim] -+ q, k, v = self.split_qkv(x) - batch_size = q.shape[1] - - q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() -@@ -285,57 +332,61 @@ class Qwen2VisionAttention(nn.Module): - if rotary_pos_emb is not None: - q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) - k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) -- -- if self.attn_backend == _Backend.FLASH_ATTN: -- # from vllm_flash_attn.flash_attn_interface import ( -- # flash_attn_varlen_func) -- from flash_attn import flash_attn_varlen_func -- -- q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) -- -- max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() -- output = flash_attn_varlen_func(q, -- k, -- v, -- cu_seqlens_q=cu_seqlens, -- cu_seqlens_k=cu_seqlens, -- max_seqlen_q=max_seqlen, -- max_seqlen_k=max_seqlen, -- dropout_p=0, -- causal=False) -- -- context_layer = rearrange(output, -- "(b s) ... 
-> b s ...", -- b=batch_size) -- elif self.attn_backend == _Backend.TORCH_SDPA: -- seq_length = q.size(1) -- q, k, v = (rearrange(x, "b s h d -> b h s d") for x in [q, k, v]) -- attention_mask = torch.zeros([1, seq_length, seq_length], -- device=q.device, -- dtype=torch.bool) -- for i in range(1, len(cu_seqlens)): -- attention_mask[..., cu_seqlens[i - 1]:cu_seqlens[i], -- cu_seqlens[i - 1]:cu_seqlens[i]] = True -- output = F.scaled_dot_product_attention(q, -- k, -- v, -- attention_mask, -- dropout_p=0.0) -- context_layer = rearrange(output, "b h s d -> b s h d ") -- elif self.attn_backend == _Backend.XFORMERS: -- from xformers import ops as xops -- from xformers.ops.fmha.attn_bias import BlockDiagonalMask -- -- seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() -- attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens, -- kv_seqlen=None) -- -- context_layer = xops.memory_efficient_attention_forward( -- q, k, v, attn_bias=attn_bias, p=0, scale=None) -- context_layer = rearrange(context_layer, -- "b s h d -> s b (h d)").contiguous() -- -- output, _ = self.proj(context_layer) -+ query = q.movedim(1, 2) -+ key = k.movedim(1, 2) -+ value = v.movedim(1, 2) -+ head_dim = query.shape[-1] -+ # if len(cu_seqlens) == 2 and cu_seqlens.tolist() == [0, seq_length]: -+ # attention_mask = None -+ # else: -+ # attention_mask = torch.full( -+ # [1, seq_length, seq_length], torch.finfo(q.dtype).min, device=q.device, dtype=q.dtype -+ # ) -+ # for i in range(1, len(cu_seqlens)): -+ # attention_mask[..., cu_seqlens[i - 1]:cu_seqlens[i], -+ # cu_seqlens[i - 1]:cu_seqlens[i]] = 0 -+ #from ipex_llm.transformers.models.common import attention_softmax -+ from ipex_llm.transformers.models.utils import use_sdp_non_causal -+ import math -+ seq_lens = [] -+ for i in range(1, len(cu_seqlens)): -+ seq_lens.append(cu_seqlens[i]-cu_seqlens[i-1]) -+ att_masks = [None] * len(seq_lens) -+ -+ num_tokens = q.shape[0] * q.shape[1] -+ attn_output = torch.empty( -+ (num_tokens, self.num_attention_heads_per_partition, self.hidden_size_per_attention_head), -+ dtype=query.dtype, device=query.device) -+ start = 0 -+ for seq_len, mask in zip(seq_lens, -+ att_masks): -+ end = start + seq_len -+ if use_sdp_non_causal(head_dim, q.device, q.dtype): -+ import xe_addons -+ scale = 1 / math.sqrt(head_dim) -+ if mask is not None: -+ mask = mask.unsqueeze(0) -+ sub_out = xe_addons.sdp_non_causal( -+ query[:, :, start:end, :].contiguous(), -+ key[:, :, start:end, :].contiguous(), -+ value[:, :, start:end, :].contiguous(), -+ mask, -+ scale).squeeze(0).movedim(0, 1) -+ else: -+ sub_out = torch.nn.functional.scaled_dot_product_attention( -+ query[:, :, start:end, :], -+ key[:, :, start:end, :], -+ value[:, :, start:end, :], -+ attn_mask=mask, -+ dropout_p=0.0, -+ is_causal=False, -+ scale= self.hidden_size_per_attention_head**-0.5).squeeze(0).movedim( -+ 0, 1) -+ attn_output[start:end, :, :] = sub_out -+ start = end -+ output = attn_output.reshape(-1, batch_size, self.hidden_size_per_attention_head * self.num_attention_heads_per_partition) -+ -+ output, _ = self.proj(output) - return output - - -@@ -347,7 +398,7 @@ class Qwen2VisionBlock(nn.Module): - num_heads: int, - mlp_ratio: float, - act_layer: Type[nn.Module] = QuickGELU, -- norm_layer: Type[nn.Module] = None, -+ norm_layer: Optional[Callable[[int], nn.Module]] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: -@@ -384,7 +435,7 @@ class Qwen2VisionPatchEmbed(nn.Module): - self, - patch_size: int = 14, - temporal_patch_size: int = 2, -- 
in_chans: int = 3, -+ in_channels: int = 3, - embed_dim: int = 1152, - ) -> None: - super().__init__() -@@ -392,8 +443,8 @@ class Qwen2VisionPatchEmbed(nn.Module): - self.temporal_patch_size = temporal_patch_size - self.embed_dim = embed_dim - -- kernel_size = [temporal_patch_size, patch_size, patch_size] -- self.proj = nn.Conv3d(in_chans, -+ kernel_size = (temporal_patch_size, patch_size, patch_size) -+ self.proj = nn.Conv3d(in_channels, - embed_dim, - kernel_size=kernel_size, - stride=kernel_size, -@@ -413,7 +464,7 @@ class Qwen2VisionPatchMerger(nn.Module): - self, - d_model: int, - context_dim: int, -- norm_layer: Type[nn.Module] = None, -+ norm_layer: Optional[Callable[[int], nn.Module]] = None, - spatial_merge_size: int = 2, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", -@@ -489,15 +540,15 @@ class Qwen2VisionTransformer(nn.Module): - ) -> None: - super().__init__() - -- patch_size: int = vision_config.patch_size -- temporal_patch_size: int = vision_config.temporal_patch_size -- spatial_merge_size: int = vision_config.spatial_merge_size -- in_chans: int = vision_config.in_chans -- hidden_size: int = vision_config.hidden_size -- embed_dim: int = vision_config.embed_dim -- depth: int = vision_config.depth -- num_heads: int = vision_config.num_heads -- mlp_ratio: float = vision_config.mlp_ratio -+ patch_size = vision_config.patch_size -+ temporal_patch_size = vision_config.temporal_patch_size -+ spatial_merge_size = vision_config.spatial_merge_size -+ in_channels = vision_config.in_channels -+ hidden_size = vision_config.hidden_size -+ embed_dim = vision_config.embed_dim -+ depth = vision_config.depth -+ num_heads = vision_config.num_heads -+ mlp_ratio = vision_config.mlp_ratio - - self.spatial_merge_size = spatial_merge_size - self.num_heads = num_heads -@@ -506,7 +557,7 @@ class Qwen2VisionTransformer(nn.Module): - self.patch_embed = Qwen2VisionPatchEmbed( - patch_size=patch_size, - temporal_patch_size=temporal_patch_size, -- in_chans=in_chans, -+ in_channels=in_channels, - embed_dim=embed_dim, - ) - -@@ -570,9 +621,7 @@ class Qwen2VisionTransformer(nn.Module): - grid_thw: torch.Tensor, - ) -> torch.Tensor: - # patchify -- x = x.to(device=self.device, dtype=self.dtype) - x = self.patch_embed(x) -- - # compute position embedding - rotary_pos_emb = self.rot_pos_emb(grid_thw) - -@@ -613,24 +662,6 @@ class Qwen2VisionTransformer(nn.Module): - weight_loader(param, loaded_weight, shard_id) - break - else: -- if name.endswith("qkv.weight"): -- visual_num_heads = self.num_heads -- visual_embed_dim = self.embed_dim -- head_size = visual_embed_dim // visual_num_heads -- loaded_weight = loaded_weight.view(3, visual_num_heads, -- head_size, -- visual_embed_dim) -- loaded_weight = loaded_weight.transpose(0, 1) -- loaded_weight = loaded_weight.reshape(-1, visual_embed_dim) -- elif name.endswith("qkv.bias"): -- visual_num_heads = self.num_heads -- visual_embed_dim = self.embed_dim -- head_size = visual_embed_dim // visual_num_heads -- loaded_weight = loaded_weight.view(3, visual_num_heads, -- head_size) -- loaded_weight = loaded_weight.transpose(0, 1) -- loaded_weight = loaded_weight.reshape(-1) -- - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) -@@ -639,129 +670,76 @@ class Qwen2VisionTransformer(nn.Module): - return loaded_params - - --# === Vision input helpers === # -- -- --def _get_vision_info( -- vision_config: Qwen2VLVisionConfig, -- height: int, -- width: int, -- min_pixels: int, -- max_pixels: int, -- 
do_resize: bool = True, -- data_type_key: str = "image", -- mm_count: int = 1, --): -- """Get information (resized height / width and number of vision tokens) -- of input image / video frame.""" -- patch_size = vision_config.patch_size -- merge_size = vision_config.spatial_merge_size -- temporal_patch_size = vision_config.temporal_patch_size -- -- if do_resize: -- resized_height, resized_width = smart_resize( -- height=height, -- width=width, -- factor=patch_size * merge_size, -- min_pixels=min_pixels, -- max_pixels=max_pixels, -- ) -- else: -- resized_height, resized_width = height, width -+class Qwen2EmbeddingItems(ModalityDataItems[dict[str, torch.Tensor], -+ dict[str, torch.Tensor]]): - -- if data_type_key == "image": -- grid_t = mm_count -- else: -- assert data_type_key == "video" -- grid_t = max(mm_count // temporal_patch_size, 1) -+ def __init__(self, data: dict, modality: str) -> None: -+ super().__init__(data, modality) - -- grid_h = resized_height // patch_size -- grid_w = resized_width // patch_size -- vision_tokens = grid_t * grid_h * grid_w -- llm_num_vision_tokens = vision_tokens // (merge_size**2) -+ grid_thw = data[f"{modality}_grid_thw"] -+ slice_idxs = [0] + grid_thw.prod(-1).cumsum_(0).tolist() -+ self._slices = [ -+ slice(slice_idxs[i], slice_idxs[i + 1]) -+ for i in range(len(grid_thw)) -+ ] - -- return resized_height, resized_width, llm_num_vision_tokens -+ def get_count(self) -> int: -+ return len(self.data[f"{self.modality}_grid_thw"]) - -+ def get(self, index: int) -> dict[str, torch.Tensor]: -+ out = {} -+ for k, v in self.data.items(): -+ if v != f"{self.modality}_grid_thw": -+ v = v[self._slices[index]] - --def _get_image_processor(hf_processor: Qwen2VLProcessor): -- image_processor = hf_processor.image_processor # type: ignore -- assert isinstance(image_processor, Qwen2VLImageProcessor) -- return image_processor -+ out[k] = v - -+ return out - --def get_max_qwen2_vl_mm_tokens(ctx: InputContext, -- data_type_key: str, -- *, -- min_pixels: Optional[int] = None, -- max_pixels: Optional[int] = None) -> int: -- hf_config = ctx.get_hf_config(Qwen2VLConfig) -- vision_config = hf_config.vision_config -+ def get_processor_data(self) -> Mapping[str, object]: -+ return {} - -- hf_processor = ctx.get_hf_processor(Qwen2VLProcessor) -- image_processor = _get_image_processor(hf_processor) -+ def get_passthrough_data(self) -> Mapping[str, object]: -+ return self.data - -- _, _, max_llm_image_tokens = _get_vision_info( -- vision_config, -- height=9999999, -- width=9999999, -- min_pixels=min_pixels or image_processor.min_pixels, -- max_pixels=max_pixels or image_processor.max_pixels, -- data_type_key=data_type_key, -- ) -- return max_llm_image_tokens - -+class Qwen2ImageEmbeddingItems(Qwen2EmbeddingItems): - --get_max_qwen2_vl_image_tokens = partial(get_max_qwen2_vl_mm_tokens, -- data_type_key="image") --get_max_qwen2_vl_video_tokens = partial(get_max_qwen2_vl_mm_tokens, -- data_type_key="video") -+ def __init__(self, data: dict) -> None: -+ super().__init__(data, "image") - - --class Qwen2VLMultiModalDataItems(MultiModalDataItems): -+class Qwen2VideoEmbeddingItems(Qwen2EmbeddingItems): - -- @staticmethod -- def from_dict(data: MultiModalDataDict) -> "MultiModalDataItems": -- """ -- Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`. 
-- """ -- multi_data = Qwen2VLMultiModalDataItems() -- -- for k, v in data.items(): -- # TODO: Make a separate modality for embedding inputs -- # to avoid confusion -- # yapf: disable -- if k == "video": -- # Special case since even a single item can be a list -- multi_data[k] = ( # type: ignore[index] -- v if (isinstance(v, (dict, torch.Tensor)) # type: ignore[assignment] -- or is_list_of(v, list)) else [v] -- ) -- elif k in ("image", "audio"): -- multi_data[k] = ( # type: ignore[index] -- v if isinstance(v, (dict, torch.Tensor, list)) else [v] -- ) -- else: -- multi_data[k] = v if isinstance(v, list) else [v] # type: ignore[index] -- # yapf: enable -+ def __init__(self, data: dict) -> None: -+ super().__init__(data, "video") - -- return multi_data - -- def get_item_counts(self) -> Mapping[str, int]: -- return { -- m: ( -- len(items[f"{m}_grid_thw"]) # type: ignore -- if isinstance(items, dict) else len(items)) -- for m, items in self.items() -- } -+class Qwen2MultiModalDataParser(MultiModalDataParser): - -+ def _parse_image_data( -+ self, -+ data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]], -+ ) -> ModalityDataItems[Any, Any]: -+ if isinstance(data, dict): -+ return Qwen2EmbeddingItems(data, modality="image") - --class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): -+ return super()._parse_image_data(data) - -- def _get_mm_items( -+ def _parse_video_data( - self, -- mm_data: MultiModalDataDict, -- ) -> MultiModalDataItems: -- return Qwen2VLMultiModalDataItems.from_dict(mm_data) -+ data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]], -+ ) -> ModalityDataItems[Any, Any]: -+ if isinstance(data, dict): -+ return Qwen2EmbeddingItems(data, modality="video") -+ -+ return super()._parse_video_data(data) -+ -+ -+class Qwen2VLProcessingMixin(ProcessingMixin): -+ -+ def _get_hf_config(self): -+ return self.ctx.get_hf_config(Qwen2VLConfig) - - def _get_hf_processor( - self, -@@ -770,7 +748,8 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): - max_pixels: Optional[int] = None, - ) -> Qwen2VLProcessor: - hf_processor = self.ctx.get_hf_processor(Qwen2VLProcessor) -- image_processor = _get_image_processor(hf_processor) -+ image_processor = hf_processor.image_processor # type: ignore -+ assert isinstance(image_processor, Qwen2VLImageProcessor) - - if min_pixels: - image_processor.min_pixels = min_pixels -@@ -784,43 +763,206 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): - - return hf_processor - -- def _get_processor_data( -+ def _get_image_processor( - self, -- mm_items: MultiModalDataItems, -- ) -> tuple[dict[str, Any], dict[str, Any]]: -- processor_data = dict[str, Any]() -- passthrough_data = dict[str, Any]() -- -- for k, v in mm_items.items(): -- # TODO: Make a separate modality for embedding inputs -- # to avoid confusion -- if k in ("image", "video", "audio"): -- if isinstance(v, dict): -- # Pass through embedding inputs (dict) -- passthrough_data.update(v) -- elif isinstance(v, torch.Tensor) and v.ndim == 3: -- # Pass through embedding inputs (single) -- passthrough_data[f"{k}_embeds"] = [v] -- elif (is_list_of(v, torch.Tensor) and len(v) > 0 -- and v[0].ndim == 2): -- # Pass through embedding inputs (multi) -- passthrough_data[f"{k}_embeds"] = v -- else: -- # Map keys to plural form, e.g.: image -> images -- processor_data[f"{k}s"] = v -- else: -- processor_data[k] = v -+ *, -+ min_pixels: Optional[int] = None, -+ max_pixels: Optional[int] = None, -+ ): -+ hf_processor = self._get_hf_processor(min_pixels=min_pixels, -+ 
max_pixels=max_pixels) -+ image_processor = hf_processor.image_processor # type: ignore -+ assert isinstance(image_processor, Qwen2VLImageProcessor) -+ return image_processor -+ -+ def _get_vision_info( -+ self, -+ *, -+ image_width: int, -+ image_height: int, -+ num_frames: int = 1, -+ do_resize: bool = True, -+ ) -> tuple[ImageSize, int]: -+ hf_config = self._get_hf_config() -+ vision_config = hf_config.vision_config -+ patch_size = vision_config.patch_size -+ merge_size = vision_config.spatial_merge_size -+ temporal_patch_size = vision_config.temporal_patch_size -+ -+ image_processor = self._get_image_processor() -+ -+ if do_resize: -+ resized_height, resized_width = smart_resize( -+ height=image_height, -+ width=image_width, -+ factor=patch_size * merge_size, -+ min_pixels=image_processor.min_pixels, -+ max_pixels=image_processor.max_pixels, -+ ) -+ preprocessed_size = ImageSize(width=resized_width, -+ height=resized_height) -+ else: -+ preprocessed_size = ImageSize(width=image_width, -+ height=image_height) -+ -+ grid_t = max(num_frames // temporal_patch_size, 1) -+ grid_h = preprocessed_size.height // patch_size -+ grid_w = preprocessed_size.width // patch_size -+ -+ num_patches = grid_t * grid_h * grid_w -+ num_vision_tokens = num_patches // (merge_size**2) -+ -+ return preprocessed_size, num_vision_tokens -+ -+ def _get_num_image_tokens( -+ self, -+ *, -+ image_width: int, -+ image_height: int, -+ ) -> int: -+ _, num_image_tokens = self._get_vision_info( -+ image_width=image_width, -+ image_height=image_height, -+ ) -+ return num_image_tokens -+ -+ def _get_num_video_tokens( -+ self, -+ *, -+ image_width: int, -+ image_height: int, -+ num_frames: int, -+ ) -> int: -+ _, num_video_tokens = self._get_vision_info( -+ image_width=image_width, -+ image_height=image_height, -+ num_frames=num_frames, -+ ) -+ return num_video_tokens -+ -+ -+class Qwen2VLProfilingInfo(Qwen2VLProcessingMixin, BaseProfilingInfo): -+ -+ def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: -+ return {"image": None, "video": None} -+ -+ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: -+ return { -+ "image": self._get_max_image_tokens(), -+ "video": self._get_max_video_tokens(seq_len), -+ } -+ -+ def _get_image_size_with_most_features(self) -> ImageSize: -+ max_image_size, _ = self._get_vision_info( -+ image_width=9999999, -+ image_height=9999999, -+ ) -+ return max_image_size - -- return processor_data, passthrough_data -+ def _get_max_image_tokens(self) -> int: -+ target_width, target_height = self._get_image_size_with_most_features() -+ -+ return self._get_num_image_tokens( -+ image_width=target_width, -+ image_height=target_height, -+ ) -+ -+ def _get_max_video_frames(self, max_tokens: int) -> int: -+ target_width, target_height = self._get_image_size_with_most_features() -+ -+ num_frames = 0 -+ -+ while True: -+ next_num_frames = num_frames + 1 -+ next_max_tokens = self._get_num_video_tokens( -+ image_width=target_width, -+ image_height=target_height, -+ num_frames=next_num_frames, -+ ) -+ -+ if next_max_tokens > max_tokens: -+ break -+ -+ num_frames = next_num_frames -+ -+ return num_frames -+ -+ def _get_dummy_num_frames(self, seq_len: int) -> int: -+ mm_config = self.ctx.get_mm_config() -+ max_images = mm_config.limit_per_prompt.get("image", 1) -+ max_videos = mm_config.limit_per_prompt.get("video", 1) -+ -+ max_image_tokens = self._get_max_image_tokens() * max_images -+ max_total_frames = self._get_max_video_frames(seq_len - -+ max_image_tokens) -+ -+ num_frames = 
max(max_total_frames // max(max_videos, 1), 1) -+ -+ # Temporary workaround for https://github.com/huggingface/transformers/issues/35412 -+ if num_frames > 1 and num_frames % 2 == 1: -+ num_frames += 1 -+ -+ return num_frames -+ -+ def _get_max_video_tokens(self, seq_len: int) -> int: -+ target_width, target_height = self._get_image_size_with_most_features() -+ -+ return self._get_num_video_tokens( -+ image_width=target_width, -+ image_height=target_height, -+ num_frames=self._get_dummy_num_frames(seq_len), -+ ) -+ -+ def get_dummy_processor_inputs( -+ self, -+ seq_len: int, -+ mm_counts: Mapping[str, int], -+ ) -> ProcessorInputs: -+ num_images = mm_counts.get("image", 0) -+ num_videos = mm_counts.get("video", 0) -+ -+ hf_processor = self._get_hf_processor() -+ image_token: str = hf_processor.image_token -+ video_token: str = hf_processor.video_token -+ target_width, target_height = self._get_image_size_with_most_features() -+ -+ mm_data = { -+ "image": -+ self._get_dummy_images(width=target_width, -+ height=target_height, -+ num_images=num_images), -+ "video": -+ self._get_dummy_videos( -+ width=target_width, -+ height=target_height, -+ num_frames=self._get_dummy_num_frames(seq_len), -+ num_videos=num_videos, -+ ) -+ } -+ -+ return ProcessorInputs( -+ prompt_text=image_token * num_images + video_token * num_videos, -+ mm_data=mm_data, -+ ) -+ -+ -+class Qwen2VLMultiModalProcessor(Qwen2VLProcessingMixin, -+ BaseMultiModalProcessor): -+ -+ def _get_profiling_info(self) -> BaseProfilingInfo: -+ return Qwen2VLProfilingInfo(self.ctx) -+ -+ def _get_data_parser(self) -> MultiModalDataParser: -+ return Qwen2MultiModalDataParser() - - def _get_prompt_replacements( - self, - mm_items: MultiModalDataItems, -- hf_inputs: BatchFeature, -- mm_processor_kwargs: Mapping[str, object], -+ hf_processor_mm_kwargs: Mapping[str, Any], -+ out_mm_kwargs: MultiModalKwargs, - ) -> list[PromptReplacement]: -- hf_processor = self._get_hf_processor() -- image_processor = _get_image_processor(hf_processor) -+ hf_processor = self._get_hf_processor(**hf_processor_mm_kwargs) -+ image_processor = self._get_image_processor(**hf_processor_mm_kwargs) - - # NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has - # image_token and video_token registered -@@ -831,7 +973,9 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): - merge_length = image_processor.merge_size**2 - - def get_replacement_qwen2vl(item_idx: int, modality: str): -- grid_thw = hf_inputs[f"{modality}_grid_thw"][item_idx] -+ grid_thw = out_mm_kwargs[f"{modality}_grid_thw"][item_idx] -+ assert isinstance(grid_thw, torch.Tensor) -+ - num_tokens = grid_thw.prod() // merge_length - return placeholder[modality] * num_tokens - -@@ -844,38 +988,36 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): - ) for modality in ("image", "video") - ] - -- def _get_dummy_mm_inputs( -+ def _get_mm_fields_config( - self, -- mm_counts: Mapping[str, int], -- ) -> ProcessorInputs: -- num_images = mm_counts["image"] -- hf_processor = self._get_hf_processor() -- image_token: str = hf_processor.image_token -- image_processor = _get_image_processor(hf_processor) -- -- data = {} -- resized_height, resized_width = smart_resize( -- height=9999999, -- width=9999999, -- factor=image_processor.patch_size * image_processor.merge_size, -- min_pixels=image_processor.min_pixels, -- max_pixels=image_processor.max_pixels, -- ) -+ hf_inputs: BatchFeature, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ ) -> Mapping[str, MultiModalFieldConfig]: -+ image_grid_thw = 
hf_inputs.get("image_grid_thw", torch.empty((0, 3))) -+ image_slice_idxs = [0] + image_grid_thw.prod(-1).cumsum_(0).tolist() -+ image_slices = [ -+ slice(image_slice_idxs[i], image_slice_idxs[i + 1]) -+ for i in range(len(image_grid_thw)) -+ ] - -- dummy_image = Image.new("RGB", (resized_width, resized_height), -- color=0) -- data["image"] = [dummy_image] * num_images -+ video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3))) -+ video_slice_idxs = [0] + video_grid_thw.prod(-1).cumsum_(0).tolist() -+ video_slices = [ -+ slice(video_slice_idxs[i], video_slice_idxs[i + 1]) -+ for i in range(len(video_grid_thw)) -+ ] - -- return ProcessorInputs( -- prompt_text=image_token * num_images, -- mm_data=data, -- mm_processor_kwargs={}, -+ return dict( -+ pixel_values=MultiModalFieldConfig.flat("image", image_slices), -+ image_embeds=MultiModalFieldConfig.flat("image", image_slices), -+ image_grid_thw=MultiModalFieldConfig.batched("image"), -+ pixel_values_videos=MultiModalFieldConfig.flat( -+ "video", video_slices), -+ video_embeds=MultiModalFieldConfig.flat("video", video_slices), -+ video_grid_thw=MultiModalFieldConfig.batched("video"), - ) - - --@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_qwen2_vl_image_tokens) --@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( -- "video", get_max_qwen2_vl_video_tokens) - @MULTIMODAL_REGISTRY.register_processor(Qwen2VLMultiModalProcessor) - class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, - SupportsLoRA, SupportsPP): -@@ -892,15 +1034,33 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, - } - - # LoRA specific attributes -- # TODO Support LoRA for the visual encoder in the future. - supported_lora_modules = [ - "qkv_proj", - "o_proj", - "gate_up_proj", - "down_proj", -+ # vision tower -+ "qkv", -+ "attn.proj", # Distinguish patch_embed.proj -+ "fc1", -+ "fc2", -+ # projector -+ "mlp.0", -+ "mlp.2" - ] - embedding_modules = {} - embedding_padding_modules = [] -+ -+ # BitandBytes specific attributes -+ bitsandbytes_stacked_params_mapping = { -+ # shard_name, weight_name, index -+ "q_proj": ("qkv_proj", 0), -+ "k_proj": ("qkv_proj", 1), -+ "v_proj": ("qkv_proj", 2), -+ "gate_proj": ("gate_up_proj", 0), -+ "up_proj": ("gate_up_proj", 1), -+ } -+ - # To ensure correct weight loading and mapping. - hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={ - "lm_head.": "language_model.lm_head.", -@@ -950,9 +1110,7 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, - return None - return quant_config - -- def _validate_and_reshape_mm_tensor(self, -- mm_input: Union[torch.Tensor, -- List[torch.Tensor]], -+ def _validate_and_reshape_mm_tensor(self, mm_input: object, - name: str) -> torch.Tensor: - if not isinstance(mm_input, (torch.Tensor, list)): - raise ValueError(f"Incorrect type of {name}. " -@@ -962,7 +1120,8 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, - return mm_input - if mm_input.ndim != 3: - raise ValueError(f"{name} should be 2D or batched 3D tensor. 
" -- f"Got ndim: {mm_input.ndim}") -+ f"Got ndim: {mm_input.ndim} " -+ f"(shape={mm_input.shape})") - return torch.concat(list(mm_input)) - else: - return torch.concat(mm_input) -@@ -1042,7 +1201,8 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, - if image_input["type"] == "image_embeds": - return image_input["image_embeds"].type(self.visual.dtype) - -- pixel_values = image_input["pixel_values"].type(self.visual.dtype) -+ # pixel_values = image_input["pixel_values"].type(self.visual.dtype) -+ pixel_values = image_input["pixel_values"].to(torch.float16) - image_embeds = self.visual(pixel_values, - grid_thw=image_input["image_grid_thw"]) - return image_embeds -@@ -1198,3 +1358,12 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, - - loader = AutoWeightsLoader(self) - return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) -+ -+ def get_mm_mapping(self) -> MultiModelKeys: -+ """ -+ Get the module prefix in multimodal models -+ """ -+ return MultiModelKeys.from_string_field( -+ language_model="language_model", -+ connector="visual.", -+ tower_model="visual.merger.") -diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py -index feb33bb37..1b5453b6e 100644 ---- a/vllm/model_executor/models/registry.py -+++ b/vllm/model_executor/models/registry.py -@@ -18,7 +18,6 @@ import cloudpickle - import torch.nn as nn - - from vllm.logger import init_logger --from vllm.platforms import current_platform - - from .interfaces import (has_inner_state, is_attention_free, is_hybrid, - supports_cross_encoding, supports_multimodal, -@@ -96,6 +95,7 @@ _TEXT_GENERATION_MODELS = { - "Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"), - "SolarForCausalLM": ("solar", "SolarForCausalLM"), - "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), -+ "TeleChatForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), - "XverseForCausalLM": ("llama", "LlamaForCausalLM"), - # [Encoder-decoder] - "BartModel": ("bart", "BartForConditionalGeneration"), -@@ -113,6 +113,7 @@ _EMBEDDING_MODELS = { - "Gemma2Model": ("gemma2", "Gemma2ForCausalLM"), - "GlmForCausalLM": ("glm", "GlmForCausalLM"), - "GritLM": ("gritlm", "GritLM"), -+ "InternLM2ForRewardModel": ("internlm2", "InternLM2ForRewardModel"), - "JambaForSequenceClassification": ("jamba", "JambaForSequenceClassification"), # noqa: E501 - "LlamaModel": ("llama", "LlamaForCausalLM"), - **{ -@@ -170,6 +171,7 @@ _MULTIMODAL_MODELS = { - "UltravoxModel": ("ultravox", "UltravoxModel"), - # [Encoder-decoder] - "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"), # noqa: E501 -+ "WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"), # noqa: E501 - } - - _SPECULATIVE_DECODING_MODELS = { -@@ -187,31 +189,6 @@ _VLLM_MODELS = { - **_SPECULATIVE_DECODING_MODELS, - } - --# Models not supported by ROCm. --_ROCM_UNSUPPORTED_MODELS: List[str] = [] -- --# Models partially supported by ROCm. --# Architecture -> Reason. --_ROCM_SWA_REASON = ("Sliding window attention (SWA) is not yet supported in " -- "Triton flash attention. 
For half-precision SWA support, " -- "please use CK flash attention by setting " -- "`VLLM_USE_TRITON_FLASH_ATTN=0`") --_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = { -- "Qwen2ForCausalLM": -- _ROCM_SWA_REASON, -- "MistralForCausalLM": -- _ROCM_SWA_REASON, -- "MixtralForCausalLM": -- _ROCM_SWA_REASON, -- "PaliGemmaForConditionalGeneration": -- ("ROCm flash attention does not yet " -- "fully support 32-bit precision on PaliGemma"), -- "Phi3VForCausalLM": -- ("ROCm Triton flash attention may run into compilation errors due to " -- "excessive use of shared memory. If this happens, disable Triton FA " -- "by setting `VLLM_USE_TRITON_FLASH_ATTN=0`") --} -- - - @dataclass(frozen=True) - class _ModelInfo: -@@ -297,17 +274,8 @@ def _try_load_model_cls( - model_arch: str, - model: _BaseRegisteredModel, - ) -> Optional[Type[nn.Module]]: -- if current_platform.is_rocm(): -- if model_arch in _ROCM_UNSUPPORTED_MODELS: -- raise ValueError(f"Model architecture '{model_arch}' is not " -- "supported by ROCm for now.") -- -- if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS: -- msg = _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch] -- logger.warning( -- "Model architecture '%s' is partially " -- "supported by ROCm: %s", model_arch, msg) -- -+ from vllm.platforms import current_platform -+ current_platform.verify_model_arch(model_arch) - try: - return model.load_model_cls() - except Exception: -diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py -index 6fb9e2cc4..7ea177e94 100644 ---- a/vllm/model_executor/models/siglip.py -+++ b/vllm/model_executor/models/siglip.py -@@ -28,6 +28,8 @@ from vllm.multimodal.utils import (cached_get_tokenizer, - resolve_visual_encoder_outputs) - from vllm.sequence import SequenceData - -+from .vision import VisionEncoderInfo -+ - - def get_siglip_patch_grid_length(*, image_size: int, patch_size: int) -> int: - # Since interpolation is applied, the image size need not be divisible -@@ -156,6 +158,32 @@ def input_processor_for_siglip( - multi_modal_placeholders={"image": ranges}) - - -+class SiglipEncoderInfo(VisionEncoderInfo[SiglipVisionConfig]): -+ -+ def get_num_image_tokens( -+ self, -+ *, -+ image_width: int, -+ image_height: int, -+ ) -> int: -+ return get_siglip_image_feature_size(self.vision_config) -+ -+ def get_max_image_tokens(self) -> int: -+ return get_max_siglip_image_tokens(self.vision_config) -+ -+ def get_image_size(self) -> int: -+ return self.vision_config.image_size -+ -+ def get_patch_size(self) -> int: -+ return self.vision_config.patch_size -+ -+ def get_patch_grid_length(self) -> int: -+ return get_siglip_patch_grid_length( -+ image_size=self.vision_config.image_size, -+ patch_size=self.vision_config.patch_size, -+ ) -+ -+ - # Adapted from https://github.com/huggingface/transformers/blob/v4.43.3/src/transformers/models/siglip/modeling_siglip.py#L249 # noqa - class SiglipVisionEmbeddings(nn.Module): - -diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py -index caae0b65d..a7cf65a0e 100644 ---- a/vllm/model_executor/models/solar.py -+++ b/vllm/model_executor/models/solar.py -@@ -565,8 +565,9 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP): - # which is consistent with the practice of setting - # scaling_factor = tensor_amax / FPtype_max - scaling_factor *= 2 -- if hasattr(layer_self_attn, "kv_scale"): -- layer_self_attn.attn._kv_scale = scaling_factor -+ if hasattr(layer_self_attn.attn, "_k_scale"): -+ layer_self_attn.attn._k_scale = scaling_factor -+ 
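# NOTE (illustrative annotation, not a patch line): the reworked solar.py hunk
# above probes for the newer per-tensor attribute via
# hasattr(layer_self_attn.attn, "_k_scale") and writes the same scaling factor
# to both `_k_scale` and, on the next line, `_v_scale`, replacing the single
# `kv_scale`/`_kv_scale` attribute that the removed lines relied on.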
layer_self_attn.attn._v_scale = scaling_factor - else: - raise RuntimeError("Self attention has no KV cache scaling " - "factor attribute!") -diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py -index 28c37bb96..35683c70d 100644 ---- a/vllm/model_executor/models/telechat2.py -+++ b/vllm/model_executor/models/telechat2.py -@@ -31,19 +31,6 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, - - class TeleChat2Model(LlamaModel): - -- hf_to_vllm_mapper = WeightsMapper( -- orig_to_new_prefix={ -- "transformer.": "model.", -- }, -- orig_to_new_substr={ -- ".h.": ".layers.", -- ".self_attention.": ".self_attn.", -- ".word_embeddings.": ".embed_tokens.", -- ".dense.": ".o_proj.", -- ".ln_f.": ".norm.", -- }, -- ) -- - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - # 1. Initialize the LlamaModel with bias - vllm_config.model_config.hf_config.bias = True -@@ -55,9 +42,9 @@ class TeleChat2Model(LlamaModel): - for layer in self.layers: - if not isinstance(layer, PPMissingLayer): - layer.self_attn.qkv_proj.bias = None -- layer.self_attn.qkv_proj.skip_bias_add = True -+ #layer.self_attn.qkv_proj.skip_bias_add = True - layer.mlp.gate_up_proj.bias = None -- layer.mlp.gate_up_proj.skip_bias_add = True -+ #layer.mlp.gate_up_proj.skip_bias_add = True - - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: -@@ -118,6 +105,19 @@ class TeleChat2Model(LlamaModel): - - class TeleChat2ForCausalLM(LlamaForCausalLM): - -+ hf_to_vllm_mapper = WeightsMapper( -+ orig_to_new_prefix={ -+ "transformer.": "model.", -+ }, -+ orig_to_new_substr={ -+ ".h.": ".layers.", -+ ".self_attention.": ".self_attn.", -+ ".word_embeddings.": ".embed_tokens.", -+ ".dense.": ".o_proj.", -+ ".ln_f.": ".norm.", -+ }, -+ ) -+ - def _init_model(self, vllm_config: VllmConfig, prefix: str = ""): - return TeleChat2Model(vllm_config=vllm_config, prefix=prefix) - -diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py -index 509ad9e58..ba823acec 100644 ---- a/vllm/model_executor/models/ultravox.py -+++ b/vllm/model_executor/models/ultravox.py -@@ -2,11 +2,10 @@ - """PyTorch Ultravox model.""" - - import math --from functools import cached_property, lru_cache -+from functools import cached_property - from typing import (Any, Iterable, List, Literal, Mapping, Optional, Set, - Tuple, TypedDict, Union) - --import numpy as np - import torch - import torch.utils.checkpoint - from torch import nn -@@ -17,19 +16,21 @@ from transformers.models.whisper.modeling_whisper import WhisperEncoder - - from vllm.attention import AttentionMetadata - from vllm.config import VllmConfig --from vllm.inputs import InputContext - from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn - from vllm.model_executor.layers.layernorm import RMSNorm - from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler - from vllm.model_executor.model_loader.loader import DefaultModelLoader - from vllm.model_executor.sampling_metadata import SamplingMetadata --from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors -+from vllm.multimodal import MULTIMODAL_REGISTRY -+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, -+ NestedTensors) -+from vllm.multimodal.parse import MultiModalDataParser - from vllm.multimodal.processing import (BaseMultiModalProcessor, -- MultiModalDataItems, ProcessorInputs, -+ MultiModalDataItems, ProcessingMixin, - PromptReplacement) -+from 
vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs - from vllm.sequence import IntermediateTensors - from vllm.transformers_utils.configs.ultravox import UltravoxConfig --from vllm.utils import is_list_of - - from .interfaces import SupportsMultiModal, SupportsPP - from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, -@@ -55,77 +56,118 @@ UltravoxAudioInputs = Union[UltravoxAudioFeatureInputs, - UltravoxAudioEmbeddingInputs] - - --@lru_cache --def cached_feature_extractor(model_id: str) -> WhisperFeatureExtractor: -- return WhisperFeatureExtractor.from_pretrained(model_id) -+class UltravoxProcessingMixin(ProcessingMixin): - -+ def _get_hf_processor( -+ self, -+ *, -+ # Ignored in initialization -+ sampling_rate: Optional[int] = None, -+ ) -> ProcessorMixin: -+ return self.ctx.get_hf_processor() - --def whisper_feature_extractor(ctx: InputContext) -> WhisperFeatureExtractor: -- hf_config = ctx.get_hf_config(UltravoxConfig) -- return cached_feature_extractor(hf_config.audio_model_id) -+ def _get_feature_extractor( -+ self, -+ *, -+ # Ignored in initialization -+ sampling_rate: Optional[int] = None, -+ ) -> WhisperFeatureExtractor: -+ hf_processor = self._get_hf_processor(sampling_rate=sampling_rate) -+ audio_processor = hf_processor.audio_processor # type: ignore -+ feature_extractor = audio_processor.feature_extractor # type: ignore -+ assert isinstance(feature_extractor, WhisperFeatureExtractor) -+ return feature_extractor - - --def get_ultravox_max_audio_tokens(ctx: InputContext): -- feature_extractor = whisper_feature_extractor(ctx) -- return math.ceil(feature_extractor.chunk_length * _AUDIO_TOKENS_PER_SECOND) -+class UltravoxProfilingInfo(UltravoxProcessingMixin, BaseProfilingInfo): - -+ def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: -+ return {"audio": None} - --class UltravoxMultiModalProcessor(BaseMultiModalProcessor): -+ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: -+ feature_extractor = self._get_feature_extractor() -+ max_audio_tokens = math.ceil(feature_extractor.chunk_length * -+ _AUDIO_TOKENS_PER_SECOND) - -- def _get_feature_extractor(self) -> WhisperFeatureExtractor: -- hf_processor = self._get_hf_processor() -- return hf_processor.audio_processor.feature_extractor # type: ignore -+ return {"audio": max_audio_tokens} - -- def _get_processor_data( -+ def get_dummy_processor_inputs( - self, -- mm_items: MultiModalDataItems, -- ) -> tuple[dict[str, Any], dict[str, Any]]: -- # resample audio to the model's sampling rate -+ seq_len: int, -+ mm_counts: Mapping[str, int], -+ ) -> ProcessorInputs: - feature_extractor = self._get_feature_extractor() -- mm_items.resample_audios(feature_extractor.sampling_rate) - -- return super()._get_processor_data(mm_items) -+ sampling_rate = feature_extractor.sampling_rate -+ audio_len = feature_extractor.chunk_length * sampling_rate -+ num_audios = mm_counts.get("audio", 0) -+ -+ mm_data = { -+ "audio": -+ self._get_dummy_audios(length=audio_len, num_audios=num_audios) -+ } -+ -+ return ProcessorInputs( -+ prompt_text="<|audio|>" * num_audios, -+ mm_data=mm_data, -+ ) -+ -+ -+class UltravoxMultiModalProcessor(UltravoxProcessingMixin, -+ BaseMultiModalProcessor): -+ -+ def _get_profiling_info(self) -> BaseProfilingInfo: -+ return UltravoxProfilingInfo(self.ctx) -+ -+ def _get_data_parser(self) -> MultiModalDataParser: -+ feature_extractor = self._get_feature_extractor() -+ return MultiModalDataParser(target_sr=feature_extractor.sampling_rate) - - def _call_hf_processor( - 
self, -- hf_processor: ProcessorMixin, - prompt: str, -- processor_data: Mapping[str, object], -- mm_processor_kwargs: Mapping[str, object], -+ mm_data: Mapping[str, object], -+ mm_kwargs: Mapping[str, object], - ) -> BatchFeature: -- processor_data = dict(processor_data) -- audios = processor_data.pop("audios", []) -+ # Text-only input not supported in composite processor -+ if not mm_data: -+ tokenizer = self._get_tokenizer() -+ -+ prompt_ids = tokenizer.encode( -+ prompt, -+ add_special_tokens=False, # type: ignore -+ ) -+ return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") -+ -+ mm_data = dict(mm_data) -+ audios = mm_data.pop("audios", []) -+ assert isinstance(audios, list) - - if not audios: - return super()._call_hf_processor( -- hf_processor, - prompt=prompt, -- processor_data=processor_data, -- mm_processor_kwargs=mm_processor_kwargs, -+ mm_data=mm_data, -+ mm_kwargs=mm_kwargs, - ) - - feature_extractor = self._get_feature_extractor() -- mm_processor_kwargs = dict( -- **mm_processor_kwargs, -+ mm_kwargs = dict( -+ **mm_kwargs, - sampling_rate=feature_extractor.sampling_rate, - ) - -- # Already resampled by _get_processor_data -- assert is_list_of(audios, np.ndarray) -- - # Ultravox processor doesn't support multiple inputs, - # therefore we need to input text and audio one by one - audio_features, audio_token_len = [], [] - shared_outputs = {} - for audio in audios: - # NOTE: Ultravox processor accepts "audio" instead of "audios" -- item_processor_data = dict(**processor_data, audio=audio) -+ item_processor_data = dict(**mm_data, audio=audio) - - item_outputs = super()._call_hf_processor( -- hf_processor, - prompt=prompt, -- processor_data=item_processor_data, -- mm_processor_kwargs=mm_processor_kwargs, -+ mm_data=item_processor_data, -+ mm_kwargs=mm_kwargs, - ) - - audio_features.append(item_outputs.pop("audio_values")[0]) -@@ -139,17 +181,28 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor): - ) - return BatchFeature(combined_outputs) - -+ def _get_mm_fields_config( -+ self, -+ hf_inputs: BatchFeature, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ ) -> Mapping[str, MultiModalFieldConfig]: -+ return dict( -+ audio_features=MultiModalFieldConfig.batched("audio"), -+ audio_token_len=MultiModalFieldConfig.batched("audio"), -+ audio_embeds=MultiModalFieldConfig.batched("audio"), -+ ) -+ - def _get_prompt_replacements( - self, - mm_items: MultiModalDataItems, -- hf_inputs: BatchFeature, -- mm_processor_kwargs: Mapping[str, object], -+ hf_processor_mm_kwargs: Mapping[str, Any], -+ out_mm_kwargs: MultiModalKwargs, - ) -> list[PromptReplacement]: -- hf_processor = self._get_hf_processor() -+ hf_processor = self._get_hf_processor(**hf_processor_mm_kwargs) - placeholder = hf_processor.audio_token_replacement # type: ignore - - def get_replacement_ultravox(item_idx: int): -- audio_token_len = hf_inputs["audio_token_len"][item_idx] -+ audio_token_len = out_mm_kwargs["audio_token_len"][item_idx] - return placeholder * audio_token_len - - return [ -@@ -160,24 +213,6 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor): - ) - ] - -- def _get_dummy_mm_inputs( -- self, -- mm_counts: Mapping[str, int], -- ) -> ProcessorInputs: -- feature_extractor = self._get_feature_extractor() -- sampling_rate = feature_extractor.sampling_rate -- audio_len = feature_extractor.chunk_length * sampling_rate -- -- audio_count = mm_counts["audio"] -- audio = np.zeros(audio_len) -- data = {"audio": [audio] * audio_count} -- -- return ProcessorInputs( -- prompt_text="<|audio|>" 
* audio_count, -- mm_data=data, -- mm_processor_kwargs={}, -- ) -- - - class StackAudioFrames(nn.Module): - """ -@@ -297,8 +332,6 @@ class ModifiedWhisperEncoder(WhisperEncoder): - return hidden_states - - --@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( -- "audio", get_ultravox_max_audio_tokens) - @MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor) - class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP): - -diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py -index 269b66806..31017f16d 100644 ---- a/vllm/model_executor/models/utils.py -+++ b/vllm/model_executor/models/utils.py -@@ -373,7 +373,7 @@ def embed_multimodal( - input_ids: torch.Tensor, - multimodal_token_id: int, - get_text_embeds: Callable[[torch.Tensor], torch.Tensor], -- multimodal_embeds: Union[torch.Tensor, List[torch.Tensor]], -+ multimodal_embeds: NestedTensors, - ) -> torch.Tensor: - """ - Embed token IDs and multimodal inputs and combine their embeddings. -diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py -new file mode 100644 -index 000000000..8516c9f70 ---- /dev/null -+++ b/vllm/model_executor/models/vision.py -@@ -0,0 +1,62 @@ -+from abc import ABC, abstractmethod -+from typing import Final, Generic, Protocol, TypeVar -+ -+from transformers import PretrainedConfig -+ -+_C = TypeVar("_C", bound=PretrainedConfig) -+ -+ -+class VisionEncoderInfo(ABC, Generic[_C]): -+ -+ def __init__(self, vision_config: _C) -> None: -+ super().__init__() -+ -+ self.vision_config = vision_config -+ -+ @abstractmethod -+ def get_num_image_tokens( -+ self, -+ *, -+ image_width: int, -+ image_height: int, -+ ) -> int: -+ raise NotImplementedError -+ -+ @abstractmethod -+ def get_max_image_tokens(self) -> int: -+ raise NotImplementedError -+ -+ @abstractmethod -+ def get_image_size(self) -> int: -+ raise NotImplementedError -+ -+ @abstractmethod -+ def get_patch_size(self) -> int: -+ raise NotImplementedError -+ -+ @abstractmethod -+ def get_patch_grid_length(self) -> int: -+ raise NotImplementedError -+ -+ -+class VisionLanguageConfig(Protocol): -+ vision_config: Final[PretrainedConfig] -+ -+ -+def get_vision_encoder_info( -+ hf_config: VisionLanguageConfig) -> VisionEncoderInfo: -+ # Avoid circular imports -+ from .clip import CLIPEncoderInfo, CLIPVisionConfig -+ from .pixtral import PixtralHFEncoderInfo, PixtralVisionConfig -+ from .siglip import SiglipEncoderInfo, SiglipVisionConfig -+ -+ vision_config = hf_config.vision_config -+ if isinstance(vision_config, CLIPVisionConfig): -+ return CLIPEncoderInfo(vision_config) -+ if isinstance(vision_config, PixtralVisionConfig): -+ return PixtralHFEncoderInfo(vision_config) -+ if isinstance(vision_config, SiglipVisionConfig): -+ return SiglipEncoderInfo(vision_config) -+ -+ msg = f"Unsupported vision config: {type(vision_config)}" -+ raise NotImplementedError(msg) -diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py -new file mode 100644 -index 000000000..cb54b4c3b ---- /dev/null -+++ b/vllm/model_executor/models/whisper.py -@@ -0,0 +1,737 @@ -+import math -+from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, -+ Union) -+ -+import numpy as np -+import torch -+from torch import nn -+from transformers.models.whisper.modeling_whisper import sinusoids -+ -+from vllm.attention import Attention, AttentionMetadata, AttentionType -+from vllm.config import CacheConfig, VllmConfig -+from vllm.distributed import 
get_tensor_model_parallel_world_size -+from vllm.inputs import INPUT_REGISTRY, DummyData, InputContext -+from vllm.logger import init_logger -+from vllm.model_executor.layers.activation import get_act_fn -+from vllm.model_executor.layers.linear import (ColumnParallelLinear, -+ QKVParallelLinear, -+ RowParallelLinear) -+from vllm.model_executor.layers.logits_processor import LogitsProcessor -+from vllm.model_executor.layers.quantization.base_config import ( -+ QuantizationConfig) -+from vllm.model_executor.layers.sampler import Sampler, SamplerOutput -+from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead -+from vllm.model_executor.model_loader.weight_utils import default_weight_loader -+from vllm.model_executor.sampling_metadata import SamplingMetadata -+from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, -+ NestedTensors) -+from vllm.multimodal.audio import resample_audio -+from vllm.sequence import SequenceData -+from vllm.transformers_utils.processor import cached_get_processor -+ -+from .interfaces import SupportsMultiModal -+from .utils import AutoWeightsLoader, WeightsMapper, make_layers -+ -+logger = init_logger(__name__) -+ -+ -+class WhisperAudioInputs(TypedDict): -+ input_features: NestedTensors -+ """Shape: `(batch_size, 128, M)`""" -+ -+ -+class WhisperPositionalEmbedding(nn.Embedding): -+ -+ def __init__(self, -+ num_positions: int, -+ embedding_dim: int, -+ padding_idx: Optional[int] = None): -+ super().__init__(num_positions, embedding_dim) -+ -+ def forward(self, position_ids): -+ return self.weight[position_ids] -+ -+ -+class WhisperAttention(nn.Module): -+ -+ def __init__( -+ self, -+ embed_dim: int, -+ num_heads: int, -+ bias: bool = True, -+ attn_type: AttentionType = AttentionType.DECODER, -+ cache_config: Optional[CacheConfig] = None, -+ quant_config: Optional[QuantizationConfig] = None, -+ prefix: str = "", -+ ): -+ super().__init__() -+ self.embed_dim = embed_dim -+ tp_size = get_tensor_model_parallel_world_size() -+ self.total_num_heads = num_heads -+ assert self.total_num_heads % tp_size == 0 -+ self.num_heads = self.total_num_heads // tp_size -+ if self.total_num_heads >= tp_size: -+ # Number of heads is greater than TP size, so we partition -+ # the KV heads across multiple tensor parallel GPUs. -+ assert self.total_num_heads % tp_size == 0 -+ else: -+ # Number of heads is less than TP size, so we replicate -+ # the KV heads across multiple tensor parallel GPUs. 
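# NOTE (illustrative annotation, not a patch line): a worked example for the
# replicate branch continued below -- with total_num_heads=4 and tp_size=8 the
# assert (8 % 4 == 0) passes and num_kv_heads = max(1, 4 // 8) = 1, i.e. each
# rank keeps a single KV head and the KV heads end up replicated across the
# tensor-parallel ranks instead of being sharded, as the comment above says.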
-+ assert tp_size % self.total_num_heads == 0 -+ self.num_kv_heads = max(1, self.total_num_heads // tp_size) -+ self.head_dim = self.embed_dim // self.total_num_heads -+ self.q_size = self.num_heads * self.head_dim -+ self.kv_size = self.num_kv_heads * self.head_dim -+ self.attn_type = attn_type -+ -+ if (self.head_dim * num_heads) != self.embed_dim: -+ raise ValueError( -+ f"embed_dim must be divisible by num_heads (got `embed_dim`: " -+ f"{self.embed_dim} and `num_heads`: {num_heads}).") -+ self.scaling = self.head_dim**-0.5 -+ -+ self._init_qkv(embed_dim, bias, quant_config, prefix=prefix) -+ self.out_proj = RowParallelLinear( -+ input_size=embed_dim, -+ output_size=embed_dim, -+ bias=bias, -+ quant_config=quant_config, -+ prefix=f"{prefix}.out_proj", -+ ) -+ self.attn = Attention( -+ self.num_heads, -+ self.head_dim, -+ self.scaling, -+ num_kv_heads=self.num_kv_heads, -+ cache_config=cache_config, -+ quant_config=quant_config, -+ prefix=f"{prefix}.attn", -+ ) -+ -+ def _init_qkv( -+ self, -+ embed_dim: int, -+ bias: bool = True, -+ quant_config: Optional[QuantizationConfig] = None, -+ prefix: str = "", -+ ) -> None: -+ self.qkv_proj = QKVParallelLinear( -+ hidden_size=embed_dim, -+ head_size=self.head_dim, -+ total_num_heads=self.total_num_heads, -+ total_num_kv_heads=self.total_num_heads, -+ bias=bias, -+ quant_config=quant_config, -+ prefix=f"{prefix}.qkv_proj", -+ ) -+ -+ def forward( -+ self, -+ hidden_states: torch.Tensor, -+ kv_cache: torch.Tensor, -+ attn_metadata: AttentionMetadata, -+ ): -+ qkv, _ = self.qkv_proj(hidden_states) -+ q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) -+ -+ attn_output = self.attn(q, -+ k, -+ v, -+ kv_cache, -+ attn_metadata, -+ attn_type=self.attn_type) -+ -+ output, _ = self.out_proj(attn_output) -+ -+ return output -+ -+ -+class WhisperCrossAttention(WhisperAttention): -+ -+ def __init__( -+ self, -+ embed_dim: int, -+ num_heads: int, -+ bias: bool = True, -+ cache_config: Optional[CacheConfig] = None, -+ quant_config: Optional[QuantizationConfig] = None, -+ prefix: str = "", -+ ): -+ super().__init__( -+ embed_dim=embed_dim, -+ num_heads=num_heads, -+ bias=bias, -+ cache_config=cache_config, -+ quant_config=quant_config, -+ prefix=prefix, -+ ) -+ -+ def _init_qkv( -+ self, -+ embed_dim: int, -+ bias: bool = True, -+ quant_config: Optional[QuantizationConfig] = None, -+ prefix: str = "", -+ ) -> None: -+ self.q_proj = ColumnParallelLinear( -+ input_size=embed_dim, -+ output_size=embed_dim, -+ bias=bias, -+ quant_config=quant_config, -+ prefix=f"{prefix}.q_proj", -+ ) -+ self.kv_proj = QKVParallelLinear( -+ hidden_size=embed_dim, -+ head_size=self.head_dim, -+ total_num_heads=0, -+ total_num_kv_heads=self.total_num_heads, -+ bias=bias, -+ quant_config=quant_config, -+ prefix=f"{prefix}.kv_proj", -+ ) ++ embeddings_indices.shape[-1], ++ ] ++ if long_lora_indices_len is not None: ++ indices_len.append(long_lora_indices_len) ++ else: ++ # If long_lora doesn't exist,append None ++ indices_len.append(None) + -+ def forward( -+ self, -+ hidden_states: torch.Tensor, -+ encoder_hidden_states: Optional[torch.Tensor], -+ kv_cache: torch.Tensor, -+ attn_metadata: AttentionMetadata, -+ ): -+ q, _ = self.q_proj(hidden_states) ++ return ( ++ base_indices, ++ sampler_indices, ++ sampler_indices_padded, ++ embeddings_indices, ++ long_lora_indices, ++ indices_len, ++ ) + -+ # Encoder hidden states are only computed once during prefill phase. -+ # Afterwards, the keys and values should be available in the kv-cache. 
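# NOTE (illustrative annotation, not a patch line): concretely, on the prefill
# step the caller passes the encoder output, so `kv_proj` projects it into K/V
# once and the ENCODER_DECODER attention op writes them into the
# cross-attention KV cache; on later decode steps `encoder_hidden_states` is
# None, `k = v = None` below, and the attention op reads the cached K/V
# instead of recomputing them.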
-+ if encoder_hidden_states is not None: -+ kv, _ = self.kv_proj(encoder_hidden_states) -+ k, v = kv.split([self.kv_size, self.kv_size], dim=-1) -+ else: -+ k = v = None + -+ attn_output = self.attn(q, -+ k, -+ v, -+ kv_cache, -+ attn_metadata, -+ attn_type=AttentionType.ENCODER_DECODER) ++class PunicaWrapper: ++ """ ++ PunicaWrapper is designed to manage and provide metadata for the punica ++ kernel. The main function is to maintain the state information for ++ Multi-LoRA, and to provide the interface for the punica kernel. ++ """ + -+ output, _ = self.out_proj(attn_output) ++ def __init__(self, max_num_batched_tokens: int, max_batches: int, ++ device: torch.device): ++ self.device = device ++ self._token_lora_indices = torch.empty(max_num_batched_tokens, ++ dtype=torch.long, ++ device=device) ++ self._sampler_indices = torch.empty(max_num_batched_tokens, ++ dtype=torch.long, ++ device=device) ++ self._sampler_indices_padded = torch.empty(max_num_batched_tokens, ++ dtype=torch.long, ++ device=device) ++ self._embeddings_indices = torch.empty(2, ++ max_num_batched_tokens, ++ dtype=torch.long, ++ device=device) ++ self._long_lora_indices = torch.empty(max_num_batched_tokens, ++ dtype=torch.long, ++ device=device) + -+ return output ++ # 5 is the number of indicies tensors. ++ # base_indices, sampler_indices, sampler_indices_padded, ++ # embeddings_indices,long_lora_indices ++ self.indices_len: List[Optional[int]] = [None] * 5 ++ # these attributes are the information required for sgmv kernel ++ self._seq_start_locs = torch.empty(max_batches, ++ dtype=torch.long, ++ device=device) ++ self._seq_lengths = torch.empty(max_batches, ++ dtype=torch.long, ++ device=device) ++ self._lora_indices_per_batch = torch.empty(max_batches, ++ dtype=torch.long, ++ device=device) ++ self.max_length: int = 0 ++ self.batch_size: int = -1 ++ self.is_prefill = False ++ self.no_lora = False + ++ def update_metadata( ++ self, ++ mapping: "LoRAMapping", ++ lora_index_to_id: List[Optional[int]], ++ max_loras: int, ++ vocab_size: int, ++ extra_vocab_size: int, ++ long_lora_context: Optional["LongContextLoRAContext"] = None, ++ ): + -+class WhisperMLP(nn.Module): ++ self._update_base_metadata(mapping, lora_index_to_id, max_loras, ++ vocab_size, extra_vocab_size, ++ long_lora_context) ++ if mapping.is_prefill: ++ # Update metadata required for prefill-related operators. 
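# NOTE (illustrative annotation, not a patch line): the prefill path below
# feeds the sgmv_* kernels, which expect tokens grouped into contiguous runs
# that share one adapter (start offsets, run lengths, one LoRA id per run),
# while the decode path keeps per-token indices for the bgmv_* kernels.  As an
# illustration, token_lora_indices = [0, 0, 0, 1, 1, -1, -1] would give run
# starts [0, 3, 5], lengths [3, 2, 2], adapter ids [0, 1, -1] (-1 meaning "no
# LoRA"), batch_size 3 and max_length 3 -- the values later exposed through
# the `prefill_metadata` property.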
++ self._update_prefill_metada(self.token_lora_indices) ++ self.is_prefill = True ++ else: ++ self.is_prefill = False + -+ def __init__( ++ def _update_base_metadata( + self, -+ embed_dim: int, -+ ffn_dim: int, -+ act_fn: str, -+ quant_config: Optional[QuantizationConfig] = None, -+ prefix: str = "", ++ mapping: "LoRAMapping", ++ lora_index_to_id: List[Optional[int]], ++ max_loras: int, ++ vocab_size: int, ++ extra_vocab_size: int, ++ long_lora_context: Optional["LongContextLoRAContext"] = None, + ): -+ super().__init__() -+ -+ self.activation_fn = get_act_fn(act_fn) -+ self.fc1 = ColumnParallelLinear( -+ input_size=embed_dim, -+ output_size=ffn_dim, -+ quant_config=quant_config, -+ prefix=f"{prefix}.fc1", -+ ) -+ self.fc2 = RowParallelLinear( -+ input_size=ffn_dim, -+ output_size=embed_dim, -+ quant_config=quant_config, -+ prefix=f"{prefix}.fc2", ++ ( ++ base_indices, ++ sampler_indices, ++ sampler_indices_padded, ++ embeddings_indices, ++ long_lora_offsets_tensor, ++ indices_len, ++ ) = convert_mapping( ++ mapping, ++ lora_index_to_id, ++ max_loras, ++ vocab_size, ++ extra_vocab_size, ++ long_lora_context, ++ self.device, + ) ++ self._token_lora_indices[:base_indices.shape[0]].copy_(base_indices) ++ self._sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices) ++ self._sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_( ++ sampler_indices_padded) ++ self._embeddings_indices[:embeddings_indices. ++ shape[0], :embeddings_indices.shape[1]].copy_( ++ embeddings_indices) ++ if long_lora_offsets_tensor is not None: ++ self._long_lora_indices[:long_lora_offsets_tensor.shape[0]].copy_( ++ long_lora_offsets_tensor) ++ else: ++ self._long_lora_indices.zero_() + -+ def forward(self, hidden_states: torch.Tensor): -+ hidden_states, _ = self.fc1(hidden_states) -+ hidden_states = self.activation_fn(hidden_states) -+ hidden_states, _ = self.fc2(hidden_states) -+ return hidden_states -+ -+ -+class WhisperEncoderLayer(nn.Module): ++ self.indices_len[:] = indices_len + -+ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): -+ super().__init__() -+ config = vllm_config.model_config.hf_config -+ cache_config = vllm_config.cache_config -+ quant_config = vllm_config.quant_config ++ def _update_prefill_metada(self, token_lora_tensor: torch.Tensor) -> None: + -+ self.embed_dim = config.d_model -+ self.self_attn = WhisperAttention( -+ embed_dim=self.embed_dim, -+ num_heads=config.encoder_attention_heads, -+ attn_type=AttentionType.ENCODER, -+ cache_config=cache_config, -+ quant_config=quant_config, -+ prefix=f"{prefix}.self_attn", -+ ) -+ self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) -+ self.mlp = WhisperMLP( -+ embed_dim=config.d_model, -+ ffn_dim=config.encoder_ffn_dim, -+ act_fn=config.activation_function, -+ quant_config=quant_config, -+ prefix=f"{prefix}.mlp", -+ ) -+ self.final_layer_norm = nn.LayerNorm(self.embed_dim) ++ (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor, ++ batch_size, max_length, no_lora) = compute_meta(token_lora_tensor) + -+ def forward( -+ self, -+ hidden_states: torch.Tensor, -+ kv_cache: torch.Tensor, -+ attn_metadata: AttentionMetadata, -+ ): -+ residual = hidden_states -+ hidden_states = self.self_attn_layer_norm(hidden_states) -+ hidden_states = self.self_attn( -+ hidden_states=hidden_states, -+ kv_cache=kv_cache, -+ attn_metadata=attn_metadata, -+ ) -+ hidden_states = residual + hidden_states -+ residual = hidden_states -+ hidden_states = self.final_layer_norm(hidden_states) -+ hidden_states = self.mlp(hidden_states) -+ 
hidden_states = residual + hidden_states ++ self._seq_start_locs[:b_seq_start_tensor.shape[0]].copy_( ++ b_seq_start_tensor) ++ self._seq_lengths[:seq_length_tensor.shape[0]].copy_(seq_length_tensor) ++ self._lora_indices_per_batch[:lora_indices_tensor.shape[0]].copy_( ++ lora_indices_tensor) ++ self.batch_size = batch_size ++ self.max_length = max_length ++ self.no_lora = no_lora + -+ if hidden_states.isinf().any() or hidden_states.isnan().any(): -+ clamp_value = torch.finfo(hidden_states.dtype).max - 1000 -+ hidden_states = torch.clamp(hidden_states, -+ min=-clamp_value, -+ max=clamp_value) ++ @property ++ def prefill_metadata( ++ self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int]: ++ """ ++ This property provides a convenient way to access the necessary ++ metadata for prefill-related kernel computations. ++ 1. seq_start_locs: Tensor of sequence start positions ++ 2. seq_lengths: Tensor of sequence lengths ++ 3. lora_indices_per_batch: Tensor of lora indices, and an index of ++ -1 means no lora should be applied. ++ 4. batch_size: batch size after clustering identical lora indices ++ 5. max_length: The maximum sequence length in the batch ++ """ ++ return (self._seq_start_locs[:self.batch_size], ++ self._seq_lengths[:self.batch_size], ++ self._lora_indices_per_batch[:self.batch_size], ++ self.batch_size, self.max_length) + -+ return hidden_states ++ @property ++ def token_lora_indices(self) -> torch.Tensor: ++ """ ++ This property provides the lora indices corresponding to each token ++ in the batch. An index of -1 means no lora should be applied. ++ """ ++ token_lora_len = self.indices_len[0] ++ return self._token_lora_indices[:token_lora_len] + ++ @property ++ def sampler_indices(self) -> torch.Tensor: ++ """ ++ This property is used to access the lora indices specifically for ++ LogitsProcessorWithLoRA ++ """ ++ sampler_indices_len = self.indices_len[1] ++ return self._sampler_indices[:sampler_indices_len] + -+class WhisperDecoderLayer(nn.Module): ++ @property ++ def sampler_indices_padded(self) -> torch.Tensor: ++ """ ++ This property provides access to padded sampler indices ++ """ ++ indices_padded_len = self.indices_len[2] ++ return self._sampler_indices_padded[:indices_padded_len] + -+ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): -+ super().__init__() -+ config = vllm_config.model_config.hf_config -+ cache_config = vllm_config.cache_config -+ quant_config = vllm_config.quant_config ++ @property ++ def embeddings_indices(self) -> torch.Tensor: ++ """ ++ This property provides access to the indices used for lora embeddings, ++ specifically for VocabParallelEmbeddingWithLoRA ++ """ ++ embeddings_indices_len = self.indices_len[3] ++ return self._embeddings_indices[:, :embeddings_indices_len] + -+ self.self_attn = WhisperAttention( -+ embed_dim=config.d_model, -+ num_heads=config.decoder_attention_heads, -+ attn_type=AttentionType.DECODER, -+ cache_config=cache_config, -+ quant_config=quant_config, -+ prefix=f"{prefix}.self_attn", -+ ) -+ self.self_attn_layer_norm = nn.LayerNorm(config.d_model) -+ self.encoder_attn = WhisperCrossAttention( -+ embed_dim=config.d_model, -+ num_heads=config.decoder_attention_heads, -+ cache_config=cache_config, -+ quant_config=quant_config, -+ prefix=f"{prefix}.encoder_attn", -+ ) -+ self.encoder_attn_layer_norm = nn.LayerNorm(config.d_model) -+ self.mlp = WhisperMLP( -+ embed_dim=config.d_model, -+ ffn_dim=config.decoder_ffn_dim, -+ act_fn=config.activation_function, -+ quant_config=quant_config, -+ 
prefix=f"{prefix}.mlp", -+ ) -+ self.final_layer_norm = nn.LayerNorm(config.d_model) ++ @property ++ def long_lora_indices(self) -> torch.Tensor: ++ """ ++ This property provides access to the indices used for long context ++ lora, specifically for LinearScalingRotaryEmbeddingWithLora ++ """ ++ long_lora_len = self.indices_len[4] ++ return self._long_lora_indices[:long_lora_len] + -+ def forward( ++ def shrink_prefill( + self, -+ hidden_states: torch.Tensor, -+ encoder_hidden_states: Optional[torch.Tensor], -+ kv_cache: torch.Tensor, -+ attn_metadata: AttentionMetadata, ++ y: torch.Tensor, ++ x: torch.Tensor, ++ w_t_all: torch.Tensor, ++ scale: float, + ): -+ residual = hidden_states -+ hidden_states = self.self_attn_layer_norm(hidden_states) -+ hidden_states = self.self_attn(hidden_states=hidden_states, -+ kv_cache=kv_cache, -+ attn_metadata=attn_metadata) -+ hidden_states = residual + hidden_states -+ -+ residual = hidden_states -+ hidden_states = self.encoder_attn_layer_norm(hidden_states) -+ hidden_states = self.encoder_attn( -+ hidden_states=hidden_states, -+ encoder_hidden_states=encoder_hidden_states, -+ kv_cache=kv_cache, -+ attn_metadata=attn_metadata, ++ #No LoRA request, so return directly ++ if self.no_lora: ++ return ++ sgmv_shrink( ++ x, ++ w_t_all, ++ y, ++ *self.prefill_metadata, ++ scale, + ) -+ hidden_states = residual + hidden_states -+ -+ residual = hidden_states -+ hidden_states = self.final_layer_norm(hidden_states) -+ hidden_states = self.mlp(hidden_states) -+ hidden_states = residual + hidden_states -+ -+ return hidden_states + ++ def shrink_decode( ++ self, ++ y: torch.Tensor, ++ x: torch.Tensor, ++ w_t_all: torch.Tensor, ++ scale: float, ++ ): ++ bgmv_shrink(x, w_t_all, y, self.token_lora_indices, scale) + -+class WhisperEncoder(nn.Module): -+ -+ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): -+ super().__init__() -+ config = vllm_config.model_config.hf_config -+ embed_dim = config.d_model -+ self.num_mel_bins = config.num_mel_bins -+ self.padding_idx = config.pad_token_id -+ self.max_source_positions = config.max_source_positions -+ self.embed_scale = (math.sqrt(embed_dim) -+ if config.scale_embedding else 1.0) -+ -+ self.conv1 = nn.Conv1d(self.num_mel_bins, -+ embed_dim, -+ kernel_size=3, -+ padding=1) -+ self.conv2 = nn.Conv1d(embed_dim, -+ embed_dim, -+ kernel_size=3, -+ stride=2, -+ padding=1) -+ self.embed_positions = nn.Embedding(self.max_source_positions, -+ embed_dim) -+ self.start_layer, self.end_layer, self.layers = make_layers( -+ config.encoder_layers, -+ lambda prefix: WhisperEncoderLayer(vllm_config=vllm_config, -+ prefix=f"{prefix}.layers"), -+ prefix=f"{prefix}.layers", ++ def expand_prefill( ++ self, ++ y: torch.Tensor, ++ x: torch.Tensor, ++ w_t_all: torch.Tensor, ++ add_input: bool, ++ ): ++ #No LoRA request, so return directly ++ if self.no_lora: ++ return ++ sgmv_expand( ++ x, ++ w_t_all, ++ y, ++ *self.prefill_metadata, ++ add_input, + ) -+ self.layer_norm = nn.LayerNorm(config.d_model) -+ -+ with torch.no_grad(): -+ self.embed_positions.weight.copy_( -+ sinusoids(*self.embed_positions.weight.shape)) + -+ def forward( ++ def expand_decode( + self, -+ input_features: Union[torch.Tensor, List[torch.Tensor]], -+ kv_caches: List[torch.Tensor], -+ attn_metadata: AttentionMetadata, ++ y: torch.Tensor, ++ x: torch.Tensor, ++ w_t_all: torch.Tensor, ++ add_input: bool, + ): -+ hidden_states = [] -+ for features in input_features: -+ embeds = nn.functional.gelu(self.conv1(features)) -+ embeds = nn.functional.gelu(self.conv2(embeds)) 
-+ embeds = embeds.permute(1, 0) -+ embeds = embeds + self.embed_positions.weight[:embeds.size(0), :] -+ hidden_states.append(embeds) -+ hidden_states = torch.cat(hidden_states) -+ -+ for idx, encoder_layer in enumerate(self.layers): -+ hidden_states = encoder_layer( -+ hidden_states, -+ kv_cache=kv_caches[idx], -+ attn_metadata=attn_metadata, -+ ) -+ -+ hidden_states = self.layer_norm(hidden_states) -+ return hidden_states -+ -+ -+class WhisperDecoder(nn.Module): ++ bgmv_expand(x, w_t_all, y, self.token_lora_indices, add_input) + -+ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): -+ super().__init__() -+ config = vllm_config.model_config.hf_config -+ self.layerdrop = config.decoder_layerdrop -+ self.padding_idx = config.pad_token_id -+ self.max_target_positions = config.max_target_positions -+ self.max_source_positions = config.max_source_positions -+ self.embed_scale = (math.sqrt(config.d_model) -+ if config.scale_embedding else 1.0) -+ -+ self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, -+ self.padding_idx) -+ self.embed_positions = WhisperPositionalEmbedding( -+ self.max_target_positions, config.d_model) -+ self.start_layer, self.end_layer, self.layers = make_layers( -+ config.decoder_layers, -+ lambda prefix: WhisperDecoderLayer(vllm_config=vllm_config, -+ prefix=f"{prefix}.layers"), -+ prefix=f"{prefix}.layers", ++ def expand_slice_prefill( ++ self, ++ y: torch.Tensor, ++ x: torch.Tensor, ++ w_t_all: torch.Tensor, ++ y_offset: Optional[int], ++ y_slice_size: Optional[int], ++ add_input: bool, ++ ): ++ #No LoRA request, so return directly ++ if self.no_lora: ++ return ++ sgmv_expand_slice( ++ x, ++ w_t_all, ++ y, ++ *self.prefill_metadata, ++ y_offset, ++ y_slice_size, ++ add_input, + ) -+ self.layer_norm = nn.LayerNorm(config.d_model) + -+ def forward( ++ def expand_slice_decode( + self, -+ input_ids, -+ positions: torch.Tensor, -+ encoder_hidden_states: Optional[torch.Tensor], -+ kv_caches: List[torch.Tensor], -+ attn_metadata: AttentionMetadata, ++ y: torch.Tensor, ++ x: torch.Tensor, ++ w_t_all: torch.Tensor, ++ y_offset: Optional[int], ++ y_slice_size: Optional[int], ++ add_input: bool, + ): -+ inputs_embeds = self.get_input_embeddings(input_ids) -+ positions = self.embed_positions(positions) -+ hidden_states = inputs_embeds + positions -+ -+ for idx, decoder_layer in enumerate(self.layers): -+ hidden_states = decoder_layer( -+ hidden_states, -+ encoder_hidden_states=encoder_hidden_states, -+ kv_cache=kv_caches[idx], -+ attn_metadata=attn_metadata, -+ ) -+ -+ hidden_states = self.layer_norm(hidden_states) -+ return hidden_states ++ bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset, ++ y_slice_size, add_input) + -+ def get_input_embeddings( ++ def add_shrink( + self, -+ input_ids: torch.Tensor, -+ ) -> torch.Tensor: -+ return self.embed_tokens(input_ids) -+ ++ y: torch.Tensor, ++ x: torch.Tensor, ++ w_t_all: torch.Tensor, ++ scale: float, ++ ): ++ """ ++ Perform the ` y+=x@w_t_all` computation, which is suitable for the ++ GEMM of lora'a. ++ When `is_prefill is` true, it indicates that it is currently the ++ prefill stage, and the `shrink_prefill` function should be called. ++ Otherwise, it is the decode stage, and the shrink_decode function ++ should be called. 
++ """ ++ shrink_fun: Callable = (self.shrink_prefill ++ if self.is_prefill else self.shrink_decode) ++ shrink_fun(y, x, w_t_all, scale) + -+class WhisperModel(nn.Module): ++ def add_expand( ++ self, ++ y: torch.Tensor, ++ x: torch.Tensor, ++ w_t_all: torch.Tensor, ++ add_input: bool = True, ++ ): ++ """ ++ Perform the ` y+=x@w_t_all` computation, which is suitable for the ++ GEMM of lora'b. ++ When `is_prefill` is true, it indicates that it is currently the ++ prefill stage, and the `expand_prefill` function should be called. ++ Otherwise, it is the decode stage, and the expand_decode function ++ should be called. ++ """ + -+ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): -+ super().__init__() -+ self.encoder = WhisperEncoder(vllm_config=vllm_config, -+ prefix=f"{prefix}.encoder") -+ self.decoder = WhisperDecoder(vllm_config=vllm_config, -+ prefix=f"{prefix}.decoder") ++ expand_fun: Callable = (self.expand_prefill ++ if self.is_prefill else self.expand_decode) ++ expand_fun(y, x, w_t_all, add_input) + -+ def forward( -+ self, -+ input_features: Optional[Union[torch.Tensor, List[torch.Tensor]]], -+ input_ids: Optional[torch.Tensor], -+ positions: torch.Tensor, -+ kv_caches: List[torch.Tensor], -+ attn_metadata: AttentionMetadata, -+ ) -> torch.Tensor: -+ encoder_outputs = self.get_encoder_outputs( -+ input_features, -+ kv_caches=kv_caches, -+ attn_metadata=attn_metadata, -+ ) -+ decoder_outputs = self.decoder( -+ input_ids=input_ids, -+ positions=positions, -+ encoder_hidden_states=encoder_outputs, -+ kv_caches=kv_caches, -+ attn_metadata=attn_metadata, -+ ) -+ return decoder_outputs ++ def add_expand_slice(self, ++ y: torch.Tensor, ++ x: torch.Tensor, ++ w_t_all: torch.Tensor, ++ y_offset: Optional[int], ++ y_slice_size: Optional[int], ++ add_input: bool = True): ++ """ ++ Similar to `add_expand` ++ """ + -+ def get_encoder_outputs( -+ self, -+ input_features: Optional[Union[torch.Tensor, List[torch.Tensor]]], -+ kv_caches: List[torch.Tensor], -+ attn_metadata: AttentionMetadata, -+ ) -> Optional[torch.Tensor]: -+ if input_features is None: -+ return None -+ return self.encoder( -+ input_features, -+ kv_caches=kv_caches, -+ attn_metadata=attn_metadata, -+ ) ++ expand_slice_fun: Callable = (self.expand_slice_prefill ++ if self.is_prefill else ++ self.expand_slice_decode) ++ expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) + -+ def load_weights(self, weights: Iterable[Tuple[str, -+ torch.Tensor]]) -> Set[str]: -+ stacked_params_mapping = [ -+ # (param_name, shard_name, shard_id) -+ (".self_attn.qkv_proj", ".self_attn.q_proj", "q"), -+ (".self_attn.qkv_proj", ".self_attn.k_proj", "k"), -+ (".self_attn.qkv_proj", ".self_attn.v_proj", "v"), -+ (".encoder_attn.kv_proj", ".encoder_attn.k_proj", "k"), -+ (".encoder_attn.kv_proj", ".encoder_attn.v_proj", "v"), -+ ] -+ params_dict = dict(self.named_parameters()) -+ loaded_params: Set[str] = set() -+ for name, loaded_weight in weights: -+ for param_name, weight_name, shard_id in stacked_params_mapping: -+ if weight_name not in name: -+ continue -+ name = name.replace(weight_name, param_name) -+ # Skip loading extra bias for GPTQ models. 
-+ if name.endswith(".bias") and name not in params_dict: -+ continue ++ def add_lora(self, ++ y: torch.Tensor, ++ x: torch.Tensor, ++ wa_t_all: torch.Tensor, ++ wb_t_all: torch.Tensor, ++ scale: float, ++ y_offset: Optional[int] = None, ++ y_slice_size: Optional[int] = None, ++ *, ++ buffer: Optional[torch.Tensor] = None) -> None: ++ """ ++ Semantics: ++ y[i] += ( ++ x[i].unsqueeze(0) ++ @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) ++ @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) ++ * scale ++ ).squeeze(0) ++ Args: ++ y (torch.Tensor): Output tensor. Will be changed in-place. ++ x (torch.Tensor): Input tensor ++ wa_t_all (torch.Tensor): lora_a's weight ++ wb_t_all (torch.Tensor): lora_b's weight ++ scale (float): Scaling factor. ++ y_offset (Optional[int], optional): Offset to apply to the starting ++ column of y. ++ y_slice_size (Optional[int], optional): Size of the y column slice.. ++ buffer (Optional[torch.Tensor], optional): Defaults to None. ++ """ ++ y_org = y ++ y = y.view(-1, y.shape[-1]) ++ x = x.view(-1, x.shape[-1]) ++ r = wb_t_all.size(-1) ++ if buffer is None: ++ # We set the buffer to be float32 by default ,refer to: ++ # https://github.com/triton-lang/triton/issues/1387 ++ buffer = torch.zeros((x.size(0), r), ++ dtype=torch.float32, ++ device=x.device) + -+ param = params_dict[name] -+ weight_loader = param.weight_loader -+ weight_loader(param, loaded_weight, shard_id) -+ break -+ else: -+ # Skip loading extra bias for GPTQ models. -+ if name.endswith(".bias") and name not in params_dict: -+ continue ++ self.add_shrink(buffer, x, wa_t_all, scale) ++ if y_offset is None and y_slice_size is None: ++ self.add_expand(y, buffer, wb_t_all, add_input=True) ++ else: ++ self.add_expand_slice(y, ++ buffer, ++ wb_t_all, ++ y_offset, ++ y_slice_size, ++ add_input=True) ++ y = y.view_as(y_org) + -+ param = params_dict[name] -+ weight_loader = getattr(param, "weight_loader", -+ default_weight_loader) -+ weight_loader(param, loaded_weight) -+ loaded_params.add(name) -+ return loaded_params ++ def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, ++ lora_a_stacked: Tuple[torch.Tensor, ++ torch.Tensor, ++ torch.Tensor], ++ lora_b_stacked: Tuple[torch.Tensor, ++ torch.Tensor, ++ torch.Tensor], ++ scale: float, ++ output_slices: Tuple[int, ...]) -> None: ++ """ ++ Applies lora to each input. Similar to add_lora, This method is ++ used for layers that are composed of multiple sublayers ++ (slices) packed together. 
++ """ ++ y_org = y ++ x = x.view(-1, x.shape[-1]) ++ y = y.view(-1, y.shape[-1]) ++ offset_left = 0 ++ # TODO fuse these kernels ++ for slice_idx in range(len(output_slices)): ++ self.add_lora(y, x, lora_a_stacked[slice_idx], ++ lora_b_stacked[slice_idx], scale, offset_left, ++ output_slices[slice_idx]) ++ offset_left += output_slices[slice_idx] + ++ y = y.view_as(y_org) + -+def get_max_whisper_audio_tokens(ctx: InputContext) -> int: -+ return ctx.model_config.hf_config.max_source_positions ++ def add_lora_logits(self, ++ y: torch.Tensor, ++ x: torch.Tensor, ++ wa_t_all: torch.Tensor, ++ wb_t_all: torch.Tensor, ++ scale, ++ *, ++ buffer: Optional[torch.Tensor] = None) -> None: ++ """ ++ LogitsProcessorWithLoRA always using bgmv ++ """ ++ y_org = y ++ y = y.view(-1, y.shape[-1]) ++ x = x.view(-1, x.shape[-1]) ++ r = wb_t_all.size(-1) ++ if buffer is None: ++ # We set the buffer to be float32 by default ,refer to: ++ # https://github.com/triton-lang/triton/issues/1387 ++ buffer = torch.zeros((x.size(0), r), ++ dtype=torch.float32, ++ device=x.device) + ++ bgmv_shrink(x, wa_t_all, buffer, self.sampler_indices, scale) ++ bgmv_expand(buffer, wb_t_all, y, self.sampler_indices, add_inputs=True) ++ y = y.view_as(y_org) +diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py +index bb6d2808e..341339b13 100644 +--- a/vllm/lora/punica_wrapper/punica_gpu.py ++++ b/vllm/lora/punica_wrapper/punica_gpu.py +@@ -13,10 +13,14 @@ import torch + import vllm.envs as envs + from vllm.lora.layers import LoRAMapping + from vllm.triton_utils import HAS_TRITON ++from vllm.platforms import current_platform + +-if HAS_TRITON: ++if HAS_TRITON and not current_platform.is_xpu(): + from vllm.lora.ops.triton_ops import (LoRAKernelMeta, lora_expand, + lora_shrink) ++elif current_platform.is_xpu(): ++ # TODO(xiangyu): check here ++ pass + + from .punica_base import PunicaWrapperBase + +diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py +index 1de0f499c..675d601fa 100644 +--- a/vllm/model_executor/layers/activation.py ++++ b/vllm/model_executor/layers/activation.py +@@ -260,6 +260,12 @@ class QuickGELU(CustomOp): + out = torch.empty_like(x) + self.op(out, x) + return out ++ # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: ++ # from vllm._ipex_ops import ipex_ops as ops + -+def dummy_encoder_data_for_whisper(ctx: InputContext, seq_len: int, -+ mm_counts: Mapping[str, int]): -+ assert mm_counts["audio"] == 1 -+ num_tokens = get_max_whisper_audio_tokens(ctx) -+ processor = cached_get_processor(ctx.model_config.model) -+ chunk_length = processor.feature_extractor.chunk_length -+ sampling_rate = processor.feature_extractor.sampling_rate -+ num_samples = chunk_length * sampling_rate -+ return DummyData( -+ SequenceData.from_prompt_token_counts((0, num_tokens)), -+ {"audio": [(np.zeros(num_samples), sampling_rate)]}, -+ ) ++ # out = torch.empty_like(x) ++ # ops.gelu_quick(out, x) ++ # return out + + # TODO implement forward_xpu for QuickGELU + # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: +diff --git a/vllm/model_executor/layers/fused_moe/ipex_llm_moe.py b/vllm/model_executor/layers/fused_moe/ipex_llm_moe.py +new file mode 100644 +index 000000000..d55d78b77 +--- /dev/null ++++ b/vllm/model_executor/layers/fused_moe/ipex_llm_moe.py +@@ -0,0 +1,372 @@ ++# SPDX-License-Identifier: Apache-2.0 + ++from abc import abstractmethod ++from enum import Enum ++from typing import Callable, List, Optional, Tuple ++import os + -+def 
input_processor_for_whisper(ctx: InputContext, inputs): -+ multi_modal_data = inputs["encoder"]["multi_modal_data"] -+ if isinstance(multi_modal_data["audio"], list): -+ assert len(multi_modal_data["audio"]) == 1 -+ multi_modal_data["audio"] = multi_modal_data["audio"][0] -+ # Resample and process audio -+ audio, orig_sr = multi_modal_data["audio"] -+ processor = cached_get_processor(ctx.model_config.model) -+ target_sr = processor.feature_extractor.sampling_rate -+ audio = resample_audio(audio, orig_sr=orig_sr, target_sr=target_sr) -+ multi_modal_data["audio"] = (audio, target_sr) -+ # Pre-allocate placeholder tokens in encoder sequence -+ num_tokens = get_max_whisper_audio_tokens(ctx) -+ inputs["encoder"]["prompt_token_ids"] = [0] * num_tokens -+ return inputs ++import torch ++from torch import nn ++import torch.nn.functional as F ++from torch.nn.parameter import UninitializedParameter + ++import vllm.envs as envs ++from vllm.config import get_current_vllm_config ++from vllm.distributed import (get_dp_group, get_tensor_model_parallel_rank, ++ get_tensor_model_parallel_world_size, ++ tensor_model_parallel_all_reduce) ++from vllm.forward_context import ForwardContext, get_forward_context ++from vllm.logger import init_logger ++from vllm.model_executor.custom_op import CustomOp ++from vllm.model_executor.layers.quantization.base_config import ( ++ QuantizationConfig, QuantizeMethodBase) ++from vllm.model_executor.utils import set_weight_attrs ++from vllm.platforms import current_platform ++from vllm.platforms.interface import CpuArchEnum ++from vllm.utils import direct_register_custom_op + -+def input_mapper_for_whisper( -+ ctx: InputContext, -+ multi_modal_data: Union[np.ndarray, List[np.ndarray]], -+) -> MultiModalKwargs: -+ if not isinstance(multi_modal_data, list): -+ multi_modal_data = [multi_modal_data] ++fused_experts = None # type: ignore ++fused_moe_pallas = None # type: ignore ++if current_platform.is_xpu(): ++ from .moe_pallas import fused_moe_xpu ++else: ++ fused_moe_xpu = None # type: ignore ++logger = init_logger(__name__) + -+ assert len(multi_modal_data) == 1 ++from ipex_llm.ggml.quantize import ggml_tensor_qtype, gguf_mixed_qtype ++import ipex_llm.ggml.model.llama.llama_cpp as ggml ++from ipex_llm.transformers.low_bit_linear import LowBitLinear, FP4Params, \ ++ FP16Linear, BF16Linear, ggml_convert_qtype, ggml_int4_convert_fp32 + -+ if len(multi_modal_data) == 0: -+ return MultiModalKwargs() ++from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase ++from vllm.model_executor.layers.quantization.gguf import GGUFUninitializedParameter ++from torch.nn.parameter import Parameter, UninitializedParameter ++from vllm.model_executor.layers.activation import SiluAndMul + -+ processor = cached_get_processor(ctx.model_config.model) -+ sampling_rate = processor.feature_extractor.sampling_rate + -+ audios = [audio for audio, _ in multi_modal_data] ++import gguf ++from gguf import GGMLQuantizationType as WeightType ++import ipex_llm.ggml.model.llama.llama_cpp as ggml ++from ipex_llm.ggml.quantize import ggml_tensor_qtype + -+ kwargs = processor(audios, -+ sampling_rate=sampling_rate, -+ return_tensors="pt") -+ kwargs["input_features"] = kwargs["input_features"].squeeze(0).to( -+ ctx.model_config.dtype) ++from ipex_llm.transformers.low_bit_linear import MatMulLowBit ++import xe_linear ++import xe_batch ++import xe_addons + -+ return MultiModalKwargs(kwargs) ++from vllm._ipex_ops import ipex_ops ++import vllm._C.ops + ++# @CustomOp.register("unquantized_fused_moe") ++class 
IPEXLLMFusedMoEMethod(FusedMoEMethodBase): ++ """MoE method without quantization.""" ++ ++ def create_weights(self, layer: torch.nn.Module, num_experts: int, ++ hidden_size: int, intermediate_size_per_partition: int, ++ params_dtype: torch.dtype, **extra_weight_attrs): ++ ++ # Fused gate_up_proj (column parallel) ++ w13_weight = torch.nn.Parameter(torch.empty( ++ num_experts, ++ 2 * intermediate_size_per_partition, ++ hidden_size, ++ dtype=params_dtype), ++ requires_grad=False) ++ layer.register_parameter("w13_weight", w13_weight) ++ set_weight_attrs(w13_weight, extra_weight_attrs) ++ ++ # down_proj (row parallel) ++ w2_weight = torch.nn.Parameter(torch.empty( ++ num_experts, ++ hidden_size, ++ intermediate_size_per_partition, ++ dtype=params_dtype), ++ requires_grad=False) ++ layer.register_parameter("w2_weight", w2_weight) ++ set_weight_attrs(w2_weight, extra_weight_attrs) ++ ++ ++ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: ++ # w1: [num_experts, intermediate_size * 2, hidden_size] ++ # w2: [num_experts, hidden_size, intermediate_size] ++ self.num_experts = layer.w13_weight.data.shape[0] ++ self.intermediate_size = layer.w2_weight.data.shape[2] ++ self.hidden_size = layer.w13_weight.data.shape[2] + -+@INPUT_REGISTRY.register_dummy_encoder_data(dummy_encoder_data_for_whisper) -+@INPUT_REGISTRY.register_input_processor(input_processor_for_whisper) -+@MULTIMODAL_REGISTRY.register_input_mapper("audio", input_mapper_for_whisper) -+@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( -+ "audio", get_max_whisper_audio_tokens) -+class WhisperForConditionalGeneration(nn.Module, SupportsMultiModal): ++ local_rank = os.environ["LOCAL_RANK"] ++ self.device = torch.device(f"xpu:{local_rank}") ++ lowbit = os.getenv("IPEX_LLM_LOWBIT", "sym_int4") ++ qtype = ggml_tensor_qtype[lowbit] ++ self.qtype = qtype ++ ++ w13_params = [] ++ for i in range(self.num_experts): ++ cur_params = FP4Params(data=layer.w13_weight.data[i,:,:], ++ requires_grad=False, ++ quantized=False, ++ _shape=None, ++ convert_shape_only=False, ++ qtype=self.qtype).to(self.device) ++ w13_params.append(cur_params) ++ layer._parameters['w13_weight'] = None ++ self.qw1_weight = w13_params ++ ++ w2_params = [] ++ for i in range(self.num_experts): ++ cur_params = FP4Params(data=layer.w2_weight.data[i,:,:], ++ requires_grad=False, ++ quantized=False, ++ _shape=None, ++ convert_shape_only=False, ++ qtype=self.qtype).to(self.device) ++ w2_params.append(cur_params) ++ layer._parameters['w2_weight'] = None ++ self.qw2_weight = w2_params ++ ++ w1 = self.qw1_weight ++ self.w1_addrs = [expert.data_ptr() for expert in w1] ++ self.w1_addrs = torch.tensor(self.w1_addrs, device=self.device, dtype=torch.uint64) ++ w2 = self.qw2_weight ++ self.w2_addrs = [expert.data_ptr() for expert in w2] ++ self.w2_addrs = torch.tensor(self.w2_addrs, device=self.device, dtype=torch.uint64) ++ ++ logger.warning_once("model processed.") + -+ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): -+ super().__init__() -+ config = vllm_config.model_config.hf_config -+ quant_config = vllm_config.quant_config -+ self.config = config -+ self.dtype = vllm_config.model_config.dtype -+ -+ self.model = WhisperModel(vllm_config=vllm_config, prefix=prefix) -+ self.unpadded_vocab_size = config.vocab_size -+ self.proj_out = ParallelLMHead(config.vocab_size, -+ config.d_model, -+ quant_config=quant_config) -+ self.proj_out = self.proj_out.tie_weights( -+ self.model.decoder.embed_tokens) -+ logit_scale = getattr(config, "logit_scale", 1.0) -+ 
self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, -+ config.vocab_size, logit_scale) -+ self.sampler = Sampler() + -+ def forward( ++ def apply( + self, -+ input_ids: torch.Tensor, -+ positions: torch.Tensor, -+ kv_caches: List[torch.Tensor], -+ attn_metadata: AttentionMetadata, -+ **kwargs, ++ layer: torch.nn.Module, ++ x: torch.Tensor, ++ router_logits: torch.Tensor, ++ top_k: int, ++ renormalize: bool, ++ use_grouped_topk: bool = False, ++ topk_group: Optional[int] = None, ++ num_expert_group: Optional[int] = None, ++ global_num_experts: int = -1, ++ expert_map: Optional[torch.Tensor] = None, ++ custom_routing_function: Optional[Callable] = None, ++ scoring_func: str = "softmax", ++ e_score_correction_bias: Optional[torch.Tensor] = None, ++ apply_router_weight_on_input: bool = False, ++ activation: str = "silu", + ) -> torch.Tensor: -+ audio_input = self._parse_and_validate_audio_input(**kwargs) -+ decoder_outputs = self.model( -+ input_features=audio_input["input_features"], -+ input_ids=input_ids, -+ positions=positions, -+ kv_caches=kv_caches, -+ attn_metadata=attn_metadata, -+ ) -+ return decoder_outputs ++ return self.forward_xpu( ++ x=x, ++ layer=layer, ++ router_logits=router_logits, ++ top_k=top_k, ++ renormalize=renormalize, ++ use_grouped_topk=use_grouped_topk, ++ topk_group=topk_group, ++ num_expert_group=num_expert_group, ++ global_num_experts=global_num_experts, ++ expert_map=expert_map, ++ custom_routing_function=custom_routing_function, ++ scoring_func=scoring_func, ++ e_score_correction_bias=e_score_correction_bias, ++ activation=activation, ++ apply_router_weight_on_input=apply_router_weight_on_input) ++ + -+ def get_multimodal_embeddings( ++ def forward_xpu( + self, -+ kv_caches: List[torch.Tensor], -+ attn_metadata: AttentionMetadata, ++ layer: torch.nn.Module, ++ x: torch.Tensor, ++ use_grouped_topk: bool, ++ top_k: int, ++ router_logits: torch.Tensor, ++ renormalize: bool, ++ topk_group: Optional[int] = None, ++ num_expert_group: Optional[int] = None, ++ global_num_experts: int = -1, ++ expert_map: Optional[torch.Tensor] = None, ++ custom_routing_function: Optional[Callable] = None, ++ scoring_func: str = "softmax", ++ e_score_correction_bias: Optional[torch.Tensor] = None, ++ activation: str = "silu", ++ apply_router_weight_on_input: bool = False, + **kwargs, -+ ) -> Optional[NestedTensors]: -+ # TODO: This method does not obey the interface for SupportsMultiModal. -+ # Refactor this once encoder/decoder support is implemented in V1. 
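# Unfused, dense-weight reference for the routing and expert math that the
# patch's `fused_moe_xpu` / `fused_moe_xpu_decode` implement with low-bit
# XPU kernels: softmax over router logits, top-k (optionally renormalized),
# SiLU-and-mul expert MLPs, then a weighted sum. A minimal sketch only.
import torch
import torch.nn.functional as F


def moe_reference(hidden, w13, w2, router_logits, topk, renormalize=True):
    # hidden: [tokens, hidden]; w13: [E, 2*inter, hidden]; w2: [E, hidden, inter]
    probs = router_logits.softmax(dim=-1, dtype=torch.float)
    topk_w, topk_idx = probs.topk(topk, dim=-1)
    if renormalize:
        topk_w = topk_w / topk_w.sum(dim=-1, keepdim=True)
    topk_w = topk_w.to(hidden.dtype)
    inter = w13.shape[1] // 2
    out = torch.zeros_like(hidden)
    for t in range(hidden.shape[0]):
        for k in range(topk):
            e = topk_idx[t, k]
            gate_up = hidden[t] @ w13[e].T                    # fused gate/up proj
            act = F.silu(gate_up[:inter]) * gate_up[inter:]   # SiluAndMul
            out[t] += topk_w[t, k] * (act @ w2[e].T)          # down proj
    return out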
-+ audio_input = self._parse_and_validate_audio_input(**kwargs) -+ return self.model.get_encoder_outputs( -+ audio_input["input_features"], -+ kv_caches=kv_caches, -+ attn_metadata=attn_metadata, -+ ) -+ -+ def get_input_embeddings( ++ ): ++ num_tokens = x.shape[:-1].numel() ++ if not envs.VLLM_USE_V1 and num_tokens > 256: ++ return self.fused_moe_xpu(hidden_states=x, ++ w1=self.qw1_weight, ++ w2=self.qw2_weight, ++ topk=top_k, ++ gating_output=router_logits, ++ global_num_experts=global_num_experts, ++ expert_map=expert_map, ++ renormalize=renormalize) ++ else: ++ return self.fused_moe_xpu_decode(hidden_states=x, ++ w1=self.qw1_weight, ++ w2=self.qw2_weight, ++ topk=top_k, ++ gating_output=router_logits, ++ global_num_experts=global_num_experts, ++ expert_map=expert_map, ++ renormalize=renormalize) ++ ++ def fused_moe_xpu_decode( + self, -+ input_ids: torch.Tensor, -+ multimodal_embeddings: Optional[NestedTensors] = None, -+ attn_metadata: Optional[AttentionMetadata] = None, ++ hidden_states: torch.Tensor, ++ w1, ++ w2, ++ gating_output: torch.Tensor, ++ topk: int, ++ global_num_experts, ++ expert_map, ++ renormalize: bool, + ) -> torch.Tensor: -+ # TODO: This method just returns the decoder sequence embeddings since -+ # Whisper does not have encoder text tokens. Refactor this once -+ # encoder/decoder support is implemented in V1. -+ return self.model.decoder.get_input_embeddings(input_ids) -+ -+ def _parse_and_validate_audio_input( -+ self, **kwargs: object) -> WhisperAudioInputs: -+ input_features = kwargs.pop("input_features", None) -+ -+ if input_features is not None: -+ if not isinstance(input_features, (torch.Tensor, list)): -+ raise ValueError("Incorrect type of audio features. " -+ f"Got type: {type(input_features)}") -+ input_features = [feat.to(self.dtype) for feat in input_features] -+ -+ return WhisperAudioInputs(input_features=input_features) -+ -+ def compute_logits(self, hidden_states: torch.Tensor, -+ sampling_metadata: SamplingMetadata) -> torch.Tensor: -+ logits = self.logits_processor(self.proj_out, hidden_states, -+ sampling_metadata) -+ return logits -+ -+ def sample( -+ self, -+ logits: torch.Tensor, -+ sampling_metadata: SamplingMetadata, -+ ) -> Optional[SamplerOutput]: -+ next_tokens = self.sampler(logits, sampling_metadata) -+ return next_tokens ++ """ ++ Args: ++ hidden_states: [*, hidden_size] ++ w1: [num_experts, intermediate_size * 2, hidden_size] ++ w2: [num_experts, hidden_size, intermediate_size] ++ gating_output: [*, num_experts] ++ """ ++ orig_shape = hidden_states.shape ++ hidden_size = hidden_states.shape[-1] ++ num_tokens = hidden_states.shape[:-1].numel() ++ num_experts = self.num_experts ++ intermediate_size = self.intermediate_size ++ qtype = self.qtype ++ ++ device = hidden_states.device ++ dtype = hidden_states.dtype ++ hidden_states = hidden_states.view(num_tokens, hidden_size) ++ gating_output = gating_output.view(num_tokens, global_num_experts) ++ # topk_weights, topk_indices = F.softmax(gating_output, dim=-1, dtype=torch.float).topk(topk, dim=-1) ++ # if renormalize: ++ # topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) ++ topk_indices, topk_weights = xe_addons.moe_softmax_topk(gating_output, topk, renormalize) ++ topk_weights = topk_weights.to(dtype) ++ if expert_map is not None: ++ expert_map = expert_map.to(device=device) ++ topk_indices = expert_map[topk_indices] ++ ++ topk_indices = topk_indices.flatten() ++ token_indices = torch.arange(num_tokens, device=device).repeat_interleave(topk) ++ ++ # padding_len = 
cur_topk_indices[cur_topk_indices == -1].shape[0] + -+ def load_weights(self, weights: Iterable[Tuple[str, -+ torch.Tensor]]) -> Set[str]: -+ loader = AutoWeightsLoader(self, skip_prefixes=["proj_out."]) -+ loaded_weights = [(name, loaded_weight) -+ for name, loaded_weight in weights] -+ mapper = WeightsMapper({".fc1.": ".mlp.fc1.", ".fc2.": ".mlp.fc2."}) -+ return loader.load_weights(loaded_weights, mapper=mapper) -\ No newline at end of file -diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py -index 39ead08c2..6f1cc9d5e 100644 ---- a/vllm/model_executor/utils.py -+++ b/vllm/model_executor/utils.py -@@ -3,10 +3,9 @@ from typing import Any, Dict, Optional - - import torch - --from vllm.platforms import current_platform -- - - def set_random_seed(seed: int) -> None: -+ from vllm.platforms import current_platform - current_platform.seed_everything(seed) - - -@@ -38,6 +37,7 @@ def set_weight_attrs( - # This sometimes causes OOM errors during model loading. To avoid this, - # we sync the param tensor after its weight loader is called. - # TODO(woosuk): Remove this hack once we have a better solution. -+ from vllm.platforms import current_platform - if current_platform.is_tpu() and key == "weight_loader": - value = _make_synced_weight_loader(value) - setattr(weight, key, value) -diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py -index 9255e062e..e58bbe817 100644 ---- a/vllm/multimodal/__init__.py -+++ b/vllm/multimodal/__init__.py -@@ -1,8 +1,7 @@ - from .base import MultiModalPlaceholderMap, MultiModalPlugin --from .inputs import (BatchedTensorInputs, MultiModalData, -- MultiModalDataBuiltins, MultiModalDataDict, -- MultiModalKwargs, MultiModalPlaceholderDict, -- NestedTensors) -+from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins, -+ MultiModalDataDict, MultiModalKwargs, -+ MultiModalPlaceholderDict, NestedTensors) - from .registry import MultiModalRegistry - - MULTIMODAL_REGISTRY = MultiModalRegistry() -@@ -16,7 +15,7 @@ See also: - - __all__ = [ - "BatchedTensorInputs", -- "MultiModalData", -+ "ModalityData", - "MultiModalDataBuiltins", - "MultiModalDataDict", - "MultiModalKwargs", -diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py -index ed3bb82bf..de80f22ba 100644 ---- a/vllm/multimodal/audio.py -+++ b/vllm/multimodal/audio.py -@@ -1,17 +1,26 @@ -+import base64 -+from io import BytesIO -+from pathlib import Path -+ - import numpy as np - import numpy.typing as npt - - from vllm.inputs.registry import InputContext - from vllm.utils import PlaceholderModule - --from .base import MultiModalPlugin --from .inputs import AudioItem, MultiModalData, MultiModalKwargs -+from .base import MediaIO, MultiModalPlugin -+from .inputs import AudioItem, ModalityData, MultiModalKwargs - - try: - import librosa - except ImportError: - librosa = PlaceholderModule("librosa") # type: ignore[assignment] - -+try: -+ import soundfile -+except ImportError: -+ soundfile = PlaceholderModule("soundfile") # type: ignore[assignment] ++ x = hidden_states[token_indices] + - - class AudioPlugin(MultiModalPlugin): - """Plugin for audio data.""" -@@ -22,7 +31,7 @@ class AudioPlugin(MultiModalPlugin): - def _default_input_mapper( - self, - ctx: InputContext, -- data: MultiModalData[AudioItem], -+ data: ModalityData[AudioItem], - **mm_processor_kwargs, - ) -> MultiModalKwargs: - raise NotImplementedError("There is no default audio input mapper") -@@ -39,3 +48,28 @@ def resample_audio( - target_sr: float, - ) -> npt.NDArray[np.floating]: - 
return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr) ++ # x: [bsz * seq_len * num_selected_experts, hidden_size] ++ # w1_out: [bsz * seq_len * num_selected_experts, intermediate_size * 2] ++ # topk_indices: [bsz * seq_len * num_selected_experts] ++ # res: [bsz * seq_len * num_selected_experts, hidden_size] ++ x = vllm._C.ops.fused_moe_forward(x, topk_indices, self.w1_addrs, self.w2_addrs, hidden_size, intermediate_size, qtype) + ++ # if padding_len > 0: ++ # x = vllm._C.ops.fused_moe_forward(x[padding_len:], cur_topk_indices[padding_len:], self.w1_addrs, self.w2_addrs, hidden_size, intermediate_size, qtype) ++ # else: ++ # x = vllm._C.ops.fused_moe_forward(x, cur_topk_indices, self.w1_addrs, self.w2_addrs, hidden_size, intermediate_size, qtype) + -+class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]): ++ # if padding_len > 0: ++ # padding_shape = (padding_len, hidden_size) ++ # padding_x = torch.zeros(padding_shape, dtype=x.dtype, device=x.device) ++ # x = torch.cat((padding_x, x), dim=0) + -+ def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]: -+ return librosa.load(BytesIO(data), sr=None) ++ x = x.reshape(-1, topk, hidden_size) ++ x = x * topk_weights.unsqueeze_(dim=-1) ++ x = x.sum(dim=-2) ++ x = x.reshape(orig_shape) ++ return x + -+ def load_base64( ++ def fused_moe_xpu( + self, -+ media_type: str, -+ data: str, -+ ) -> tuple[npt.NDArray, float]: -+ return self.load_bytes(base64.b64decode(data)) -+ -+ def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]: -+ return librosa.load(filepath, sr=None) -+ -+ def encode_base64(self, media: tuple[npt.NDArray, float]) -> str: -+ audio, sr = media -+ -+ with BytesIO() as buffer: -+ soundfile.write(buffer, audio, sr, format="WAV") -+ data = buffer.getvalue() -+ -+ return base64.b64encode(data).decode('utf-8') -diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py -index 1e5a46946..7f4029e72 100644 ---- a/vllm/multimodal/base.py -+++ b/vllm/multimodal/base.py -@@ -1,6 +1,7 @@ - from abc import ABC, abstractmethod - from collections import defaultdict --from typing import (TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple, -+from pathlib import Path -+from typing import (TYPE_CHECKING, Any, Callable, Generic, NamedTuple, - Optional, Sequence, Tuple, Type, TypeVar, Union) - - from torch import nn -@@ -14,12 +15,12 @@ if TYPE_CHECKING: - from vllm.config import ModelConfig - from vllm.sequence import SequenceGroupMetadata - --from .inputs import (MultiModalData, MultiModalDataDict, MultiModalKwargs, -+from .inputs import (ModalityData, MultiModalDataDict, MultiModalKwargs, - PlaceholderRange) - - logger = init_logger(__name__) - --MultiModalInputMapper = Callable[[InputContext, MultiModalData[object]], -+MultiModalInputMapper = Callable[[InputContext, ModalityData[object]], - MultiModalKwargs] - """ - Return a dictionary to be passed as keyword arguments to -@@ -68,7 +69,7 @@ class MultiModalPlugin(ABC): - def _default_input_mapper( - self, - ctx: InputContext, -- data: MultiModalData[Any], -+ data: ModalityData[Any], - **mm_processor_kwargs, - ) -> MultiModalKwargs: - """ -@@ -117,8 +118,8 @@ class MultiModalPlugin(ABC): - def map_input( - self, - model_config: "ModelConfig", -- data: MultiModalData[Any], -- mm_processor_kwargs: Optional[Dict[str, Any]], -+ data: ModalityData[Any], -+ mm_processor_kwargs: Optional[dict[str, Any]], - ) -> MultiModalKwargs: - """ - Transform the data into a dictionary of model inputs using the -@@ -254,10 +255,10 @@ class MultiModalPlaceholderMap: - """ - - class 
IndexMap(NamedTuple): -- src: List[int] -- dest: List[int] -+ src: list[int] -+ dest: list[int] - -- src_ranges: List[range] -+ src_ranges: list[range] - """ - The indices of the multi-modal embeddings that will replace the - corresponding placeholder embeddings pointed to by ``dest_ranges``. -@@ -268,7 +269,7 @@ class MultiModalPlaceholderMap: - The total number of flattened multi-modal embeddings. - """ - -- dest_ranges: List[range] -+ dest_ranges: list[range] - """ - The indices of the placeholder embeddings that will be replaced by the - multimodal embeddings. -@@ -288,7 +289,7 @@ class MultiModalPlaceholderMap: - @classmethod - def from_seq_group( - cls, seq_group: "SequenceGroupMetadata", positions: range -- ) -> Tuple[Optional[MultiModalDataDict], Dict[str, -+ ) -> Tuple[Optional[MultiModalDataDict], dict[str, - "MultiModalPlaceholderMap"]]: - """ - Returns the multi-modal items that intersect with the portion of a -@@ -296,35 +297,37 @@ class MultiModalPlaceholderMap: - ``MultiModalPlaceholderMap`` that relates the multi-modal embedding - vectors to their corresponding placeholders. - -- Consider the following scenarios: -+ Examples: - -- Prompt: |AAAA BBBB What's in these images?| -- Positions: |.................................| -+ .. code-block:: - -- images = [A, B] -- src_ranges = [(0, 4), (4, 8)] -- dest_ranges = [(0, 4), (5, 9)] -+ Prompt: |AAAA BBBB What's in these images?| -+ Positions: |.................................| - -- Prompt: |AAAA BBBB What's in these images?| -- Positions: | ..... | -+ images = [A, B] -+ src_ranges = [(0, 4), (4, 8)] -+ dest_ranges = [(0, 4), (5, 9)] - -- images = [A, B] -- src_ranges = [(2, 4), (4, 6)] -- dest_ranges = [(0, 2), (3, 5)] -+ Prompt: |AAAA BBBB What's in these images?| -+ Positions: | ..... | - -- Prompt: |AAAA BBBB What's in these images?| -- Positions: | ......... | -+ images = [A, B] -+ src_ranges = [(2, 4), (4, 6)] -+ dest_ranges = [(0, 2), (3, 5)] - -- images = [B] -- src_ranges = [(0, 4)] -- dest_ranges = [(0, 4)] -+ Prompt: |AAAA BBBB What's in these images?| -+ Positions: | ......... | - -- Prompt: |AAAA BBBB What's in these images?| -- Positions: | .......................| -+ images = [B] -+ src_ranges = [(0, 4)] -+ dest_ranges = [(0, 4)] - -- images = [] -- src_ranges = [] -- dest_ranges = [] -+ Prompt: |AAAA BBBB What's in these images?| -+ Positions: | .......................| -+ -+ images = [] -+ src_ranges = [] -+ dest_ranges = [] - """ - seq_mm_data = seq_group.multi_modal_data - seq_mm_placeholders = seq_group.multi_modal_placeholders -@@ -376,9 +379,9 @@ class MultiModalPlaceholderMap: - def append_items_from_seq_group( - self, - positions: range, -- multi_modal_items: List[_T], -+ multi_modal_items: list[_T], - multi_modal_placeholders: Sequence[PlaceholderRange], -- ) -> List[_T]: -+ ) -> list[_T]: - """ - Adds the multi-modal items that intersect ```positions`` to this - placeholder map and returns the intersecting items. 
-@@ -454,3 +457,22 @@ class MultiModalPlaceholderMap: - - return MultiModalPlaceholderMap.IndexMap(src=src_indices, - dest=dest_indices) ++ hidden_states: torch.Tensor, ++ w1, ++ w2, ++ gating_output: torch.Tensor, ++ topk: int, ++ global_num_experts, ++ expert_map, ++ renormalize: bool, ++ ) -> torch.Tensor: ++ """ ++ Args: ++ hidden_states: [*, hidden_size] ++ w1: [num_experts, intermediate_size * 2, hidden_size] ++ w2: [num_experts, hidden_size, intermediate_size] ++ gating_output: [*, num_experts] ++ """ ++ orig_shape = hidden_states.shape ++ hidden_size = hidden_states.shape[-1] ++ num_tokens = hidden_states.shape[:-1].numel() ++ num_experts = self.num_experts ++ intermediate_size = self.intermediate_size ++ qtype = self.qtype ++ ++ device = hidden_states.device ++ dtype = hidden_states.dtype ++ hidden_states = hidden_states.view(num_tokens, hidden_size) ++ gating_output = gating_output.view(num_tokens, global_num_experts) ++ # topk_weights, topk_indices = F.softmax(gating_output, dim=-1, dtype=torch.float).topk(topk, dim=-1) ++ # if renormalize: ++ # topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) ++ topk_indices, topk_weights = xe_addons.moe_softmax_topk(gating_output, topk, renormalize) ++ topk_weights = topk_weights.to(dtype) ++ if expert_map is not None: ++ expert_map = expert_map.to(device=device) ++ topk_indices = expert_map[topk_indices] ++ ++ topk_indices = topk_indices.flatten() ++ topk_argsort_indices = topk_indices.argsort() ++ topk_argsort_revert_indices = topk_argsort_indices.argsort() ++ token_indices = torch.arange(num_tokens, device=device).repeat_interleave(topk) ++ token_indices = token_indices[topk_argsort_indices] ++ group_sizes = custom_histogram(topk_indices.to(torch.int32), 0, num_experts - 1) ++ ++ x = hidden_states[token_indices] ++ ++ x = custom_gmm(x, w1, group_sizes, intermediate_size * 2) ++ # x = F.silu(x[..., :intermediate_size]) * x[..., intermediate_size:] ++ output = torch.zeros((x.shape[0], intermediate_size), device=x.device, dtype=x.dtype) ++ ipex_ops.silu_and_mul(output, x) ++ x = output ++ x = custom_gmm(x, w2, group_sizes, hidden_size) ++ x = x[topk_argsort_revert_indices].reshape(-1, topk, hidden_size) ++ ++ x = x * topk_weights.unsqueeze_(dim=-1) ++ x = x.sum(dim=-2) ++ x = x.reshape(orig_shape) ++ return x + ++def custom_histogram(indices, min, max): ++ bin_counts = torch.histc(indices, bins=max - min + 1, min=min, max=max).to(torch.int32) ++ return bin_counts + -+class MediaIO(ABC, Generic[_T]): ++def custom_gmm(x, w, group_sizes, out_len): ++ result = torch.zeros( ++ (x.shape[0], out_len), ++ dtype=x.dtype, ++ device=x.device ++ ) ++ start = 0 ++ i = 0 ++ ++ lowbit = os.getenv("IPEX_LLM_LOWBIT", "sym_int4") ++ qtype = ggml_tensor_qtype[lowbit] ++ for end_index in group_sizes.tolist(): ++ if end_index > 0: ++ end = start + end_index + -+ @abstractmethod -+ def load_bytes(self, data: bytes) -> _T: -+ raise NotImplementedError ++ # result[start:end] = torch.matmul(x[start:end], w[i]) ++ ++ # cur_x = x[start:end].contiguous() ++ # cur_w = xe_linear.dequant(cur_x, w[i].contiguous(), qtype) ++ # result[start:end] = torch.matmul(cur_x, cur_w.T) + -+ @abstractmethod -+ def load_base64(self, media_type: str, data: str) -> _T: -+ """ -+ List of media types: -+ https://www.iana.org/assignments/media-types/media-types.xhtml -+ """ -+ raise NotImplementedError ++ cur_x = x[start:end].contiguous() ++ cur_w = w[i] ++ # print(cur_x.shape, " ", cur_w.shape, " ", out_len) ++ cur_res = xe_linear.forward_new(cur_x, cur_w, qtype, out_len) 
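# Dense-weight reference for the expert-grouping trick behind the patch's
# `custom_histogram` / `custom_gmm`: sort the flattened top-k expert ids,
# count rows per expert, run one matmul per non-empty group, then restore
# the original ordering. The low-bit `xe_linear.forward_new` call above is
# replaced here by a plain matmul, so this is an illustration only.
import torch


def grouped_expert_matmul(x, expert_ids, expert_weights):
    # x: [tokens * topk, in_dim] (one row per selected expert per token)
    # expert_ids: [tokens * topk]; expert_weights: [E, out_dim, in_dim]
    order = expert_ids.argsort()
    revert = order.argsort()
    x_sorted = x[order]
    num_experts = expert_weights.shape[0]
    counts = torch.histc(expert_ids.to(torch.float32), bins=num_experts,
                         min=0, max=num_experts - 1).to(torch.int64)
    out = torch.empty(x.shape[0], expert_weights.shape[1],
                      dtype=x.dtype, device=x.device)
    start = 0
    for e, n in enumerate(counts.tolist()):
        if n:
            out[start:start + n] = x_sorted[start:start + n] @ expert_weights[e].T
        start += n
    return out[revert]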
++ # print(cur_res.shape) ++ result[start:end] = cur_res + -+ @abstractmethod -+ def load_file(self, filepath: Path) -> _T: -+ raise NotImplementedError -diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py -index c705e1a3d..da13a381c 100644 ---- a/vllm/multimodal/image.py -+++ b/vllm/multimodal/image.py -@@ -1,4 +1,7 @@ -+import base64 - from functools import lru_cache -+from io import BytesIO -+from pathlib import Path - from typing import TYPE_CHECKING, Any, Dict, Optional ++ start = end ++ i += 1 ++ return result +diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py +index 0e35d8a80..235f92bf8 100644 +--- a/vllm/model_executor/layers/fused_moe/layer.py ++++ b/vllm/model_executor/layers/fused_moe/layer.py +@@ -32,6 +32,10 @@ if current_platform.is_tpu(): + from .moe_torch_iterative import fused_moe as fused_moe_pallas + else: + fused_moe_pallas = None # type: ignore ++if current_platform.is_xpu(): ++ from .moe_pallas import fused_moe_xpu ++else: ++ fused_moe_xpu = None # type: ignore + logger = init_logger(__name__) - import torch -@@ -9,8 +12,8 @@ from vllm.logger import init_logger - from vllm.transformers_utils.processor import get_image_processor - from vllm.utils import is_list_of --from .base import MultiModalPlugin --from .inputs import ImageItem, MultiModalData, MultiModalKwargs -+from .base import MediaIO, MultiModalPlugin -+from .inputs import ImageItem, ModalityData, MultiModalKwargs +@@ -132,16 +136,15 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): + layer.w2_weight = torch.nn.Parameter(shuffled_w2, + requires_grad=False) - if TYPE_CHECKING: - from vllm.config import ModelConfig -@@ -41,7 +44,7 @@ class ImagePlugin(MultiModalPlugin): - def _default_input_mapper( +- if current_platform.is_cpu(): +- if current_platform.get_cpu_architecture() == CpuArchEnum.X86: +- import intel_extension_for_pytorch as ipex +- layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE( +- layer.w13_weight, +- layer.w2_weight, +- use_prepack=envs.VLLM_CPU_MOE_PREPACK, +- ) +- else: +- raise NotImplementedError("CPU MOE only supports x86 arch.") ++ if current_platform.is_xpu() or (current_platform.is_cpu() and current_platform.get_cpu_architecture() == CpuArchEnum.X86): ++ import intel_extension_for_pytorch as ipex ++ layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE( ++ layer.w13_weight, ++ layer.w2_weight, ++ use_prepack=envs.VLLM_CPU_MOE_PREPACK, ++ ) ++ else: ++ raise NotImplementedError("CPU MOE only supports x86 arch.") + + def apply( + self, +@@ -254,6 +257,42 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): + e_score_correction_bias, + ) + ++ def forward_xpu( ++ self, ++ layer: torch.nn.Module, ++ x: torch.Tensor, ++ use_grouped_topk: bool, ++ top_k: int, ++ router_logits: torch.Tensor, ++ renormalize: bool, ++ topk_group: Optional[int] = None, ++ num_expert_group: Optional[int] = None, ++ global_num_experts: int = -1, ++ expert_map: Optional[torch.Tensor] = None, ++ custom_routing_function: Optional[Callable] = None, ++ scoring_func: str = "softmax", ++ e_score_correction_bias: Optional[torch.Tensor] = None, ++ activation: str = "silu", ++ apply_router_weight_on_input: bool = False, ++ **kwargs, ++ ): ++ assert custom_routing_function is None ++ return layer.ipex_fusion( ++ x, ++ use_grouped_topk, ++ top_k, ++ router_logits, ++ renormalize, ++ topk_group, ++ num_expert_group, ++ ) ++ # return fused_moe_xpu(hidden_states=x, ++ # w1=layer.w13_weight, ++ # w2=layer.w2_weight, ++ # topk=top_k, ++ # 
gating_output=router_logits, ++ # renormalize=renormalize) ++ + def forward_hpu( self, - ctx: InputContext, -- data: MultiModalData[ImageItem], -+ data: ModalityData[ImageItem], - **mm_processor_kwargs, - ) -> MultiModalKwargs: - model_config = ctx.model_config -@@ -96,3 +99,39 @@ def rescale_image_size(image: Image.Image, - if transpose >= 0: - image = image.transpose(Image.Transpose(transpose)) - return image -+ -+ -+class ImageMediaIO(MediaIO[Image.Image]): -+ -+ def __init__(self, *, image_mode: str = "RGB") -> None: -+ super().__init__() -+ -+ self.image_mode = image_mode -+ -+ def load_bytes(self, data: bytes) -> Image.Image: -+ image = Image.open(BytesIO(data)) -+ image.load() -+ return image.convert(self.image_mode) -+ -+ def load_base64(self, media_type: str, data: str) -> Image.Image: -+ return self.load_bytes(base64.b64decode(data)) -+ -+ def load_file(self, filepath: Path) -> Image.Image: -+ image = Image.open(filepath) -+ image.load() -+ return image.convert(self.image_mode) -+ -+ def encode_base64( -+ self, -+ media: Image.Image, -+ *, -+ image_format: str = "JPEG", -+ ) -> str: -+ image = media -+ -+ with BytesIO() as buffer: -+ image = image.convert(self.image_mode) -+ image.save(buffer, image_format) -+ data = buffer.getvalue() -+ -+ return base64.b64encode(data).decode('utf-8') -diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py -index 9ecae2c1c..b0a110454 100644 ---- a/vllm/multimodal/inputs.py -+++ b/vllm/multimodal/inputs.py -@@ -1,50 +1,76 @@ -+from abc import ABC, abstractmethod - from collections import UserDict, defaultdict --from typing import (Any, Dict, List, Literal, Mapping, Sequence, Tuple, -- TypedDict, TypeVar, Union, cast, final) -+from collections.abc import Mapping, Sequence -+from dataclasses import dataclass -+from typing import (Any, Literal, Optional, TypedDict, TypeVar, Union, cast, -+ final) + layer: torch.nn.Module, +@@ -495,8 +534,15 @@ class FusedMoE(torch.nn.Module): + # Note: get_quant_method will look at the layer's local_num_experts + # for heuristic purposes, so it must be initialized first. 
+ if quant_config is None: +- self.quant_method: Optional[QuantizeMethodBase] = ( +- UnquantizedFusedMoEMethod()) ++ import os ++ lowbit = os.getenv("IPEX_LLM_LOWBIT", None) ++ if lowbit is not None: ++ from vllm.model_executor.layers.fused_moe.ipex_llm_moe import IPEXLLMFusedMoEMethod ++ self.quant_method: Optional[QuantizeMethodBase] = ( ++ IPEXLLMFusedMoEMethod()) ++ else: ++ self.quant_method: Optional[QuantizeMethodBase] = ( ++ UnquantizedFusedMoEMethod()) + else: + self.quant_method = quant_config.get_quant_method(self, prefix) + assert self.quant_method is not None +diff --git a/vllm/model_executor/layers/fused_moe/moe_pallas.py b/vllm/model_executor/layers/fused_moe/moe_pallas.py +index 0365afa10..1e30b61d6 100644 +--- a/vllm/model_executor/layers/fused_moe/moe_pallas.py ++++ b/vllm/model_executor/layers/fused_moe/moe_pallas.py +@@ -2,10 +2,68 @@ - import numpy as np import torch - import torch.types - from PIL.Image import Image -+from transformers import BatchFeature - from typing_extensions import NotRequired, TypeAlias - --from vllm.utils import JSONTree, is_list_of, json_map_leaves -+from vllm.utils import JSONTree, full_groupby, is_list_of, json_map_leaves - - _T = TypeVar("_T") - --# yapf: disable --ImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor] -+HfImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor] - """ - A :class:`transformers.image_utils.ImageInput` representing a single image - item, which can be passed to a HuggingFace :code:`ImageProcessor`. - """ - --VideoItem: TypeAlias = Union[ -- list[Image], -- np.ndarray, -- torch.Tensor, -- list[np.ndarray], -- list[torch.Tensor], --] -+HfVideoItem: TypeAlias = Union[list[Image], np.ndarray, torch.Tensor, -+ list[np.ndarray], list[torch.Tensor]] - """ - A :class:`transformers.image_utils.VideoInput` representing a single video - item, which can be passed to a HuggingFace :code:`VideoProcessor`. - """ + import torch.nn.functional as F +-from torch_xla.experimental.custom_kernel import _histogram ++# from torch_xla.experimental.custom_kernel import _histogram --AudioItem: TypeAlias = Union[ -- np.ndarray, -- list[float], -- # `(audio, sampling_rate)`: If the audio's sampling rate is different -- # from that expected by the model, we need to resample it. -- tuple[np.ndarray, float], --] -+HfAudioItem: TypeAlias = Union[list[float], np.ndarray, torch.Tensor] - """ - Represents a single audio - item, which can be passed to a HuggingFace :code:`AudioProcessor`. - """ --# yapf: enable --MultiModalData: TypeAlias = Union[_T, List[_T]] -+ImageItem: TypeAlias = Union[HfImageItem, torch.Tensor] -+""" -+A :class:`transformers.image_utils.ImageInput` representing a single image -+item, which can be passed to a HuggingFace :code:`ImageProcessor`. -+ -+Alternatively, a 3-D tensor or batch of 2-D tensors, -+which are treated as image embeddings; -+these are directly passed to the model without HF processing. 
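# Minimal sketch of the method-selection pattern the patch adds to
# FusedMoE.__init__: with no quantization config, the low-bit
# IPEXLLMFusedMoEMethod is chosen whenever IPEX_LLM_LOWBIT is set
# (e.g. "sym_int4"), otherwise the stock UnquantizedFusedMoEMethod.
# The helper name below is illustrative, not a patch API.
import os


def pick_unquantized_moe_method(unquantized_cls, ipex_llm_cls):
    # Mirrors only the `quant_config is None` branch; quantized configs keep
    # the upstream quant_config.get_quant_method(...) path.
    if os.getenv("IPEX_LLM_LOWBIT") is not None:
        return ipex_llm_cls()
    return unquantized_cls()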
-+""" +-def fused_moe( ++# def fused_moe( ++# hidden_states: torch.Tensor, ++# w1: torch.Tensor, ++# w2: torch.Tensor, ++# gating_output: torch.Tensor, ++# topk: int, ++# renormalize: bool, ++# ) -> torch.Tensor: ++# """ ++# Args: ++# hidden_states: [*, hidden_size] ++# w1: [num_experts, intermediate_size * 2, hidden_size] ++# w2: [num_experts, hidden_size, intermediate_size] ++# gating_output: [*, num_experts] ++# """ ++# orig_shape = hidden_states.shape ++# hidden_size = hidden_states.shape[-1] ++# num_tokens = hidden_states.shape[:-1].numel() ++# num_experts = w1.shape[0] ++# intermediate_size = w2.shape[-1] ++# device = hidden_states.device ++# dtype = hidden_states.dtype ++# assert (num_tokens * topk) % 16 == 0, ( ++# "The Pallas GMM kernel requires num_tokens * topk to be a multiple of " ++# f"16 but got {num_tokens * topk}") + -+VideoItem: TypeAlias = Union[HfVideoItem, torch.Tensor] -+""" -+A :class:`transformers.image_utils.VideoInput` representing a single video -+item, which can be passed to a HuggingFace :code:`VideoProcessor`. ++# hidden_states = hidden_states.view(num_tokens, hidden_size) ++# gating_output = gating_output.view(num_tokens, num_experts) ++# topk_weights = gating_output.softmax(dim=-1, dtype=torch.float) ++# topk_weights, topk_indices = topk_weights.topk(topk, dim=-1) ++# if renormalize: ++# topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) ++# topk_weights = topk_weights.to(dtype) + -+Alternatively, a 3-D tensor or batch of 2-D tensors, -+which are treated as video embeddings; -+these are directly passed to the model without HF processing. -+""" ++# topk_indices = topk_indices.flatten() ++# topk_argsort_indices = topk_indices.argsort() ++# topk_argsort_revert_indices = topk_argsort_indices.argsort() ++# token_indices = torch.arange(num_tokens, ++# device=device).repeat_interleave(topk) ++# token_indices = token_indices[topk_argsort_indices] ++# group_sizes = _histogram(topk_indices.to(torch.int32), 0, num_experts - 1) + -+AudioItem: TypeAlias = Union[HfAudioItem, tuple[np.ndarray, float], -+ torch.Tensor] -+""" -+Represents a single audio -+item, which can be passed to a HuggingFace :code:`AudioProcessor`. ++# # NOTE(woosuk): The GMM Pallas kernel requires a different weight layout ++# # from HF Transformers. ++# w1 = w1.transpose(1, 2) ++# w2 = w2.transpose(1, 2) + -+Alternatively, a tuple `(audio, sampling_rate)`, where the sampling rate -+is different from that expected by the model; -+these are resampled to the model's sampling rate before being processed by HF. ++# x = hidden_states[token_indices] ++# x = torch.ops.xla.gmm(x, w1, group_sizes) ++# x = F.silu(x[..., :intermediate_size]) * x[..., intermediate_size:] ++# x = torch.ops.xla.gmm(x, w2, group_sizes) ++# x = x[topk_argsort_revert_indices].reshape(-1, topk, hidden_size) + -+Alternatively, a 3-D tensor or batch of 2-D tensors, -+which are treated as audio embeddings; -+these are directly passed to the model without HF processing. -+""" ++# x = x * topk_weights.unsqueeze_(dim=-1) ++# x = x.sum(dim=-2) ++# x = x.reshape(orig_shape) ++# return x + -+ModalityData: TypeAlias = Union[_T, list[_T]] - """ - Either a single data item, or a list of data items. 
- -@@ -57,17 +83,17 @@ The number of data items allowed per modality is restricted by - class MultiModalDataBuiltins(TypedDict, total=False): - """Type annotations for modality types predefined by vLLM.""" - -- image: MultiModalData[ImageItem] -+ image: ModalityData[ImageItem] - """The input image(s).""" - -- video: MultiModalData[VideoItem] -+ video: ModalityData[VideoItem] - """The input video(s).""" - -- audio: MultiModalData[AudioItem] -+ audio: ModalityData[AudioItem] - """The input audio(s).""" - - --MultiModalDataDict: TypeAlias = Mapping[str, MultiModalData[Any]] -+MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]] - """ - A dictionary containing an entry for each modality type to input. ++def fused_moe_xpu( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, +@@ -27,14 +85,9 @@ def fused_moe( + intermediate_size = w2.shape[-1] + device = hidden_states.device + dtype = hidden_states.dtype +- assert (num_tokens * topk) % 16 == 0, ( +- "The Pallas GMM kernel requires num_tokens * topk to be a multiple of " +- f"16 but got {num_tokens * topk}") +- + hidden_states = hidden_states.view(num_tokens, hidden_size) + gating_output = gating_output.view(num_tokens, num_experts) +- topk_weights = gating_output.softmax(dim=-1, dtype=torch.float) +- topk_weights, topk_indices = topk_weights.topk(topk, dim=-1) ++ topk_weights, topk_indices = F.softmax(gating_output, dim=-1, dtype=torch.float).topk(topk, dim=-1) + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + topk_weights = topk_weights.to(dtype) +@@ -42,23 +95,40 @@ def fused_moe( + topk_indices = topk_indices.flatten() + topk_argsort_indices = topk_indices.argsort() + topk_argsort_revert_indices = topk_argsort_indices.argsort() +- token_indices = torch.arange(num_tokens, +- device=device).repeat_interleave(topk) ++ token_indices = torch.arange(num_tokens, device=device).repeat_interleave(topk) + token_indices = token_indices[topk_argsort_indices] +- group_sizes = _histogram(topk_indices.to(torch.int32), 0, num_experts - 1) ++ group_sizes = custom_histogram(topk_indices.to(torch.int32), 0, num_experts - 1) -@@ -83,9 +109,14 @@ class PlaceholderRange(TypedDict): - """ - Placeholder location information for multi-modal data. +- # NOTE(woosuk): The GMM Pallas kernel requires a different weight layout +- # from HF Transformers. + w1 = w1.transpose(1, 2) + w2 = w2.transpose(1, 2) +- + x = hidden_states[token_indices] +- x = torch.ops.xla.gmm(x, w1, group_sizes) ++ x = custom_gmm(x, w1, group_sizes) + x = F.silu(x[..., :intermediate_size]) * x[..., intermediate_size:] +- x = torch.ops.xla.gmm(x, w2, group_sizes) ++ x = custom_gmm(x, w2, group_sizes) + x = x[topk_argsort_revert_indices].reshape(-1, topk, hidden_size) -- For example: -- Prompt: AAAA BBBB What is in these images? -+ Example: -+ -+ Prompt: :code:`AAAA BBBB What is in these images?` + x = x * topk_weights.unsqueeze_(dim=-1) + x = x.sum(dim=-2) + x = x.reshape(orig_shape) + return x + - Images A and B will have: ++def custom_histogram(indices, min, max): ++ bin_counts = torch.histc(indices, bins=max - min + 1, min=min, max=max).to(torch.int32) ++ return bin_counts + -+ .. 
code-block:: + - A: { "offset": 0, "length": 4 } - B: { "offset": 5, "length": 4 } - """ -@@ -97,25 +128,249 @@ class PlaceholderRange(TypedDict): - """The length of the placeholder.""" - - --NestedTensors = Union[List["NestedTensors"], List[torch.Tensor], torch.Tensor, -- Tuple[torch.Tensor, ...]] -+NestedTensors = Union[list["NestedTensors"], list[torch.Tensor], torch.Tensor, -+ tuple[torch.Tensor, ...]] - """ - Uses a list instead of a tensor if the dimensions of each element do not match. - """ ++def custom_gmm(x, w, group_sizes): ++ result = torch.zeros( ++ (x.shape[0], w.shape[-1]), ++ dtype=x.dtype, ++ device=x.device ++ ) ++ start = 0 ++ i = 0 ++ for end_index in group_sizes.tolist(): ++ if end_index > 0: ++ end = start + end_index ++ result[start:end] = torch.matmul(x[start:end], w[i]) ++ start = end ++ i += 1 ++ return result +diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py +index 5e8eb6c54..4b31fc8e3 100644 +--- a/vllm/model_executor/layers/layernorm.py ++++ b/vllm/model_executor/layers/layernorm.py +@@ -200,11 +200,14 @@ class RMSNorm(CustomOp): + self.variance_epsilon, + ) + return x, residual +- return ops.rms_norm( ++ out = torch.empty_like(x) ++ ops.rms_norm( ++ out, + x, + self.weight.data, + self.variance_epsilon, + ) ++ return out --BatchedTensorInputs: TypeAlias = Dict[str, NestedTensors] -+ -+def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool: -+ """Equality check between :data:`NestedTensors` objects.""" -+ if isinstance(a, torch.Tensor): -+ return isinstance(b, torch.Tensor) and bool((a == b).all().item()) -+ elif isinstance(b, torch.Tensor): -+ return isinstance(a, torch.Tensor) and bool((b == a).all().item()) -+ -+ if isinstance(a, list): -+ return (isinstance(b, list) -+ and all(nested_tensors_equal(a_, b_) for a_, b_ in zip(a, b))) -+ if isinstance(b, list): -+ return (isinstance(a, list) -+ and all(nested_tensors_equal(b_, a_) for b_, a_ in zip(b, a))) -+ -+ # Both a and b are scalars -+ return a == b -+ -+ -+BatchedTensorInputs: TypeAlias = Mapping[str, NestedTensors] - """ - A dictionary containing nested tensors which have been batched via - :meth:`MultiModalKwargs.batch`. 
- """ + def extra_repr(self) -> str: + s = f"hidden_size={self.weight.data.size(0)}" +diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py +index 1ae574072..02ae72be5 100644 +--- a/vllm/model_executor/layers/linear.py ++++ b/vllm/model_executor/layers/linear.py +@@ -179,8 +179,8 @@ class UnquantizedLinearMethod(LinearMethodBase): + input_size_per_partition, + dtype=params_dtype), + requires_grad=False) +- set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) + layer.register_parameter("weight", weight) ++ set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) + set_weight_attrs(weight, extra_weight_attrs) + def apply(self, +diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py +index 1c8d6cb1e..5191ae702 100644 +--- a/vllm/model_executor/layers/quantization/gptq.py ++++ b/vllm/model_executor/layers/quantization/gptq.py +@@ -249,6 +249,7 @@ class GPTQLinearMethod(LinearMethodBase): -+@dataclass(frozen=True) -+class MultiModalFieldElem: -+ """Contains metadata and data of an item in :class:`MultiModalKwargs`.""" -+ field: "BaseMultiModalField" -+ data: NestedTensors -+ -+ def __eq__(self, other: object) -> bool: -+ if not isinstance(other, self.__class__): -+ return False -+ -+ return (self.field == other.field -+ and nested_tensors_equal(self.data, other.data)) -+ -+ -+@dataclass(frozen=True) -+class BaseMultiModalField(ABC): -+ """Abstract base class for a field in :class:`MultiModalKwargs`.""" -+ key: str -+ modality: str -+ -+ @abstractmethod -+ def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: -+ raise NotImplementedError -+ -+ def _build_elem(self, data: NestedTensors) -> MultiModalFieldElem: -+ return MultiModalFieldElem(self, data) -+ -+ def reduce(self, batch: list[MultiModalFieldElem]) -> MultiModalFieldElem: -+ """Merge multiple instances of :class:`MultiModalFieldElem` together.""" -+ fields = [item.field for item in batch] -+ if len(set(fields)) > 1: -+ raise ValueError(f"Cannot merge different {fields=}") -+ -+ data = self._reduce_data([item.data for item in batch]) -+ -+ return self._build_elem(data) -+ -+ -+@dataclass(frozen=True) -+class MultiModalBatchedField(BaseMultiModalField): -+ """ -+ A :class:`BaseMultiModalField` implementation where an element in the batch -+ is obtained by indexing into the first dimension of the underlying data. -+ """ -+ -+ def build_elems(self, batch: NestedTensors) -> list[MultiModalFieldElem]: -+ return [self._build_elem(item) for item in batch] -+ -+ def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: -+ if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"): -+ first_shape = batch[0].shape -+ if all(elem.shape == first_shape for elem in batch): -+ return torch.stack(batch) -+ -+ return batch -+ -+ -+@dataclass(frozen=True) -+class MultiModalFlatField(BaseMultiModalField): -+ """ -+ A :class:`BaseMultiModalField` implementation where an element in the batch -+ is obtained by slicing along the first dimension of the underlying data. 
-+ """ -+ -+ def build_elems( -+ self, -+ batch: NestedTensors, -+ slices: Sequence[slice], -+ ) -> list[MultiModalFieldElem]: -+ return [self._build_elem(batch[slice_]) for slice_ in slices] -+ -+ def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: -+ if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"): -+ first_shape = batch[0].shape -+ if all(elem.shape[1:] == first_shape[1:] for elem in batch): -+ return torch.concat(batch) -+ -+ return [e for elem in batch for e in elem] -+ -+ -+class MultiModalFieldConfig: -+ -+ @staticmethod -+ def batched(modality: str): -+ return MultiModalFieldConfig( -+ field_cls=MultiModalBatchedField, -+ modality=modality, -+ ) -+ -+ @staticmethod -+ def flat(modality: str, slices: Sequence[slice]): -+ return MultiModalFieldConfig( -+ field_cls=MultiModalFlatField, -+ modality=modality, -+ slices=slices, -+ ) -+ -+ def __init__( -+ self, -+ field_cls: type[BaseMultiModalField], -+ modality: str, -+ **field_config: Any, -+ ) -> None: -+ super().__init__() -+ -+ self.field_cls = field_cls -+ self.modality = modality -+ self.field_config = field_config -+ -+ def build_elems( -+ self, -+ key: str, -+ batch: NestedTensors, -+ ) -> Sequence[MultiModalFieldElem]: -+ field = self.field_cls(key=key, modality=self.modality) -+ return field.build_elems(batch, **self.field_config) # type: ignore -+ -+ -+class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): -+ """ -+ A collection of :class:`MultiModalFieldElem` -+ corresponding to a data item in :class:`MultiModalDataItems`. -+ """ -+ -+ @staticmethod -+ def from_elems(elems: Sequence[MultiModalFieldElem]): -+ return MultiModalKwargsItem({elem.field.key: elem for elem in elems}) -+ -+ @property -+ def modality(self) -> str: -+ modalities = {elem.field.modality for elem in self.data.values()} -+ assert len(modalities) == 1, f"Found different modalities={modalities}" -+ return next(iter(modalities)) -+ -+ -+# NOTE: UserDict is for V0 compatibility. -+# V1 should access individual items via `get_item`. - class MultiModalKwargs(UserDict[str, NestedTensors]): - """ - A dictionary that represents the keyword arguments to - :meth:`~torch.nn.Module.forward`. -+ -+ The metadata :code:`items` enables us to obtain the keyword arguments -+ corresponding to each data item in :class:`MultiModalDataItems`, via -+ :meth:`get_item` and :meth:`get_items`. 
- """ + # exllama needs to shuffle the weight after the weight is loaded + # here we do the shuffle on first forward pass ++ ''' + if layer.exllama_state == ExllamaState.UNINITIALIZED: + if self.quant_config.desc_act: + layer.g_idx.data = torch.argsort(layer.g_idx).to(torch.int) +@@ -259,7 +260,7 @@ class GPTQLinearMethod(LinearMethodBase): + layer.exllama_state = ExllamaState.READY + ops.gptq_shuffle(layer.qweight, layer.g_idx, + self.quant_config.weight_bits) +- ++ ''' + def apply(self, + layer: torch.nn.Module, + x: torch.Tensor, +diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py +index c09cc13cb..65b58af2f 100644 +--- a/vllm/model_executor/layers/quantization/ipex_quant.py ++++ b/vllm/model_executor/layers/quantization/ipex_quant.py +@@ -99,13 +99,14 @@ class IPEXConfig(QuantizationConfig): + @classmethod + def override_quantization_method(cls, hf_quant_cfg, + user_quant) -> Optional[str]: +- if not current_platform.is_cpu() and not current_platform.is_xpu(): +- return None ++ # not use IPEXConfig ++ # if not current_platform.is_cpu() and not current_platform.is_xpu(): ++ # return None -+ @staticmethod -+ def from_hf_inputs( -+ hf_inputs: BatchFeature, -+ config_by_key: Mapping[str, MultiModalFieldConfig], -+ ): -+ # NOTE: This skips fields in `hf_inputs` that are not in `config_by_key` -+ # We assume that those fields are not used in vLLM -+ elems_by_key = dict[str, Sequence[MultiModalFieldElem]]() -+ keys_by_modality = defaultdict[str, set[str]](set) -+ for key, config in config_by_key.items(): -+ batch = hf_inputs.get(key) -+ if batch is not None: -+ elems = config.build_elems(key, batch) -+ if len(elems) > 0: -+ elems_by_key[key] = elems -+ keys_by_modality[config.modality].add(key) -+ -+ items = list[MultiModalKwargsItem]() -+ for modality, keys in keys_by_modality.items(): -+ elems_in_modality = {k: elems_by_key[k] for k in keys} -+ batch_sizes = {k: len(v) for k, v in elems_in_modality.items()} -+ -+ if len(set(batch_sizes.values())) > 1: -+ raise ValueError( -+ f"Cannot merge different batch sizes for {modality=}! " -+ f"Found: {batch_sizes=}") -+ -+ batch_size = next(iter(batch_sizes.values())) -+ for item_idx in range(batch_size): -+ elems = [v[item_idx] for v in elems_in_modality.values()] -+ items.append(MultiModalKwargsItem.from_elems(elems)) -+ -+ return MultiModalKwargs.from_items(items) -+ -+ @staticmethod -+ def from_items(items: Sequence[MultiModalKwargsItem]): -+ """Construct a new :class:`MultiModalKwargs` from multiple items.""" -+ elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list) -+ for item in items: -+ for key, elem in item.items(): -+ elems_by_key[key].append(elem) -+ -+ data = { -+ key: elems[0].field.reduce(elems).data -+ for key, elems in elems_by_key.items() if len(elems) > 0 -+ } -+ -+ return MultiModalKwargs(data, items=items) -+ -+ def __init__( -+ self, -+ data: Mapping[str, NestedTensors], -+ *, -+ items: Optional[Sequence[MultiModalKwargsItem]] = None, -+ ) -> None: -+ super().__init__(data) -+ -+ items_by_modality = full_groupby(items or [], key=lambda x: x.modality) -+ self._items_by_modality = dict(items_by_modality) -+ -+ @property -+ def modalities(self): -+ return self._items_by_modality.keys() -+ - @staticmethod - def _try_stack(nested_tensors: NestedTensors) -> NestedTensors: - """ -@@ -139,7 +394,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): - # Only tensors (not lists) can be stacked. 
- return stacked +- quant_method = hf_quant_cfg.get("quant_method", "").lower() ++ # quant_method = hf_quant_cfg.get("quant_method", "").lower() + +- if quant_method in ["awq", "gptq"]: +- return cls.get_name() ++ # if quant_method in ["awq", "gptq"]: ++ # return cls.get_name() -- tensors_ = cast(List[torch.Tensor], stacked) -+ tensors_ = cast(list[torch.Tensor], stacked) - if any(t.shape != tensors_[0].shape for t in tensors_): - # The tensors have incompatible shapes and can't be stacked. - return tensors_ -@@ -147,7 +402,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): - return torch.stack(tensors_) + return None - @staticmethod -- def batch(inputs_list: List["MultiModalKwargs"]) -> BatchedTensorInputs: -+ def batch(inputs_list: list["MultiModalKwargs"]) -> BatchedTensorInputs: +diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py +index 624ed63ab..f826db908 100644 +--- a/vllm/model_executor/layers/rotary_embedding.py ++++ b/vllm/model_executor/layers/rotary_embedding.py +@@ -956,37 +956,39 @@ class MRotaryEmbedding(RotaryEmbedding): """ - Batch multiple inputs together into a dictionary. + assert positions.ndim == 1 or positions.ndim == 2 -@@ -162,7 +417,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): +- num_tokens = positions.shape[-1] +- cos_sin = self.cos_sin_cache[positions] +- cos, sin = cos_sin.chunk(2, dim=-1) +- if positions.ndim == 2: +- assert self.mrope_section +- +- cos = torch.cat([ +- m[i] +- for i, m in enumerate(cos.split(self.mrope_section, dim=-1)) +- ], +- dim=-1) +- sin = torch.cat([ +- m[i] +- for i, m in enumerate(sin.split(self.mrope_section, dim=-1)) +- ], +- dim=-1) +- +- query_shape = query.shape +- query = query.view(num_tokens, -1, self.head_size) +- query_rot = query[..., :self.rotary_dim] +- query_pass = query[..., self.rotary_dim:] +- query_rot = _apply_rotary_emb(query_rot, cos, sin, self.is_neox_style) +- query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) +- +- key_shape = key.shape +- key = key.view(num_tokens, -1, self.head_size) +- key_rot = key[..., :self.rotary_dim] +- key_pass = key[..., self.rotary_dim:] +- key_rot = _apply_rotary_emb(key_rot, cos, sin, self.is_neox_style) +- key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) +- return query, key ++ return self.forward_xpu(positions, query, key) ++ ++ # num_tokens = positions.shape[-1] ++ # cos_sin = self.cos_sin_cache[positions] ++ # cos, sin = cos_sin.chunk(2, dim=-1) ++ # if positions.ndim == 2: ++ # assert self.mrope_section ++ ++ # cos = torch.cat([ ++ # m[i] ++ # for i, m in enumerate(cos.split(self.mrope_section, dim=-1)) ++ # ], ++ # dim=-1) ++ # sin = torch.cat([ ++ # m[i] ++ # for i, m in enumerate(sin.split(self.mrope_section, dim=-1)) ++ # ], ++ # dim=-1) ++ ++ # query_shape = query.shape ++ # query = query.view(num_tokens, -1, self.head_size) ++ # query_rot = query[..., :self.rotary_dim] ++ # query_pass = query[..., self.rotary_dim:] ++ # query_rot = _apply_rotary_emb(query_rot, cos, sin, self.is_neox_style) ++ # query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) ++ ++ # key_shape = key.shape ++ # key = key.view(num_tokens, -1, self.head_size) ++ # key_rot = key[..., :self.rotary_dim] ++ # key_pass = key[..., self.rotary_dim:] ++ # key_rot = _apply_rotary_emb(key_rot, cos, sin, self.is_neox_style) ++ # key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) ++ # return query, key - # We need to consider the case where each item in the batch - # 
contains different modalities (i.e. different keys). -- item_lists: Dict[str, List[NestedTensors]] = defaultdict(list) -+ item_lists = defaultdict[str, list[NestedTensors]](list) + @staticmethod + def get_input_positions( +diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py +index 1eb0c8c2e..27d246069 100644 +--- a/vllm/model_executor/layers/vocab_parallel_embedding.py ++++ b/vllm/model_executor/layers/vocab_parallel_embedding.py +@@ -32,8 +32,8 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase): + input_size_per_partition, + dtype=params_dtype), + requires_grad=False) +- set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) + layer.register_parameter("weight", weight) ++ set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) + set_weight_attrs(weight, extra_weight_attrs) - for inputs in inputs_list: - for k, v in inputs.items(): -@@ -188,6 +443,48 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): + def apply(self, +diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py +index 5649cf2dd..66e30984e 100644 +--- a/vllm/model_executor/model_loader/loader.py ++++ b/vllm/model_executor/model_loader/loader.py +@@ -1481,8 +1481,41 @@ class RunaiModelStreamerLoader(BaseModelLoader): + return model.eval() - return cast(BatchedTensorInputs, json_mapped) -+ def __eq__(self, other: object) -> bool: -+ if not isinstance(other, self.__class__): -+ return False -+ if self._items_by_modality != other._items_by_modality: -+ return False ++class IPEXLLMLowBitLoader(BaseModelLoader): ++ def __init__(self, load_config: LoadConfig): ++ super().__init__(load_config) ++ logger.info("IPEXLLMLowBitLoader get selected. Ensure your model is converted before.") ++ if load_config.model_loader_extra_config: ++ raise ValueError(f"Model loader extra config is not supported for " ++ f"load format {load_config.load_format}") + -+ ks = self.keys() -+ return (ks == other.keys() -+ and all(nested_tensors_equal(self[k], other[k]) for k in ks)) ++ def download_model(self, model_config: ModelConfig) -> None: ++ """Download a model so that it can be immediately loaded.""" ++ raise ValueError(f"IPEXLLMLowBitLoader does not support " ++ f"download_model api.") + -+ def _validate_modality(self, method_name: str, modality: str) -> None: -+ if not self._items_by_modality: -+ raise RuntimeError( -+ f"`{method_name}` is not supported when " -+ "MultiModalKwargs is not initialized with `items`") ++ def load_model(self, vllm_config: VllmConfig) -> nn.Module: ++ model_config = vllm_config.model_config + -+ if modality not in self._items_by_modality: -+ available_modalities = set(self._items_by_modality.keys()) -+ raise KeyError(f"Modality {modality!r} not found. 
" -+ f"Available modalities: {available_modalities}") ++ from ipex_llm.optimize import low_memory_init, load_low_bit ++ with set_default_torch_dtype(model_config.dtype): ++ # Initialize an empty skeleton of the model ++ with low_memory_init(): ++ model = _initialize_model(vllm_config=vllm_config) ++ # Load the real weights from the config ++ local_rank = os.environ["LOCAL_RANK"] ++ load_path = os.path.join(model_config.low_bit_model_path, ++ str(local_rank)) ++ model = load_low_bit(model, load_path) ++ return model + -+ def get_item_count(self, modality: str) -> int: -+ """Get the number of items belonging to a modality.""" -+ self._validate_modality("get_item_count", modality) -+ return len(self._items_by_modality[modality]) + -+ def get_item(self, modality: str, item_index: int) -> MultiModalKwargsItem: -+ """ -+ Get the keyword arguments corresponding to an item identified by -+ its modality and index. -+ """ -+ self._validate_modality("get_item", modality) -+ return self._items_by_modality[modality][item_index] + def get_model_loader(load_config: LoadConfig) -> BaseModelLoader: + """Get a model loader based on the load format.""" + -+ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]: -+ """ -+ Get the keyword arguments corresponding to each item belonging to -+ a modality. -+ """ -+ self._validate_modality("get_items", modality) -+ return self._items_by_modality[modality] ++ if load_config.use_low_bit_loader: ++ return IPEXLLMLowBitLoader(load_config) + + if isinstance(load_config.load_format, type): + return load_config.load_format(load_config) + +diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py +index 1b1738f88..2c2ed67b9 100644 +--- a/vllm/model_executor/models/chatglm.py ++++ b/vllm/model_executor/models/chatglm.py +@@ -130,12 +130,14 @@ class GLMMLP(nn.Module): + def __init__( + self, + config: ChatGLMConfig, ++ layer, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + + self.add_bias = config.add_bias_linear ++ self.layer = layer + + # Project to 4h. + self.dense_h_to_4h = MergedColumnParallelLinear( +@@ -160,7 +162,14 @@ class GLMMLP(nn.Module): + def forward(self, hidden_states): + # [s, b, 4hp] + intermediate_parallel, _ = self.dense_h_to_4h(hidden_states) +- intermediate_parallel = self.activation_func(intermediate_parallel) ++ # IPEX-LLM changes start: workaround fp16 overflow ++ if self.layer >= 38 and intermediate_parallel.device.type == "xpu": ++ d = intermediate_parallel.shape[-1] // 2 ++ intermediate_parallel[..., d:] /= 10 ++ intermediate_parallel = self.activation_func(intermediate_parallel) ++ else: ++ intermediate_parallel = self.activation_func(intermediate_parallel) ++ # IPEX-LLM changes end. 
+ # [s, b, h] + output, _ = self.dense_4h_to_h(intermediate_parallel) + return output +@@ -176,6 +185,7 @@ class GLMBlock(nn.Module): + def __init__( + self, + config: ChatGLMConfig, ++ layer, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", +@@ -203,7 +213,9 @@ class GLMBlock(nn.Module): + config.hidden_size, eps=config.layernorm_epsilon) - MultiModalPlaceholderDict = Mapping[str, Sequence[PlaceholderRange]] - """ -@@ -207,16 +504,16 @@ class MultiModalInputsV2(TypedDict): - prompt: str - """The processed prompt text.""" + # MLP +- self.mlp = GLMMLP(config, quant_config, prefix=f"{prefix}.mlp") ++ self.mlp = GLMMLP(config, layer, quant_config, prefix=f"{prefix}.mlp") ++ ++ self.layer = layer -- prompt_token_ids: List[int] -+ prompt_token_ids: list[int] - """The processed token IDs which includes placeholder tokens.""" + def forward( + self, +@@ -235,8 +247,13 @@ class GLMBlock(nn.Module): + residual = layernorm_output + else: + residual = layernorm_input +- +- output = self.mlp(layernorm_output) + residual ++ # IPEX-LLM changes start: workaround fp16 overflow ++ if self.layer >= 38 and layernorm_output.device.type == "xpu": ++ output = self.mlp(layernorm_output) * 10 + residual ++ output = torch.nan_to_num(output) ++ else: ++ output = self.mlp(layernorm_output) + residual ++ # ipex-llm changes end -- token_type_ids: NotRequired[List[int]] -+ token_type_ids: NotRequired[list[int]] - """The token type IDs of the prompt.""" + return output - mm_kwargs: MultiModalKwargs - """Keyword arguments to be directly passed to the model after batching.""" +@@ -258,12 +275,15 @@ class GLMTransformer(nn.Module): + self.num_layers = config.num_layers -- mm_hashes: NotRequired[List[str]] -+ mm_hashes: NotRequired[list[str]] - """The hashes of the multi-modal data.""" + # Transformer layers. +- self.start_layer, self.end_layer, self.layers = make_layers( +- self.num_layers, +- lambda prefix: GLMBlock( +- config, cache_config, quant_config, prefix=prefix), +- prefix=f"{prefix}.layers", +- ) ++ # Not sure if pp is available now ++ from vllm.distributed.utils import get_pp_indices ++ self.start_layer, self.end_layer = get_pp_indices(self.num_layers, ++ get_pp_group().rank_in_group, ++ get_pp_group().world_size) ++ self.layers = nn.ModuleList([ ++ GLMBlock(config, i, cache_config, quant_config, prefix=f"{prefix}.layers.{i}") ++ for i in range(self.start_layer, self.end_layer) ++ ]) - mm_placeholders: MultiModalPlaceholderDict -diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py + if self.post_layer_norm: + layer_norm_func = RMSNorm if config.rmsnorm else LayerNorm +diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py new file mode 100644 -index 000000000..6be046ba7 +index 000000000..07453f636 --- /dev/null -+++ b/vllm/multimodal/parse.py -@@ -0,0 +1,355 @@ -+from abc import ABC, abstractmethod -+from collections import UserDict -+from collections.abc import Callable, Iterator, Mapping, Sequence -+from typing import (TYPE_CHECKING, Any, Generic, NamedTuple, Optional, TypeVar, -+ Union) ++++ b/vllm/model_executor/models/glm4.py +@@ -0,0 +1,312 @@ ++# SPDX-License-Identifier: Apache-2.0 + -+import numpy as np -+import torch -+from PIL.Image import Image -+from typing_extensions import TypeAlias, TypeGuard, assert_never ++# Copyright 2025 The Zhipu AI team. ++# Copyright 2023 The vLLM team. ++# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 
++# ++# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX ++# and OPT implementations in this library. It has been modified from its ++# original forms to accommodate minor architectural differences compared ++# to GPT-NeoX and OPT used by the Meta AI team that trained the model. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++"""Inference-only GLM-4-0414 model compatible with HuggingFace weights.""" ++from typing import Iterable, Optional, Set, Tuple, Union + -+from vllm.utils import is_list_of ++import torch ++from torch import nn ++from transformers import Glm4Config + -+from .audio import resample_audio -+from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem, -+ ImageItem, ModalityData, MultiModalDataDict, -+ NestedTensors, VideoItem) ++from vllm.attention import Attention, AttentionType ++from vllm.compilation.decorators import support_torch_compile ++from vllm.config import CacheConfig, VllmConfig ++from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size ++from vllm.model_executor.layers.layernorm import RMSNorm ++from vllm.model_executor.layers.linear import (QKVParallelLinear, ++ RowParallelLinear) ++from vllm.model_executor.layers.logits_processor import LogitsProcessor ++from vllm.model_executor.layers.quantization import QuantizationConfig ++from vllm.model_executor.layers.rotary_embedding import get_rope ++from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler ++from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead ++from vllm.model_executor.sampling_metadata import SamplingMetadata ++from vllm.sequence import IntermediateTensors + -+_T = TypeVar("_T") -+_I = TypeVar("_I") ++from .interfaces import SupportsLoRA, SupportsPP ++from .llama import LlamaMLP as Glm4MLP ++from .llama import LlamaModel ++from .utils import AutoWeightsLoader, PPMissingLayer, maybe_prefix + + -+class ModalityDataItems(ABC, Generic[_T, _I]): ++class Glm4Attention(nn.Module): + -+ def __init__(self, data: _T, modality: str) -> None: ++ def __init__(self, ++ config: Glm4Config, ++ hidden_size: int, ++ num_heads: int, ++ num_kv_heads: int, ++ max_position: int = 4096 * 32, ++ head_dim: Optional[int] = None, ++ qkv_bias: bool = False, ++ rope_theta: float = 10000, ++ cache_config: Optional[CacheConfig] = None, ++ quant_config: Optional[QuantizationConfig] = None, ++ rope_scaling: Optional[Tuple] = None, ++ prefix: str = "", ++ attn_type: str = AttentionType.DECODER) -> None: + super().__init__() ++ self.hidden_size = hidden_size ++ tp_size = get_tensor_model_parallel_world_size() ++ self.total_num_heads = num_heads ++ assert self.total_num_heads % tp_size == 0 ++ self.num_heads = self.total_num_heads // tp_size ++ self.total_num_kv_heads = num_kv_heads ++ if self.total_num_kv_heads >= tp_size: ++ # Number of KV heads is greater than TP size, so we partition ++ # the KV heads across multiple tensor parallel GPUs. 
++ assert self.total_num_kv_heads % tp_size == 0 ++ else: ++ # Number of KV heads is less than TP size, so we replicate ++ # the KV heads across multiple tensor parallel GPUs. ++ assert tp_size % self.total_num_kv_heads == 0 ++ partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5) ++ self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) ++ self.head_dim = head_dim or hidden_size // self.total_num_heads ++ self.rotary_dim = int(partial_rotary_factor * self.head_dim) ++ self.q_size = self.num_heads * self.head_dim ++ self.kv_size = self.num_kv_heads * self.head_dim ++ self.scaling = self.head_dim**-0.5 ++ self.rope_theta = rope_theta ++ self.qkv_proj = QKVParallelLinear( ++ hidden_size, ++ self.head_dim, ++ self.total_num_heads, ++ self.total_num_kv_heads, ++ bias=qkv_bias, ++ quant_config=quant_config, ++ prefix=f"{prefix}.qkv_proj", ++ ) ++ self.o_proj = RowParallelLinear( ++ self.total_num_heads * self.head_dim, ++ hidden_size, ++ bias=False, ++ quant_config=quant_config, ++ prefix=f"{prefix}.o_proj", ++ ) ++ self.rotary_emb = get_rope( ++ self.head_dim, ++ rotary_dim=self.rotary_dim, ++ max_position=max_position, ++ base=self.rope_theta, ++ rope_scaling=rope_scaling, ++ partial_rotary_factor=partial_rotary_factor, ++ is_neox_style=False, ++ ) ++ self.attn = Attention(self.num_heads, ++ self.head_dim, ++ self.scaling, ++ num_kv_heads=self.num_kv_heads, ++ cache_config=cache_config, ++ quant_config=quant_config, ++ prefix=f"{prefix}.attn", ++ attn_type=attn_type) + -+ self.data = data -+ self.modality = modality -+ -+ def __repr__(self) -> str: -+ return (f"{type(self).__name__}(modality={self.modality!r}, " -+ f"len={len(self)})") -+ -+ def __len__(self) -> int: -+ return self.get_count() -+ -+ def __getitem__(self, index: int) -> _I: -+ return self.get(index) ++ def forward( ++ self, ++ positions: torch.Tensor, ++ hidden_states: torch.Tensor, ++ ) -> torch.Tensor: ++ qkv, _ = self.qkv_proj(hidden_states) ++ q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) ++ q, k = self.rotary_emb(positions, q, k) ++ attn_output = self.attn(q, k, v) ++ output, _ = self.o_proj(attn_output) ++ return output + -+ if TYPE_CHECKING: -+ # Auto-generated -+ def __iter__(self) -> Iterator[_I]: -+ ... 
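# [Editor's note, not part of the patch] Glm4Attention above applies a
# *partial* rotary embedding: with partial_rotary_factor defaulting to 0.5,
# only the first rotary_dim = partial_rotary_factor * head_dim channels of
# each head are rotated (is_neox_style=False, i.e. interleaved pairs) while
# the remaining channels pass through unchanged. A minimal sketch of that
# idea, independent of vLLM's get_rope() kernels; all names and shapes here
# are illustrative assumptions only:
import torch

def apply_partial_rope_interleaved(x: torch.Tensor,
                                   cos: torch.Tensor,
                                   sin: torch.Tensor,
                                   rotary_dim: int) -> torch.Tensor:
    # x:   [num_tokens, num_heads, head_dim]
    # cos: [num_tokens, rotary_dim // 2]; sin: same shape
    x_rot, x_pass = x[..., :rotary_dim], x[..., rotary_dim:]
    x1, x2 = x_rot[..., 0::2], x_rot[..., 1::2]   # interleaved (non-neox) pairs
    cos = cos.unsqueeze(-2)                       # broadcast over heads
    sin = sin.unsqueeze(-2)
    rotated = torch.stack((x1 * cos - x2 * sin,
                           x1 * sin + x2 * cos), dim=-1).flatten(-2)
    return torch.cat((rotated, x_pass), dim=-1)

# Tiny usage example: head_dim=64, rotary_dim=32 -> 16 (cos, sin) pairs.
q = torch.randn(3, 4, 64)
cos, sin = torch.randn(3, 16), torch.randn(3, 16)
assert apply_partial_rope_interleaved(q, cos, sin, rotary_dim=32).shape == q.shape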
+ -+ @abstractmethod -+ def get_count(self) -> int: -+ """Get the number of data items.""" -+ raise NotImplementedError ++class Glm4DecoderLayer(nn.Module): + -+ @abstractmethod -+ def get(self, index: int) -> _I: -+ """Get a data item by its index.""" -+ raise NotImplementedError ++ def __init__( ++ self, ++ config: Glm4Config, ++ cache_config: Optional[CacheConfig] = None, ++ quant_config: Optional[QuantizationConfig] = None, ++ prefix: str = "", ++ ) -> None: ++ super().__init__() ++ self.hidden_size = config.hidden_size ++ rope_theta = getattr(config, "rope_theta", 1000000) ++ rope_scaling = getattr(config, "rope_scaling", None) ++ ++ self.self_attn = Glm4Attention( ++ config=config, ++ hidden_size=self.hidden_size, ++ num_heads=config.num_attention_heads, ++ max_position=config.max_position_embeddings, ++ num_kv_heads=config.num_key_value_heads, ++ rope_theta=rope_theta, ++ qkv_bias=getattr(config, 'attention_bias', False), ++ head_dim=getattr(config, 'head_dim', None), ++ cache_config=cache_config, ++ quant_config=quant_config, ++ rope_scaling=rope_scaling, ++ prefix=f"{prefix}.self_attn", ++ attn_type=AttentionType.DECODER, ++ ) ++ self.mlp = Glm4MLP( ++ hidden_size=self.hidden_size, ++ intermediate_size=config.intermediate_size, ++ hidden_act=config.hidden_act, ++ quant_config=quant_config, ++ prefix=f"{prefix}.mlp", ++ ) ++ self.input_layernorm = RMSNorm(config.hidden_size, ++ eps=config.rms_norm_eps) ++ self.post_attention_layernorm = RMSNorm(config.hidden_size, ++ eps=config.rms_norm_eps) ++ self.post_self_attn_layernorm = RMSNorm(config.hidden_size, ++ eps=config.rms_norm_eps) ++ self.post_mlp_layernorm = RMSNorm(config.hidden_size, ++ eps=config.rms_norm_eps) + -+ def get_all(self) -> list[_I]: -+ """Get all data items.""" -+ return [self.get(idx) for idx in range(self.get_count())] ++ def forward( ++ self, ++ positions: torch.Tensor, ++ hidden_states: torch.Tensor, ++ residual: Optional[torch.Tensor], ++ ) -> Tuple[torch.Tensor, torch.Tensor]: ++ # Self Attention ++ if residual is None: ++ residual = hidden_states ++ hidden_states = self.input_layernorm(hidden_states) ++ else: ++ hidden_states, residual = self.input_layernorm( ++ hidden_states, residual) ++ hidden_states = self.self_attn( ++ positions=positions, ++ hidden_states=hidden_states, ++ ) + -+ @abstractmethod -+ def get_processor_data(self) -> Mapping[str, object]: -+ """Get the data to pass to the HF processor.""" -+ raise NotImplementedError ++ hidden_states = self.post_self_attn_layernorm(hidden_states) + -+ @abstractmethod -+ def get_passthrough_data(self) -> Mapping[str, object]: -+ """Get the data to pass directly to the model.""" -+ raise NotImplementedError ++ # Fully Connected ++ hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) ++ hidden_states = self.mlp(hidden_states) ++ hidden_states = self.post_mlp_layernorm(hidden_states) + ++ return hidden_states, residual + -+class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]): + -+ def get_count(self) -> int: -+ return len(self.data) ++ALL_DECODER_LAYER_TYPES = { ++ "attention": Glm4DecoderLayer, ++} + -+ def get(self, index: int) -> _T: -+ return self.data[index] + -+ def get_processor_data(self) -> Mapping[str, object]: -+ return {f"{self.modality}s": self.data} ++@support_torch_compile( ++ dynamic_arg_dims={ ++ "input_ids": 0, ++ "positions": -1, ++ "intermediate_tensors": 0, ++ "inputs_embeds": 0, ++ }) ++class Glm4Model(LlamaModel): + -+ def get_passthrough_data(self) -> Mapping[str, object]: -+ return {} ++ def 
__init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ++ super().__init__(vllm_config=vllm_config, ++ prefix=prefix, ++ layer_type=Glm4DecoderLayer) + + -+class EmbeddingItems(ModalityDataItems[NestedTensors, torch.Tensor]): ++class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ++ packed_modules_mapping = { ++ "qkv_proj": [ ++ "q_proj", ++ "k_proj", ++ "v_proj", ++ ], ++ "gate_up_proj": [ ++ "gate_proj", ++ "up_proj", ++ ], ++ } + -+ def get_count(self) -> int: -+ return len(self.data) ++ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ++ super().__init__() ++ config = vllm_config.model_config.hf_config ++ quant_config = vllm_config.quant_config ++ lora_config = vllm_config.lora_config + -+ def get(self, index: int) -> torch.Tensor: -+ return self.data[index] ++ self.config = config ++ self.lora_config = lora_config + -+ def get_processor_data(self) -> Mapping[str, object]: -+ return {} ++ self.quant_config = quant_config ++ self.model = Glm4Model(vllm_config=vllm_config, ++ prefix=maybe_prefix(prefix, "model")) + -+ def get_passthrough_data(self) -> Mapping[str, object]: -+ return {f"{self.modality}_embeds": self.data} ++ if get_pp_group().is_last_rank: ++ if config.tie_word_embeddings: ++ self.lm_head = self.model.embed_tokens ++ else: ++ self.lm_head = ParallelLMHead(config.vocab_size, ++ config.hidden_size, ++ quant_config=quant_config, ++ prefix=maybe_prefix( ++ prefix, "lm_head")) ++ else: ++ self.lm_head = PPMissingLayer() + -+ def get_feature_size(self, item_idx: int) -> int: -+ return len(self.get(item_idx)) ++ self.logits_processor = LogitsProcessor(config.vocab_size) ++ self.sampler = get_sampler() + ++ self.make_empty_intermediate_tensors = ( ++ self.model.make_empty_intermediate_tensors) + -+class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]): ++ def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: ++ return self.model.get_input_embeddings(input_ids) + -+ def __init__(self, data: Sequence[HfAudioItem]) -> None: -+ super().__init__(data, "audio") ++ def forward( ++ self, ++ input_ids: torch.Tensor, ++ positions: torch.Tensor, ++ intermediate_tensors: Optional[IntermediateTensors] = None, ++ inputs_embeds: Optional[torch.Tensor] = None, ++ ) -> Union[torch.Tensor, IntermediateTensors]: ++ hidden_states = self.model(input_ids, positions, intermediate_tensors, ++ inputs_embeds) ++ return hidden_states + ++ def compute_logits( ++ self, ++ hidden_states: torch.Tensor, ++ sampling_metadata: SamplingMetadata, ++ ) -> Optional[torch.Tensor]: ++ logits = self.logits_processor(self.lm_head, hidden_states, ++ sampling_metadata) ++ return logits + -+class AudioEmbeddingItems(EmbeddingItems): ++ def sample( ++ self, ++ logits: torch.Tensor, ++ sampling_metadata: SamplingMetadata, ++ ) -> Optional[SamplerOutput]: ++ next_tokens = self.sampler(logits, sampling_metadata) ++ return next_tokens + -+ def __init__(self, data: NestedTensors) -> None: -+ super().__init__(data, "audio") ++ def load_weights(self, weights: Iterable[Tuple[str, ++ torch.Tensor]]) -> Set[str]: ++ loader = AutoWeightsLoader( ++ self, ++ skip_prefixes=(["lm_head."] ++ if self.config.tie_word_embeddings else None), ++ ) ++ return loader.load_weights(weights) +\ No newline at end of file +diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py +index c190a4585..195f48f64 100644 +--- a/vllm/model_executor/models/glm4v.py ++++ b/vllm/model_executor/models/glm4v.py +@@ -111,8 +111,11 @@ class EVA2CLIPAttention(nn.Module): + 
prefix=f"{prefix}.dense", + ) + +- self.attn = MultiHeadAttention(self.num_heads_per_rank, self.head_dim, +- self.scale) ++ # self.attn = MultiHeadAttention(self.num_heads_per_rank, self.head_dim, ++ # self.scale) ++ from siglip import SelfAttention ++ self.attn = SelfAttention(self.num_heads_per_rank, self.head_dim, ++ self.scale) + self.output_dropout = torch.nn.Dropout(config.dropout_prob) + + def forward(self, x: torch.Tensor) -> torch.Tensor: +diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py +index 5fab9df3f..f8e6fbe24 100644 +--- a/vllm/model_executor/models/minicpmv.py ++++ b/vllm/model_executor/models/minicpmv.py +@@ -149,6 +149,8 @@ class Resampler2_5(BaseResampler): + self.max_size = max_size + self._set_2d_pos_cache(self.max_size) + ++ #self.apply(self._init_weights) + + def _set_2d_pos_cache(self, + max_size: Tuple[int, int], + device: torch.types.Device = "cpu") -> None: +@@ -1236,7 +1238,8 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA): + return self.resampler(vision_embedding, tgt_sizes) + + +-class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA): ++class MiniCPMV2_6(MiniCPMVBaseModel): + -+class ImageSize(NamedTuple): -+ width: int -+ height: int + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", +@@ -1319,9 +1322,9 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA): + patch_attn_mask[i, :num_patches_item] = True + + vision_embedding = self.vpm( +- all_pixel_values, ++ all_pixel_values.type(dtype).to(device), + patch_attention_mask=patch_attn_mask.unsqueeze(1), +- tgt_sizes=tgt_sizes, ++ tgt_sizes=tgt_sizes.to(device), + ) + + return self.resampler(vision_embedding, tgt_sizes) +@@ -1363,7 +1366,7 @@ class MiniCPMV(MiniCPMVBaseModel, SupportsMultiModal, SupportsLoRA): + + # quant_config references base class members, + # so update values before init is called +- cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping) +- cls.embedding_modules.update(instance_cls.embedding_modules) +- cls.embedding_padding_modules += instance_cls.embedding_padding_modules ++ # cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping) ++ # cls.embedding_modules.update(instance_cls.embedding_modules) ++ # cls.embedding_padding_modules += instance_cls.embedding_padding_modules + return instance_cls(vllm_config=vllm_config, prefix=prefix) +diff --git a/vllm/model_executor/models/na_vit.py b/vllm/model_executor/models/na_vit.py +new file mode 100644 +index 000000000..d96085f46 +--- /dev/null ++++ b/vllm/model_executor/models/na_vit.py +@@ -0,0 +1,831 @@ ++import logging ++import math ++import os ++import warnings ++from typing import Optional, Tuple, Union + ++import numpy as np ++import torch ++import torch.nn.functional as F ++from torch import nn ++from torch.nn.init import _calculate_fan_in_and_fan_out ++from transformers.activations import ACT2FN ++from transformers.configuration_utils import PretrainedConfig ++from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask ++from transformers.modeling_outputs import (BaseModelOutput, ++ BaseModelOutputWithPooling) ++from transformers.modeling_utils import PreTrainedModel ++from transformers.utils import (ModelOutput, is_flash_attn_2_available, ++ replace_return_docstrings) + -+class ImageProcessorItems(ProcessorBatchItems[HfImageItem]): ++logger = logging.getLogger("vllm") + -+ def __init__(self, data: Sequence[HfImageItem]) -> None: -+ super().__init__(data, "image") + -+ def get_image_size(self, item_idx: int) -> ImageSize: -+ image = 
self.get(item_idx) ++# For Siglip: copied from ++# HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit and add tgt_sizes ++# Remove hints as there's little possibility to change these code. ++class SiglipVisionConfig(PretrainedConfig): + -+ if isinstance(image, Image): -+ return ImageSize(*image.size) -+ if isinstance(image, (np.ndarray, torch.Tensor)): -+ _, h, w = image.shape -+ return ImageSize(w, h) ++ model_type = "siglip_vision_model" + -+ assert_never(image) ++ def __init__( ++ self, ++ hidden_size=768, ++ intermediate_size=3072, ++ num_hidden_layers=12, ++ num_attention_heads=12, ++ num_channels=3, ++ image_size=224, ++ patch_size=16, ++ hidden_act="gelu_pytorch_tanh", ++ layer_norm_eps=1e-6, ++ attention_dropout=0.0, ++ **kwargs, ++ ): ++ super().__init__(**kwargs) + ++ self.hidden_size = hidden_size ++ self.intermediate_size = intermediate_size ++ self.num_hidden_layers = num_hidden_layers ++ self.num_attention_heads = num_attention_heads ++ self.num_channels = num_channels ++ self.patch_size = patch_size ++ self.image_size = image_size ++ self.attention_dropout = attention_dropout ++ self.layer_norm_eps = layer_norm_eps ++ self.hidden_act = hidden_act + -+class ImageEmbeddingItems(EmbeddingItems): ++ @classmethod ++ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, ++ os.PathLike], ++ **kwargs) -> "PretrainedConfig": ++ cls._set_token_in_kwargs(kwargs) + -+ def __init__(self, data: NestedTensors) -> None: -+ super().__init__(data, "image") ++ config_dict, kwargs = cls.get_config_dict( ++ pretrained_model_name_or_path, **kwargs) + ++ # get the vision config dict if we are loading from SiglipConfig ++ if config_dict.get("model_type") == "siglip": ++ config_dict = config_dict["vision_config"] + -+class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]): ++ if "model_type" in config_dict and hasattr( ++ cls, ++ "model_type") and config_dict["model_type"] != cls.model_type: ++ logger.warning( ++ "You are using a model of type %s to " ++ "instantiate a model of type %s. 
" ++ "This is not supported for all configurations" ++ "of models and can yield errors.", config_dict['model_type'], ++ cls.model_type) + -+ def __init__(self, data: Sequence[HfVideoItem]) -> None: -+ super().__init__(data, "video") ++ return cls.from_dict(config_dict, **kwargs) + -+ def get_num_frames(self, item_idx: int) -> int: -+ return len(self.get(item_idx)) + -+ def get_frame_size(self, item_idx: int) -> ImageSize: -+ image = self.get(item_idx)[0] # Assume that the video isn't empty ++_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224" + -+ if isinstance(image, Image): -+ return ImageSize(*image.size) -+ if isinstance(image, (np.ndarray, torch.Tensor)): -+ _, h, w = image.shape -+ return ImageSize(w, h) ++SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ ++ "google/siglip-base-patch16-224", ++ # See all SigLIP models at https://huggingface.co/models?filter=siglip ++] + -+ assert_never(image) ++if is_flash_attn_2_available(): ++ from flash_attn import flash_attn_func, flash_attn_varlen_func ++ from flash_attn.bert_padding import pad_input # noqa ++ from flash_attn.bert_padding import index_first_axis, unpad_input + + -+class VideoEmbeddingItems(EmbeddingItems): ++# Copied from transformers.models.llama.modeling_llama._get_unpad_data ++def _get_unpad_data(attention_mask): ++ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) ++ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() ++ max_seqlen_in_batch = seqlens_in_batch.max().item() ++ cu_seqlens = F.pad( ++ torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) ++ return ( ++ indices, ++ cu_seqlens, ++ max_seqlen_in_batch, ++ ) + -+ def __init__(self, data: NestedTensors) -> None: -+ super().__init__(data, "video") + ++def _trunc_normal_(tensor, mean, std, a, b): + -+_D = TypeVar("_D", bound=ModalityDataItems[Any, Any]) ++ def norm_cdf(x): ++ # Computes standard normal cumulative distribution function ++ return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 + ++ if (mean < a - 2 * std) or (mean > b + 2 * std): ++ warnings.warn( ++ "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " ++ "The distribution of values may be incorrect.", ++ stacklevel=2, ++ ) + -+class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]): -+ """ -+ As :class:`MultiModalDataDict`, but normalized such that each entry -+ corresponds to a list. -+ """ ++ # Values are generated by using a truncated uniform distribution and ++ # then using the inverse CDF for the normal distribution. ++ # Get upper and lower cdf values ++ l_ = norm_cdf((a - mean) / std) ++ u = norm_cdf((b - mean) / std) + -+ def get_count(self, modality: str, *, strict: bool = True) -> int: -+ """ -+ Get the number of data items belonging to a modality. -+ -+ If `strict=False`, return `0` instead of raising :exc:`KeyError` -+ even if the modality is not found. -+ """ -+ if modality not in self: -+ if strict: -+ available_modalities = set(self.keys()) -+ raise KeyError(f"Modality {modality!r} not found. " -+ f"Available modalities: {available_modalities}") ++ # Uniformly fill tensor with values from [l, u], then translate to ++ # [2l-1, 2u-1]. ++ tensor.uniform_(2 * l_ - 1, 2 * u - 1) + -+ return 0 ++ # Use inverse cdf transform for normal distribution to get truncated ++ # standard normal ++ if tensor.dtype in [torch.float16, torch.bfloat16]: ++ # The `erfinv_` op is not (yet?) 
defined in float16+cpu, bfloat16+gpu ++ og_dtype = tensor.dtype ++ tensor = tensor.to(torch.float32) ++ tensor.erfinv_() ++ tensor = tensor.to(og_dtype) ++ else: ++ tensor.erfinv_() + -+ return self[modality].get_count() ++ # Transform to proper mean, std ++ tensor.mul_(std * math.sqrt(2.0)) ++ tensor.add_(mean) + -+ def get_all_counts(self) -> Mapping[str, int]: -+ """Get the number of items belonging to each modality.""" -+ return {m: items.get_count() for m, items in self.items()} ++ # Clamp to ensure it's in the proper range ++ if tensor.dtype == torch.float16: ++ # The `clamp_` op is not (yet?) defined in float16+cpu ++ tensor = tensor.to(torch.float32) ++ tensor.clamp_(min=a, max=b) ++ tensor = tensor.to(torch.float16) ++ else: ++ tensor.clamp_(min=a, max=b) + -+ def get_items( -+ self, -+ modality: str, -+ typ: Union[type[_D], tuple[type[_D], ...]], -+ ) -> _D: -+ """ -+ Get the data items belonging to a modality, -+ requiring that they belong to a certain type. -+ """ -+ if modality not in self: -+ available_modalities = set(self.keys()) -+ raise KeyError(f"Modality {modality!r} not found. " -+ f"Available modalities: {available_modalities}") + -+ items = self[modality] -+ if not isinstance(items, typ): -+ raise TypeError(f"Invalid type of data items for {modality=}. " -+ f"Expected type: {typ}, but " -+ f"found type: {type(items)}") ++def trunc_normal_tf_(tensor: torch.Tensor, ++ mean: float = 0.0, ++ std: float = 1.0, ++ a: float = -2.0, ++ b: float = 2.0) -> torch.Tensor: ++ with torch.no_grad(): ++ _trunc_normal_(tensor, 0, 1.0, a, b) ++ tensor.mul_(std).add_(mean) + -+ return items # type: ignore[return-value] + ++def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"): ++ fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) ++ if mode == "fan_in": ++ denom = fan_in ++ elif mode == "fan_out": ++ denom = fan_out ++ elif mode == "fan_avg": ++ denom = (fan_in + fan_out) / 2 + -+ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]], -+ ModalityDataItems[Any, Any]] ++ variance = scale / denom + ++ if distribution == "truncated_normal": ++ # constant is stddev of standard normal truncated to (-2, 2) ++ trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978) ++ elif distribution == "normal": ++ with torch.no_grad(): ++ tensor.normal_(std=math.sqrt(variance)) ++ elif distribution == "uniform": ++ bound = math.sqrt(3 * variance) ++ with torch.no_grad(): ++ tensor.uniform_(-bound, bound) ++ else: ++ raise ValueError(f"invalid distribution {distribution}") + -+class MultiModalDataParser: -+ """ -+ Parses :class:`MultiModalDataDict` into :class:`MultiModalDataItems`. + -+ Args: -+ target_sr (float, optional): Enables automatic resampling of audio -+ items to the model's expected sampling rate. 
-+ """ ++def lecun_normal_(tensor): ++ variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal") + -+ def __init__(self, *, target_sr: Optional[float] = None) -> None: -+ super().__init__() + -+ self.target_sr = target_sr ++def default_flax_embed_init(tensor): ++ variance_scaling_(tensor, mode="fan_in", distribution="normal") + -+ def _is_embeddings(self, data: object) -> TypeGuard[NestedTensors]: -+ if isinstance(data, torch.Tensor): -+ return data.ndim == 3 -+ if is_list_of(data, torch.Tensor): -+ return len(data) == 0 or data[0].ndim == 2 + -+ return False ++class SiglipVisionModelOutput(ModelOutput): ++ image_embeds: Optional[torch.FloatTensor] = None ++ last_hidden_state: torch.FloatTensor = None ++ hidden_states: Optional[Tuple[torch.FloatTensor]] = None ++ attentions: Optional[Tuple[torch.FloatTensor]] = None + -+ def _get_audio_with_sr( -+ self, -+ audio: AudioItem, -+ ) -> tuple[np.ndarray, Optional[float]]: -+ if isinstance(audio, tuple): -+ return audio -+ if isinstance(audio, list): -+ return np.array(audio), None -+ if isinstance(audio, np.ndarray): -+ return audio, None -+ if isinstance(audio, torch.Tensor): -+ return audio.numpy(), None -+ -+ assert_never(audio) -+ -+ def _parse_audio_data( -+ self, -+ data: ModalityData[AudioItem], -+ ) -> ModalityDataItems[Any, Any]: -+ if self._is_embeddings(data): -+ return AudioEmbeddingItems(data) -+ -+ if (is_list_of(data, float) -+ or isinstance(data, -+ (np.ndarray, torch.Tensor)) and data.ndim == 1 -+ or isinstance(data, tuple)): -+ data_items = [data] -+ elif isinstance(data, (np.ndarray, torch.Tensor)): -+ data_items = [elem for elem in data] -+ else: -+ data_items = data + -+ new_audios = list[np.ndarray]() -+ for data_item in data_items: -+ audio, orig_sr = self._get_audio_with_sr(data_item) -+ if orig_sr is None: -+ new_audio = audio -+ else: -+ target_sr = self.target_sr -+ if target_sr is None: -+ raise RuntimeError( -+ "Audio resampling is not supported when " -+ "`target_sr` is not provided") ++class SiglipVisionEmbeddings(nn.Module): + -+ new_audio = resample_audio(audio, -+ orig_sr=orig_sr, -+ target_sr=target_sr) ++ def __init__(self, config: SiglipVisionConfig): ++ super().__init__() ++ self.config = config ++ self.embed_dim = config.hidden_size ++ self.image_size = config.image_size ++ self.patch_size = config.patch_size + -+ new_audios.append(new_audio) ++ self.patch_embedding = nn.Conv2d( ++ in_channels=config.num_channels, ++ out_channels=self.embed_dim, ++ kernel_size=self.patch_size, ++ stride=self.patch_size, ++ padding="valid", ++ ) + -+ return AudioProcessorItems(new_audios) ++ self.num_patches_per_side = self.image_size // self.patch_size ++ self.num_patches = self.num_patches_per_side**2 ++ self.num_positions = self.num_patches ++ self.position_embedding = nn.Embedding(self.num_positions, ++ self.embed_dim) + -+ def _parse_image_data( -+ self, -+ data: ModalityData[ImageItem], -+ ) -> ModalityDataItems[Any, Any]: -+ if self._is_embeddings(data): -+ return ImageEmbeddingItems(data) -+ -+ if (isinstance(data, Image) -+ or isinstance(data, -+ (np.ndarray, torch.Tensor)) and data.ndim == 3): -+ data_items = [data] -+ elif isinstance(data, (np.ndarray, torch.Tensor)): -+ data_items = [elem for elem in data] -+ else: -+ data_items = data ++ def forward(self, ++ pixel_values: torch.FloatTensor, ++ patch_attention_mask: torch.BoolTensor, ++ tgt_sizes: Optional[torch.IntTensor] = None) -> torch.Tensor: ++ batch_size = pixel_values.size(0) + -+ return ImageProcessorItems(data_items) ++ patch_embeds = 
self.patch_embedding(pixel_values) ++ embeddings = patch_embeds.flatten(2).transpose(1, 2) + -+ def _parse_video_data( -+ self, -+ data: ModalityData[VideoItem], -+ ) -> ModalityDataItems[Any, Any]: -+ if self._is_embeddings(data): -+ return VideoEmbeddingItems(data) -+ -+ if (is_list_of(data, Image) -+ or isinstance(data, -+ (np.ndarray, torch.Tensor)) and data.ndim == 4): -+ data_items = [data] -+ elif isinstance(data, (np.ndarray, torch.Tensor)): -+ data_items = [elem for elem in data] -+ else: -+ data_items = data ++ max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3) ++ max_nb_patches_h, max_nb_patches_w = (max_im_h // self.patch_size, ++ max_im_w // self.patch_size) ++ boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, ++ 1 / self.num_patches_per_side) ++ position_ids = torch.full( ++ size=( ++ batch_size, ++ max_nb_patches_h * max_nb_patches_w, ++ ), ++ fill_value=0, ++ ) + -+ return VideoProcessorItems(data_items) ++ for batch_idx, p_attn_mask in enumerate(patch_attention_mask): ++ if tgt_sizes is not None: ++ nb_patches_h = tgt_sizes[batch_idx][0] ++ nb_patches_w = tgt_sizes[batch_idx][1] ++ else: ++ nb_patches_h = p_attn_mask[:, 0].sum() ++ nb_patches_w = p_attn_mask[0].sum() + -+ def _get_subparsers(self) -> Mapping[str, ModalityDataParser]: -+ return { -+ "audio": self._parse_audio_data, -+ "image": self._parse_image_data, -+ "video": self._parse_video_data, -+ } ++ fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h) ++ fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w) + -+ def parse_mm_data(self, -+ mm_data: MultiModalDataDict) -> MultiModalDataItems: -+ subparsers = self._get_subparsers() ++ bucket_coords_h = torch.bucketize(fractional_coords_h, ++ boundaries, ++ right=True) ++ bucket_coords_w = torch.bucketize(fractional_coords_w, ++ boundaries, ++ right=True) + -+ mm_items = MultiModalDataItems() -+ for k, v in mm_data.items(): -+ if k not in subparsers: -+ raise ValueError(f"Unsupported modality: {k}") ++ pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + ++ bucket_coords_w).flatten() ++ position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids + -+ mm_items[k] = subparsers[k](v) ++ position_ids = position_ids.to(self.position_embedding.weight.device) + -+ return mm_items -diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py -index 6baf19d67..933c1d3af 100644 ---- a/vllm/multimodal/processing.py -+++ b/vllm/multimodal/processing.py -@@ -1,6 +1,7 @@ -+import pickle - import re - from abc import ABC, abstractmethod --from collections import UserDict -+from collections import defaultdict - from collections.abc import Callable, ItemsView, Iterable, Mapping, Sequence - from dataclasses import dataclass, field - from functools import lru_cache -@@ -8,19 +9,21 @@ from typing import Any, NamedTuple, Optional, Protocol, TypeVar, Union - - import numpy as np - import torch --from PIL.Image import Image --from transformers import BatchFeature, ProcessorMixin --from typing_extensions import assert_never -+from blake3 import blake3 -+from PIL import Image -+from transformers import BatchFeature, PretrainedConfig, ProcessorMixin - - from vllm.inputs import DummyData, InputProcessingContext - from vllm.logger import init_logger --from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer --from vllm.utils import flatten_2d_lists, full_groupby, is_list_of -+from vllm.transformers_utils.tokenizer import (AnyTokenizer, decode_tokens, -+ encode_tokens) -+from vllm.utils import 
LRUCache, flatten_2d_lists, full_groupby - --from .audio import resample_audio --from .inputs import (AudioItem, ImageItem, MultiModalDataDict, -- MultiModalInputsV2, MultiModalKwargs, PlaceholderRange, -- VideoItem) -+from .inputs import (MultiModalDataDict, MultiModalFieldConfig, -+ MultiModalInputsV2, MultiModalKwargs, -+ MultiModalKwargsItem, PlaceholderRange) -+from .parse import MultiModalDataItems, MultiModalDataParser -+from .profiling import BaseProfilingInfo - - logger = init_logger(__name__) - -@@ -55,24 +58,6 @@ class PromptReplacement: - ) - - --def _encode( -- tokenizer: AnyTokenizer, -- text: str, -- *, -- add_special_tokens: bool = False, --) -> list[int]: -- """ -- Backend-agnostic equivalent of HF's -- :code:`tokenizer.encode(text, add_special_tokens=...)`. -- """ -- if isinstance(tokenizer, MistralTokenizer): -- return tokenizer.tokenizer.encode(text, -- bos=add_special_tokens, -- eos=add_special_tokens) -- -- return tokenizer.encode(text, add_special_tokens=add_special_tokens) -- -- - @lru_cache(maxsize=2048) - def _cached_encode( - tokenizer: AnyTokenizer, -@@ -80,20 +65,9 @@ def _cached_encode( - *, - add_special_tokens: bool = False, - ) -> list[int]: -- return _encode(tokenizer, text, add_special_tokens=add_special_tokens) -- -- --def _decode( -- tokenizer: AnyTokenizer, -- token_ids: list[int], -- *, -- skip_special_tokens: bool = False, --) -> str: -- """ -- Backend-agnostic equivalent of HF's -- :code:`tokenizer.decode(token_ids, skip_special_tokens=...)`. -- """ -- return tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens) -+ return encode_tokens(tokenizer, -+ text, -+ add_special_tokens=add_special_tokens) - - - @lru_cache(maxsize=2048) -@@ -103,9 +77,9 @@ def _cached_decode( - *, - skip_special_tokens: bool = False, - ) -> str: -- return _decode(tokenizer, -- list(token_ids), -- skip_special_tokens=skip_special_tokens) -+ return decode_tokens(tokenizer, -+ list(token_ids), -+ skip_special_tokens=skip_special_tokens) - - - class _HasModalityAttr(Protocol): -@@ -201,111 +175,6 @@ class _BoundPromptReplacement: - return bound_replacement - - --class ImageSize(NamedTuple): -- width: int -- height: int -- -- --class MultiModalDataItems(UserDict[str, list[Any]]): -- """ -- As :class:`MultiModalDataDict`, but normalized such that each entry -- corresponds to a list. -- """ -- -- @staticmethod -- def from_dict(data: MultiModalDataDict) -> "MultiModalDataItems": -- """ -- Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`. -- """ -- multi_data = MultiModalDataItems() -- -- for k, v in data.items(): -- # TODO: Make a separate modality for embedding inputs -- # to avoid confusion -- # yapf: disable -- if k == "video": -- # Special case since even a single item can be a list -- multi_data[k] = ( # type: ignore[index] -- v if (isinstance(v, torch.Tensor) -- or is_list_of(v, list)) else [v] -- ) -- elif k in ("image", "audio"): -- multi_data[k] = ( # type: ignore[index] -- v if isinstance(v, (torch.Tensor, list)) else [v] -- ) -- else: -- multi_data[k] = v if isinstance(v, list) else [v] # type: ignore[index] -- # yapf: enable -- -- return multi_data -- -- # NOTE: When a field (e.g. 
`images`) doesn't exist, directly appending to -- # `self.images` doesn't update this dictionary, which may be confusing -- # We annotate the getter methods as `Sequence` to prevent others from -- # trying to update the list in this way -- @property -- def images(self) -> Sequence[ImageItem]: -- return self.get("image", []) -- -- @property -- def videos(self) -> Sequence[VideoItem]: -- return self.get("video", []) -- -- @property -- def audios(self) -> Sequence[AudioItem]: -- return self.get("audio", []) -- -- def get_item_counts(self) -> Mapping[str, int]: -- return {m: len(items) for m, items in self.items()} -- -- def get_image_size(self, item_idx: int) -> ImageSize: -- image = self.images[item_idx] -- -- if isinstance(image, Image): -- return ImageSize(*image.size) -- if isinstance(image, (np.ndarray, torch.Tensor)): -- _, h, w = image.shape -- return ImageSize(w, h) -- -- assert_never(image) -- -- def get_audio_with_sr( -- self, -- item_idx: int, -- *, -- default_sr: float, -- ) -> tuple[np.ndarray, float]: -- audio = self.audios[item_idx] -- -- if isinstance(audio, tuple): -- return audio -- if isinstance(audio, list): -- return np.array(audio), default_sr -- if isinstance(audio, np.ndarray): -- return audio, default_sr -- -- assert_never(audio) -- -- def resample_audios(self, new_sr: float, *, drop_sr: bool = True) -> None: -- """ -- If :code:`drop_sr=True`, the audio items in this dictionary are updated -- to be NumPy arrays which implicitly means that their sampling rate is -- the same as the model's expected sampling rate; otherwise, they remain -- as :code:`(audio, new_sr)` tuples. -- """ -- if not self.audios: -- return -- -- new_audios = [] -- for item_idx in range(len(self.audios)): -- audio, sr = self.get_audio_with_sr(item_idx, default_sr=new_sr) -- audio = resample_audio(audio, orig_sr=sr, target_sr=new_sr) -- -- new_audios.append(audio if drop_sr else (audio, new_sr)) -- -- self["audio"] = new_audios -- -- - class _TokenMatch(NamedTuple): - start_idx: int - end_idx: int -@@ -388,8 +257,10 @@ class _PromptReplacementTextMatch(_PromptReplacementMatch): - return self.match.end() - - --class _PlaceholderInfo(NamedTuple): -+@dataclass -+class _PlaceholderInfo: - modality: str -+ item_idx: int - start_idx: int - replacement: list[int] - -@@ -430,12 +301,14 @@ def find_text_matches( - - def _resolve_matches( - prompt: _PromptSeq, -- matches: Sequence[_PromptReplacementMatch], -+ mm_matches: Mapping[str, Sequence[_PromptReplacementMatch]], - ) -> list[_PromptReplacementMatch]: - """ -- Resolve :code:`matches` to ensure that there are no overlapping matches, -+ Resolve :code:`mm_matches` to ensure that there are no overlapping matches, - and sort them such that earlier matches take priority over later ones. 
- """ -+ matches = [m for matches in mm_matches.values() for m in matches] ++ embeddings = embeddings + self.position_embedding(position_ids) ++ return embeddings + - seen_matches: list[Optional[_PromptReplacementMatch]] = [None - ] * len(prompt) - -@@ -453,18 +326,19 @@ def _resolve_matches( - - def _replace_matches( - prompt: _S, -- matches: Sequence[_PromptReplacementMatch], -+ mm_matches: Mapping[str, Sequence[_PromptReplacementMatch]], - mm_item_counts: Mapping[str, int], - ) -> list[_S]: -+ """Apply the replacements in :code:`mm_matches` to :code:`prompt`.""" - out_seqs = list[_S]() - prev_end_idx = 0 -- next_idx_by_modality = {modality: 0 for modality in mm_item_counts} -+ next_idx_by_modality = defaultdict[str, int](lambda: 0) - -- for match in _resolve_matches(prompt, matches): -+ for match in _resolve_matches(prompt, mm_matches): - modality = match.modality - - item_idx = next_idx_by_modality[modality] -- if item_idx >= mm_item_counts[modality]: -+ if item_idx >= mm_item_counts.get(modality, 0): - continue - - start_idx = match.start_idx -@@ -490,28 +364,28 @@ def _replace_matches( - - def replace_token_matches( - prompt: list[int], -- matches: Sequence[_PromptReplacementTokenMatch], -+ mm_matches: Mapping[str, Sequence[_PromptReplacementTokenMatch]], - mm_item_counts: Mapping[str, int], - ) -> list[int]: -- """Apply :code:`prompt_repls` to :code:`prompt`.""" -- if not matches: -+ """Apply the replacements in :code:`mm_matches` to :code:`prompt`.""" -+ if not mm_matches: - return prompt - -- token_id_seqs = _replace_matches(prompt, matches, mm_item_counts) -+ token_id_seqs = _replace_matches(prompt, mm_matches, mm_item_counts) - - return flatten_2d_lists(token_id_seqs) - - - def replace_text_matches( - prompt: str, -- matches: Sequence[_PromptReplacementTextMatch], -+ mm_matches: Mapping[str, Sequence[_PromptReplacementTextMatch]], - mm_item_counts: Mapping[str, int], - ) -> str: -- """Apply :code:`prompt_repls` to :code:`prompt`.""" -- if not matches: -+ """Apply the replacements in :code:`mm_matches` to :code:`prompt`.""" -+ if not mm_matches: - return prompt - -- texts = _replace_matches(prompt, matches, mm_item_counts) -+ texts = _replace_matches(prompt, mm_matches, mm_item_counts) - - return "".join(texts) - -@@ -526,14 +400,14 @@ def _iter_modality_placeholders( - return - - prompt_len = len(prompt) -- item_index = 0 -+ item_idx = 0 - - start_idx = 0 - while start_idx < prompt_len: - found = False - - for repl_info in modality_repls: -- replacement = repl_info.get_replacement(item_index) -+ replacement = repl_info.get_replacement(item_idx) - repl_tokens = replacement.token_ids - repl_len = len(repl_tokens) - end_idx = start_idx + repl_len -@@ -544,12 +418,13 @@ def _iter_modality_placeholders( - if prompt[start_idx:end_idx] == repl_tokens: - yield _PlaceholderInfo( - modality=modality, -+ item_idx=item_idx, - start_idx=start_idx, - replacement=repl_tokens, - ) - -- item_index += 1 -- if item_index >= modal_item_count: -+ item_idx += 1 -+ if item_idx >= modal_item_count: - return - - # Exclude overlapping matches -@@ -561,175 +436,495 @@ def _iter_modality_placeholders( - start_idx += 1 - - --def iter_placeholders( -- prompt_repls: Sequence[_BoundPromptReplacement], -+def _iter_placeholders( -+ mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]], - prompt: list[int], - mm_item_counts: Mapping[str, int], - ) -> Iterable[_PlaceholderInfo]: - """ -- Yield each set of placeholder tokens found in :code:`prompt`. 
-+ For each modality, yield each set of placeholder tokens found in -+ :code:`prompt`. - - Note that empty matches are ignored. - """ -- repls_by_modality = dict(full_groupby_modality(prompt_repls)) -- - for modality, modal_item_count in mm_item_counts.items(): -- if modality in repls_by_modality: -+ if modality in mm_prompt_repls: - yield from _iter_modality_placeholders( - prompt, - modality, -- repls_by_modality[modality], -+ mm_prompt_repls[modality], - modal_item_count, - ) - - --class ProcessorInputs(NamedTuple): -- """Keyword arguments to :meth:`BaseMultiModalProcessor`""" -- prompt_text: str -- mm_data: MultiModalDataDict -- mm_processor_kwargs: Mapping[str, object] -+def find_mm_placeholders( -+ mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]], -+ prompt: list[int], -+ mm_item_counts: Mapping[str, int], -+) -> Mapping[str, list[_PlaceholderInfo]]: -+ it = _iter_placeholders(mm_prompt_repls, prompt, mm_item_counts) -+ return dict(full_groupby_modality(it)) -+ -+ -+class ProcessingCache: -+ -+ def __init__(self, capacity: int) -> None: -+ super().__init__() + -+ # DEBUG: Set to None to disable -+ self.debug_cache_hit_ratio_steps: Optional[int] = None - -+ self._cache = LRUCache[str, MultiModalKwargsItem](capacity) - --class BaseMultiModalProcessor(ABC): -+ def _maybe_log_cache_stats(self) -> None: -+ steps = self.debug_cache_hit_ratio_steps -+ if not steps: -+ return ++def attention_softmax(attn_weights: torch.Tensor, training: bool): ++ if attn_weights.is_contiguous() and attn_weights.device.type == "xpu" and not training: ++ import xe_addons ++ xe_addons.attn_softmax_inplaced(attn_weights) ++ else: ++ attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, ++ dtype=torch.float32).to(attn_weights.dtype) ++ return attn_weights + -+ cache_stats = self._cache.stat() -+ if cache_stats.total % steps == 0: -+ logger.debug("ProcessingCache: hit_ratio = %.2f", -+ cache_stats.hit_ratio) -+ -+ def _serialize_item(self, obj: object) -> bytes: -+ # Simple cases -+ if isinstance(obj, str): -+ return obj.encode("utf-8") -+ if isinstance(obj, bytes): -+ return obj -+ if isinstance(obj, Image.Image): -+ return obj.tobytes() -+ -+ # Convertible to NumPy arrays -+ if isinstance(obj, torch.Tensor): -+ obj = obj.numpy() -+ if isinstance(obj, (int, float)): -+ obj = np.array(obj) -+ if isinstance(obj, np.ndarray): -+ return obj.tobytes() -+ -+ logger.warning( -+ "No serialization method found for %s. 
" -+ "Falling back to pickle.", type(obj)) -+ -+ return pickle.dumps(obj) -+ -+ def _item_to_bytes( -+ self, -+ key: str, -+ obj: object, -+ ) -> Iterable[tuple[bytes, bytes]]: -+ # Recursive cases -+ if isinstance(obj, (list, tuple)): -+ for i, elem in enumerate(obj): -+ yield from self._item_to_bytes(f"{key}.{i}", elem) -+ elif isinstance(obj, dict): -+ for k, v in obj.items(): -+ yield from self._item_to_bytes(f"{key}.{k}", v) -+ else: -+ key_bytes = self._serialize_item(key) -+ value_bytes = self._serialize_item(obj) -+ yield key_bytes, value_bytes + -+ def _hash_kwargs(self, **kwargs: object) -> str: -+ hasher = blake3() ++class SiglipAttention(nn.Module): ++ """Multi-headed attention from 'Attention Is All You Need' paper""" + -+ for k, v in kwargs.items(): -+ for k_bytes, v_bytes in self._item_to_bytes(k, v): -+ hasher.update(k_bytes) -+ hasher.update(v_bytes) ++ # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__ ++ def __init__(self, config): ++ super().__init__() ++ self.config = config ++ self.embed_dim = config.hidden_size ++ self.num_heads = config.num_attention_heads ++ self.head_dim = self.embed_dim // self.num_heads ++ if self.head_dim * self.num_heads != self.embed_dim: ++ raise ValueError( ++ "embed_dim must be divisible by num_heads (got `embed_dim`: " ++ f"{self.embed_dim} and `num_heads`:" ++ f" {self.num_heads}).") ++ self.scale = self.head_dim**-0.5 ++ self.dropout = config.attention_dropout + -+ return hasher.hexdigest() ++ self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) ++ self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) ++ self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) ++ self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + -+ def get( ++ def forward( + self, -+ model_id: str, -+ modality: str, -+ input_item: object, -+ input_kwargs: Mapping[str, object], -+ ) -> Optional[MultiModalKwargsItem]: -+ """ -+ Get a processed multi-modal item from the cache -+ according to its dependencies, including: -+ -+ - The model ID -+ - The modality of the item -+ - The original data item passed to the HF processor -+ - The configuration options of the HF processor -+ """ -+ self._maybe_log_cache_stats() -+ -+ cache_key = self._hash_kwargs(model_id=model_id, -+ **{modality: input_item}, -+ **input_kwargs) -+ return self._cache.get(cache_key) ++ hidden_states: torch.Tensor, ++ attention_mask: Optional[torch.Tensor] = None, ++ output_attentions: Optional[bool] = False, ++ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], ++ Optional[Tuple[torch.Tensor]]]: ++ """Input shape: Batch x Time x Channel""" + -+ def put( -+ self, -+ model_id: str, -+ modality: str, -+ input_item: object, -+ input_kwargs: Mapping[str, object], -+ output_kwargs: MultiModalKwargsItem, -+ ) -> None: -+ """ -+ Put a processed multi-modal item into the cache -+ according to its dependencies (see :meth:`get`). -+ """ -+ cache_key = self._hash_kwargs(model_id=model_id, -+ **{modality: input_item}, -+ **input_kwargs) -+ self._cache.put(cache_key, output_kwargs) ++ batch_size, q_len, _ = hidden_states.size() + ++ # query_states = self.q_proj(hidden_states) ++ # key_states = self.k_proj(hidden_states) ++ # value_states = self.v_proj(hidden_states) + -+class ProcessingMixin: -+ """ -+ Contains helper functions to perform processing. 
++ # query_states = query_states.view(batch_size, q_len, self.num_heads, ++ # self.head_dim).transpose(1, 2) ++ # key_states = key_states.view(batch_size, q_len, self.num_heads, ++ # self.head_dim).transpose(1, 2) ++ # value_states = value_states.view(batch_size, q_len, self.num_heads, ++ # self.head_dim).transpose(1, 2) + -+ Not to be confused with :class:`transformers.ProcessorMixin`. -+ """ -+ ctx: InputProcessingContext ++ qkv = self.qkv_proj(hidden_states) ++ qkv = qkv.view(batch_size, q_len, self.num_heads * 3, self.head_dim) ++ qkv = qkv.transpose(1, 2) ++ query_states, key_states, value_states = qkv.chunk(3, dim=1) + -+ def _get_tokenizer(self) -> AnyTokenizer: -+ return self.ctx.tokenizer ++ from ipex_llm.transformers.models.common import padding_qkv_hd ++ query_states, key_states, value_states = padding_qkv_hd( ++ query_states, key_states, value_states, ++ 72, 80 ++ ) ++ from ipex_llm.transformers.models.utils import use_sdp_non_causal ++ if use_sdp_non_causal(self.head_dim, query_states.device, query_states.dtype): ++ import xe_addons ++ attn_weights = None ++ attn_output = xe_addons.sdp_non_causal(query_states, key_states.contiguous(), value_states.contiguous(), attention_mask) ++ else: ++ k_v_seq_len = key_states.shape[-2] ++ attn_weights = torch.matmul(query_states, key_states.transpose( ++ 2, 3)) * self.scale + -+ def _get_hf_config(self) -> PretrainedConfig: -+ return self.ctx.get_hf_config() ++ if attn_weights.size() != (batch_size, self.num_heads, q_len, ++ k_v_seq_len): ++ raise ValueError( ++ "Attention weights should be of size " ++ f"{(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is" ++ f" {attn_weights.size()}") + -+ def _get_hf_processor(self, **kwargs: object) -> ProcessorMixin: -+ """ -+ Subclasses can override this method to handle -+ specific kwargs from model config or user inputs. -+ """ -+ return self.ctx.get_hf_processor(**kwargs) ++ if attention_mask is not None: ++ if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len): ++ raise ValueError( ++ "Attention mask should be of size " ++ f"{(batch_size, 1, q_len, k_v_seq_len)}", ++ f"but is {attention_mask.size()}") ++ attn_weights = attn_weights + attention_mask + ++ # upcast attention to fp32 ++ # attn_weights = nn.functional.softmax(attn_weights, ++ # dim=-1, ++ # dtype=torch.float32).to( ++ # query_states.dtype) ++ attn_weights = attention_softmax(attn_weights, self.training) ++ attn_weights = nn.functional.dropout(attn_weights, ++ p=self.dropout, ++ training=self.training) ++ attn_output = torch.matmul(attn_weights, value_states) + -+class BaseMultiModalProcessor(ProcessingMixin, ABC): - """ - Abstract base class to process multi-modal inputs to be used in vLLM. ++ if attn_output.size() != (batch_size, self.num_heads, q_len, ++ self.head_dim): ++ raise ValueError( ++ "`attn_output` should be of size " ++ f"{(batch_size, self.num_heads, q_len, self.head_dim)}, " ++ "but is" ++ f" {attn_output.size()}") ++ attn_output = attn_output[:, :, :, :self.head_dim] ++ attn_output = attn_output.transpose(1, 2).contiguous() ++ attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) + -+ Not to be confused with :class:`transformers.ProcessorMixin`. 
- """ - -- def __init__(self, ctx: InputProcessingContext) -> None: -+ def __init__(self, -+ ctx: InputProcessingContext, -+ *, -+ cache: Optional[ProcessingCache] = None, -+ enable_sanity_checks: bool = True) -> None: - super().__init__() - - self.ctx = ctx -+ self.cache = cache -+ self.enable_sanity_checks = enable_sanity_checks ++ attn_output = self.out_proj(attn_output) + -+ self.data_parser = self._get_data_parser() -+ self.profiling_info = self._get_profiling_info() - - def __call__( - self, - prompt: str, - mm_data: MultiModalDataDict, -- mm_processor_kwargs: Mapping[str, object], -+ hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: -- return self.apply(prompt, mm_data, mm_processor_kwargs) -+ return self.apply(prompt, mm_data, hf_processor_mm_kwargs) - -- def _get_hf_processor(self) -> ProcessorMixin: -+ def _get_data_parser(self) -> MultiModalDataParser: - """ -- Subclasses can add keyword arguments to this method to accept -- additional kwargs from model config or user inputs. -+ Construct a parser to preprocess multi-modal data items -+ before passing them to :meth:`_get_hf_mm_data`. ++ return attn_output, attn_weights + -+ You can support additional modalities by creating a subclass -+ of :class:`MultiModalDataParser` that has additional subparsers. - """ -- return self.ctx.get_hf_processor() -+ return MultiModalDataParser() - -- def _get_tokenizer(self) -> AnyTokenizer: -- return self.ctx.tokenizer -+ def _get_profiling_info(self) -> BaseProfilingInfo: -+ """ -+ Get the profiling information to find the worst-case memory usage of -+ the model. -+ """ -+ raise NotImplementedError - -- def _get_mm_items( -+ def _to_mm_items( - self, - mm_data: MultiModalDataDict, - ) -> MultiModalDataItems: -- return MultiModalDataItems.from_dict(mm_data) -+ """ -+ Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems` -+ before passing them to :meth:`_get_hf_mm_data`. -+ """ -+ mm_items = self.data_parser.parse_mm_data(mm_data) + -+ mm_limits = self.ctx.get_mm_config().limit_per_prompt -+ for modality, items in mm_items.items(): -+ limit = mm_limits.get(modality, 1) -+ if len(items) > limit: -+ raise ValueError( -+ f"You set {modality}={limit} (or defaulted to 1) in " -+ f"`--limit-mm-per-prompt`, but passed {len(items)} " -+ f"{modality} items in the same prompt.") ++class SiglipFlashAttention2(SiglipAttention): + -+ return mm_items ++ def __init__(self, *args, **kwargs): ++ super().__init__(*args, **kwargs) ++ self.is_causal = False # Hack to make sure we don't use a causal mask + -+ @abstractmethod -+ def _get_mm_fields_config( ++ def forward( + self, -+ hf_inputs: BatchFeature, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ ) -> Mapping[str, MultiModalFieldConfig]: -+ """Given the HF-processed data, output the metadata of each field.""" -+ raise NotImplementedError - - @abstractmethod - def _get_prompt_replacements( - self, - mm_items: MultiModalDataItems, -- hf_inputs: BatchFeature, -- mm_processor_kwargs: Mapping[str, object], -+ hf_processor_mm_kwargs: Mapping[str, object], -+ out_mm_kwargs: MultiModalKwargs, - ) -> list[PromptReplacement]: - """ - Given the original multi-modal items for this modality - and HF-processed data, output the replacements to perform. - -- Note: -- Even when the HF processor already performs replacement for us, -- we still use this replacement information to determine -- the placeholder token positions for each multi-modal item. 
-+ Notes: -+ - You should not assume that HF processor always performs prompt -+ replacement: in :meth:`_apply_hf_processor_missing`, this method -+ is called on text-only and multimodal-only inputs separately, -+ instead of passing them in the same call. -+ - The replacement information returned by this method is also used -+ to determine the placeholder token positions for each multi-modal -+ item. - """ - raise NotImplementedError - -- def _find_placeholders( -+ def _find_mm_placeholders( - self, -- all_prompt_repls: Sequence[_BoundPromptReplacement], -+ mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]], - new_token_ids: list[int], - mm_item_counts: Mapping[str, int], -- ) -> list[_PlaceholderInfo]: -- return list( -- iter_placeholders(all_prompt_repls, new_token_ids, mm_item_counts)) -+ ) -> Mapping[str, list[_PlaceholderInfo]]: -+ return find_mm_placeholders(mm_prompt_repls, new_token_ids, -+ mm_item_counts) - -- def _get_processor_data( -+ def _get_hf_mm_data( - self, - mm_items: MultiModalDataItems, - ) -> tuple[dict[str, Any], dict[str, Any]]: - processor_data = dict[str, Any]() - passthrough_data = dict[str, Any]() - -- for k, v in mm_items.items(): -- # TODO: Make a separate modality for embedding inputs -- # to avoid confusion -- if k in ("image", "video", "audio"): -- if isinstance(v, torch.Tensor) and v.ndim == 3: -- # Pass through embedding inputs (single) -- passthrough_data[f"{k}_embeds"] = [v] -- elif (is_list_of(v, torch.Tensor) and len(v) > 0 -- and v[0].ndim == 2): -- # Pass through embedding inputs (multi) -- passthrough_data[f"{k}_embeds"] = v -- else: -- # Map keys to plural form, e.g.: image -> images -- processor_data[f"{k}s"] = v -- else: -- processor_data[k] = v -+ for items in mm_items.values(): -+ processor_data.update(items.get_processor_data()) -+ passthrough_data.update(items.get_passthrough_data()) - - return processor_data, passthrough_data - - def _call_hf_processor( - self, -- hf_processor: ProcessorMixin, - prompt: str, -- processor_data: Mapping[str, object], -- mm_processor_kwargs: Mapping[str, object], -+ # Not to be confused with `mm_data` in `self.apply`. -+ # This refers to the data to be passed to HF processor. -+ mm_data: Mapping[str, object], -+ mm_kwargs: Mapping[str, object], - ) -> BatchFeature: -+ """ -+ Call the HF processor on the prompt text and -+ associated multi-modal data. -+ """ - return self.ctx.call_hf_processor( -- hf_processor, -- prompt, -- processor_data, -- mm_processor_kwargs, -+ self._get_hf_processor(**mm_kwargs), -+ dict(text=prompt, **mm_data), -+ mm_kwargs, - ) - - def _apply_hf_processor( - self, -- prompt: str, -+ prompt_text: str, - mm_items: MultiModalDataItems, -- mm_processor_kwargs: Mapping[str, object], -- ) -> BatchFeature: -- # some mm_processor_kwargs may be used in processor initialization -- # instead of processor call -- hf_processor = self._get_hf_processor(**mm_processor_kwargs) -+ hf_processor_mm_kwargs: Mapping[str, object], -+ ) -> tuple[list[int], MultiModalKwargs]: -+ """ -+ Wrapper of :meth:`_call_hf_processor` that applies -+ additional pre-processing and post-processing. 
-+ """ -+ processor_data, passthrough_data = self._get_hf_mm_data(mm_items) ++ hidden_states: torch.Tensor, ++ attention_mask: Optional[torch.LongTensor] = None, ++ position_ids: Optional[torch.LongTensor] = None, ++ past_key_value: Optional[Tuple[torch.Tensor]] = None, ++ output_attentions: bool = False, ++ use_cache: bool = False, ++ **kwargs, ++ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], ++ Optional[Tuple[torch.Tensor]]]: ++ output_attentions = False + -+ processed_data = self._call_hf_processor( -+ prompt=prompt_text, -+ mm_data=processor_data, -+ mm_kwargs=hf_processor_mm_kwargs, -+ ) -+ processed_data.update(passthrough_data) ++ bsz, q_len, _ = hidden_states.size() + -+ prompt_ids, = processed_data.pop("input_ids").tolist() ++ query_states = self.q_proj(hidden_states) ++ key_states = self.k_proj(hidden_states) ++ value_states = self.v_proj(hidden_states) + -+ mm_kwargs = MultiModalKwargs.from_hf_inputs( -+ processed_data, -+ self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs), -+ ) ++ query_states = query_states.view(bsz, q_len, self.num_heads, ++ self.head_dim).transpose(1, 2) ++ key_states = key_states.view(bsz, q_len, self.num_heads, ++ self.head_dim).transpose(1, 2) ++ value_states = value_states.view(bsz, q_len, self.num_heads, ++ self.head_dim).transpose(1, 2) + -+ return prompt_ids, mm_kwargs ++ kv_seq_len = key_states.shape[-2] ++ if past_key_value is not None: ++ kv_seq_len += past_key_value.get_usable_length( ++ kv_seq_len, self.layer_idx) + -+ def _apply_hf_processor_missing( -+ self, -+ prompt_text: str, -+ mm_missing_data_items: MultiModalDataItems, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ ): -+ """ -+ Apply the HF processor on the full prompt text, but only on the -+ multi-modal data that are missing from the cache. -+ -+ Note: -+ We pass prompt text and multi-modal data into the HF processor -+ in separate calls to avoid HF prompt replacement being done for -+ cached items; instead, we rely on our own prompt replacement logic -+ (:meth:`_get_prompt_replacements`) for the full text. -+ """ -+ mm_missing_counts = mm_missing_data_items.get_all_counts() - -- processor_data, passthrough_data = self._get_processor_data(mm_items) -+ prompt_ids, _ = self._apply_hf_processor( -+ prompt_text=prompt_text, -+ mm_items=MultiModalDataItems({}), -+ hf_processor_mm_kwargs={}, -+ ) ++ query_states = query_states.transpose(1, 2) ++ key_states = key_states.transpose(1, 2) ++ value_states = value_states.transpose(1, 2) + -+ # Some HF processors (e.g. Qwen2-VL) expect corresponding -+ # multi-modal tokens to be in the prompt text -+ dummy_inputs = self.profiling_info.get_dummy_processor_inputs( -+ self.ctx.model_config.max_model_len, -+ mm_missing_counts, -+ ) - -- hf_inputs = self._call_hf_processor( -- hf_processor, -- prompt=prompt, -- processor_data=processor_data, -- mm_processor_kwargs=mm_processor_kwargs, -+ _, mm_missing_kwargs = self._apply_hf_processor( -+ prompt_text=dummy_inputs.prompt_text, -+ mm_items=mm_missing_data_items, -+ hf_processor_mm_kwargs=hf_processor_mm_kwargs, - ) -- hf_inputs.update(passthrough_data) - -- return hf_inputs -+ return prompt_ids, mm_missing_kwargs - -- def _bind_prompt_replacements( -+ def _cached_apply_hf_processor( -+ self, -+ prompt_text: str, -+ mm_data_items: MultiModalDataItems, -+ hf_processor_mm_kwargs: Mapping[str, object], -+ ) -> tuple[list[int], MultiModalKwargs]: -+ """ -+ Apply the HF processor on the full prompt text, -+ caching the results and reusing cached results. 
-+ """ -+ cache = self.cache -+ model_id = self.ctx.model_config.model -+ -+ _, passthrough_data = self._get_hf_mm_data(mm_data_items) -+ if cache is None or passthrough_data: -+ return self._apply_hf_processor( -+ prompt_text=prompt_text, -+ mm_items=mm_data_items, -+ hf_processor_mm_kwargs=hf_processor_mm_kwargs, -+ ) ++ dropout_rate = self.dropout if self.training else 0.0 + -+ mm_maybe_cached_kw_items = { -+ modality: [ -+ cache.get(model_id, modality, item, hf_processor_mm_kwargs) -+ for item in items -+ ] -+ for modality, items in mm_data_items.items() -+ } ++ input_dtype = query_states.dtype ++ if input_dtype == torch.float32: ++ if torch.is_autocast_enabled(): ++ target_dtype = torch.get_autocast_gpu_dtype() ++ # Handle the case where the model is quantized ++ elif hasattr(self.config, "_pre_quantization_dtype"): ++ target_dtype = self.config._pre_quantization_dtype ++ else: ++ target_dtype = self.q_proj.weight.dtype + -+ mm_missing_idxs = { -+ modality: -+ [idx for idx, item in enumerate(kw_items) if item is None] -+ for modality, kw_items in mm_maybe_cached_kw_items.items() -+ } -+ mm_missing_data = { -+ modality: [mm_data_items[modality][idx] for idx in idxs] -+ for modality, idxs in mm_missing_idxs.items() -+ } -+ mm_missing_data_items = self._to_mm_items(mm_missing_data) ++ logger.warning( ++ "The input hidden states seems to be " ++ "silently casted in float32, " ++ "this might be related to the fact " ++ "you have upcasted embedding or layer norm layers in float32. " ++ "We will cast back the input in" ++ " %s.", target_dtype) + -+ prompt_ids, mm_missing_kwargs = self._apply_hf_processor_missing( -+ prompt_text=prompt_text, -+ mm_missing_data_items=mm_missing_data_items, -+ hf_processor_mm_kwargs=hf_processor_mm_kwargs, -+ ) ++ query_states = query_states.to(target_dtype) ++ key_states = key_states.to(target_dtype) ++ value_states = value_states.to(target_dtype) + -+ mm_missing_next_idx = { -+ modality: 0 -+ for modality in mm_missing_data_items -+ } ++ attn_output = self._flash_attention_forward(query_states, ++ key_states, ++ value_states, ++ attention_mask, ++ q_len, ++ dropout=dropout_rate) + -+ merged_kw_items = list[MultiModalKwargsItem]() -+ for modality, kw_items in mm_maybe_cached_kw_items.items(): -+ for idx, kw_item in enumerate(kw_items): -+ if kw_item is None: -+ kw_item = mm_missing_kwargs.get_item( -+ modality, -+ mm_missing_next_idx[modality], -+ ) ++ attn_output = attn_output.reshape(bsz, q_len, ++ self.embed_dim).contiguous() ++ attn_output = self.out_proj(attn_output) + -+ cache.put( -+ model_id, -+ modality, -+ mm_data_items[modality][idx], -+ hf_processor_mm_kwargs, -+ kw_item, -+ ) ++ if not output_attentions: ++ attn_weights = None + -+ mm_missing_next_idx[modality] += 1 ++ return attn_output, attn_weights + -+ merged_kw_items.append(kw_item) ++ def _flash_attention_forward(self, ++ query_states, ++ key_states, ++ value_states, ++ attention_mask, ++ query_length, ++ dropout=0.0, ++ softmax_scale=None): ++ causal = self.is_causal and query_length != 1 + -+ if self.enable_sanity_checks: -+ mm_missing_counts = mm_missing_data_items.get_all_counts() -+ assert all( -+ item_count == mm_missing_counts[modality] -+ for modality, item_count in mm_missing_next_idx.items()), dict( -+ mm_missing_next_idx=mm_missing_next_idx, -+ mm_missing_counts=mm_missing_counts) ++ # Contains at least one padding token in the sequence ++ if attention_mask is not None: ++ batch_size = query_states.shape[0] ++ (query_states, key_states, value_states, indices_q, cu_seq_lens, ++ 
max_seq_lens) = self._upad_input(query_states, key_states, ++ value_states, attention_mask, ++ query_length) + -+ mm_kwargs = MultiModalKwargs.from_items(merged_kw_items) ++ cu_seqlens_q, cu_seqlens_k = cu_seq_lens ++ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + -+ return prompt_ids, mm_kwargs ++ attn_output_unpad = flash_attn_varlen_func( ++ query_states, ++ key_states, ++ value_states, ++ cu_seqlens_q=cu_seqlens_q, ++ cu_seqlens_k=cu_seqlens_k, ++ max_seqlen_q=max_seqlen_in_batch_q, ++ max_seqlen_k=max_seqlen_in_batch_k, ++ dropout_p=dropout, ++ softmax_scale=softmax_scale, ++ causal=causal, ++ ) + -+ def _bind_and_group_repls( - self, - prompt_repls: list[PromptReplacement], -- ) -> list[_BoundPromptReplacement]: -+ ) -> dict[str, list[_BoundPromptReplacement]]: - tokenizer = self._get_tokenizer() - -- return [prompt_repl.bind(tokenizer) for prompt_repl in prompt_repls] -+ it = (prompt_repl.bind(tokenizer) for prompt_repl in prompt_repls) -+ return dict(full_groupby_modality(it)) ++ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, ++ query_length) ++ else: ++ attn_output = flash_attn_func(query_states, ++ key_states, ++ value_states, ++ dropout, ++ softmax_scale=softmax_scale, ++ causal=causal) + -+ def _always_apply_prompt_replacements(self) -> bool: -+ """ -+ A flag which can be overridden so that -+ :meth:`_apply_prompt_replacements` is always called even if we -+ detect that HF has performed processing via -+ :meth:`_find_placeholders_by_modality`. ++ return attn_output + -+ This is useful in cases where :meth:`_find_placeholders_by_modality` -+ cannot be reliably used to detect whether HF has performed processing. -+ """ -+ return False - - def _apply_prompt_replacements( - self, - token_ids: list[int], -- prompt_repls: Sequence[_BoundPromptReplacement], -+ mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]], - mm_item_counts: Mapping[str, int], -- ) -> tuple[list[int], str, list[_PlaceholderInfo]]: -+ ) -> tuple[list[int], str, Mapping[str, list[_PlaceholderInfo]]]: - tokenizer = self._get_tokenizer() - -- token_matches = find_token_matches(token_ids, prompt_repls) -+ mm_token_matches = { -+ modality: find_token_matches(token_ids, prompt_repls) -+ for modality, prompt_repls in mm_prompt_repls.items() -+ } -+ mm_match_counts = { -+ modality: len(matches) -+ for modality, matches in mm_token_matches.items() -+ } - - # If the search text does not represent a special token, - # it may have different token IDs in the prompt, because -@@ -742,40 +937,102 @@ class BaseMultiModalProcessor(ABC): - # of the search text in the prompt, we instead perform string - # replacement on the decoded token IDs, then encode them back. 
- if all( -- len(matches) >= mm_item_counts[modality] -- for modality, matches in full_groupby_modality(token_matches) -+ mm_match_counts.get(modality, 0) >= item_count -+ for modality, item_count in mm_item_counts.items() - ): # yapf: disable - token_ids = replace_token_matches( - token_ids, -- token_matches, -+ mm_token_matches, - mm_item_counts, - ) - -- text = _decode(tokenizer, token_ids) -- matched_repls = [match.prompt_repl for match in token_matches] -+ text = decode_tokens(tokenizer, token_ids) -+ matched_repls = { -+ modality: [match.prompt_repl for match in token_matches] -+ for modality, token_matches in mm_token_matches.items() -+ } - else: -- text = _decode(tokenizer, token_ids) -+ text = decode_tokens(tokenizer, token_ids) - -- text_matches = find_text_matches(text, prompt_repls) -+ mm_text_matches = { -+ modality: find_text_matches(text, prompt_repls) -+ for modality, prompt_repls in mm_prompt_repls.items() -+ } - text = replace_text_matches( - text, -- text_matches, -+ mm_text_matches, - mm_item_counts, - ) - -- token_ids = _encode(tokenizer, text) -- matched_repls = [match.prompt_repl for match in text_matches] -- -- placeholders = self._find_placeholders(matched_repls, token_ids, -- mm_item_counts) -+ token_ids = encode_tokens(tokenizer, -+ text, -+ add_special_tokens=False) -+ matched_repls = { -+ modality: [match.prompt_repl for match in token_matches] -+ for modality, token_matches in mm_text_matches.items() -+ } ++ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, ++ query_length): ++ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data( ++ attention_mask) ++ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + -+ placeholders = self._find_mm_placeholders( -+ matched_repls, -+ token_ids, -+ mm_item_counts, -+ ) - - return token_ids, text, placeholders - -+ def _validate_mm_kwargs( -+ self, -+ mm_kwargs: MultiModalKwargs, -+ mm_item_counts: Mapping[str, int], -+ ) -> None: -+ for modality, item_count in mm_item_counts.items(): -+ if modality in mm_kwargs.modalities: -+ items = mm_kwargs.get_items(modality) -+ else: -+ items = [] -+ -+ if len(items) != item_count: -+ raise RuntimeError( -+ f"Expected there to be {item_count} {modality} items in " -+ f"keyword arguments corresponding to {item_count} " -+ f"{modality} data items, but only found {len(items)}! " -+ "There is likely a problem with your " -+ "implementation of merged multi-modal processor for this " -+ "model (usually arising from an inconsistency between " -+ "`_call_hf_processor` and `_get_mm_fields_config`).") -+ -+ def _validate_mm_placeholders( -+ self, -+ mm_placeholders: Mapping[str, list[_PlaceholderInfo]], -+ mm_item_counts: Mapping[str, int], -+ *, -+ allow_missing: bool = False, -+ ) -> Mapping[str, int]: -+ missing_repl_counts = dict[str, int]() -+ -+ for modality, item_count in mm_item_counts.items(): -+ placeholders = mm_placeholders.get(modality, []) -+ -+ if len(placeholders) != item_count and not allow_missing: -+ raise RuntimeError( -+ f"Expected there to be {item_count} prompt replacements " -+ f"corresponding to {item_count} {modality} items, but only " -+ f"found {len(placeholders)} prompt replacements! 
Either " -+ "the prompt text has missing/incorrect tokens for " -+ "multi-modal inputs, or there is a problem with your " -+ "implementation of merged multi-modal processor for this " -+ "model (usually arising from an inconsistency between " -+ "`_call_hf_processor` and `_get_prompt_replacements`).") -+ -+ missing_repl_counts[modality] = item_count - len(placeholders) -+ -+ return missing_repl_counts ++ key_layer = index_first_axis( ++ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, ++ head_dim), indices_k) ++ value_layer = index_first_axis( ++ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, ++ head_dim), indices_k) ++ if query_length == kv_seq_len: ++ query_layer = index_first_axis( ++ query_layer.reshape(batch_size * kv_seq_len, self.num_heads, ++ head_dim), indices_k) ++ cu_seqlens_q = cu_seqlens_k ++ max_seqlen_in_batch_q = max_seqlen_in_batch_k ++ indices_q = indices_k ++ elif query_length == 1: ++ max_seqlen_in_batch_q = 1 ++ cu_seqlens_q = torch.arange( ++ batch_size + 1, dtype=torch.int32, device=query_layer.device ++ ) # There is a memcpy here, that is very bad. ++ indices_q = cu_seqlens_q[:-1] ++ query_layer = query_layer.squeeze(1) ++ else: ++ # The -q_len: slice assumes left padding. ++ attention_mask = attention_mask[:, -query_length:] ++ (query_layer, indices_q, cu_seqlens_q, ++ max_seqlen_in_batch_q) = unpad_input(query_layer, attention_mask) + - def apply( - self, - prompt_text: str, - mm_data: MultiModalDataDict, -- mm_processor_kwargs: Mapping[str, object], -+ hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: - """ - Process multi-modal inputs to be used in vLLM. -@@ -790,40 +1047,74 @@ class BaseMultiModalProcessor(ABC): - 3. Extract information about the placeholder tokens from the - processed token IDs. 
- """ -- mm_items = self._get_mm_items(mm_data) -+ mm_items = self._to_mm_items(mm_data) - -- hf_inputs = self._apply_hf_processor(prompt_text, mm_items, -- mm_processor_kwargs) -- prompt_ids, = hf_inputs.pop("input_ids").tolist() -- mm_kwargs = MultiModalKwargs(hf_inputs) -+ prompt_ids, mm_kwargs = self._cached_apply_hf_processor( -+ prompt_text, -+ mm_items, -+ hf_processor_mm_kwargs, ++ return ( ++ query_layer, ++ key_layer, ++ value_layer, ++ indices_q, ++ (cu_seqlens_q, cu_seqlens_k), ++ (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + -+ unbound_prompt_repls = self._get_prompt_replacements( -+ mm_items, -+ hf_processor_mm_kwargs, -+ mm_kwargs, -+ ) -+ mm_prompt_repls = self._bind_and_group_repls(unbound_prompt_repls) - -- prompt_repls = self._get_prompt_replacements(mm_items, hf_inputs, -- mm_processor_kwargs) -- all_prompt_repls = self._bind_prompt_replacements(prompt_repls) -+ mm_item_counts = mm_items.get_all_counts() -+ self._validate_mm_kwargs(mm_kwargs, mm_item_counts) -+ -+ hf_mm_placeholders = self._find_mm_placeholders( -+ mm_prompt_repls, -+ prompt_ids, -+ mm_item_counts, -+ ) + -+ if self._always_apply_prompt_replacements(): -+ mm_missing_repl_counts = mm_item_counts -+ mm_missing_repls = dict(mm_prompt_repls) -+ else: -+ mm_missing_repl_counts = self._validate_mm_placeholders( -+ hf_mm_placeholders, -+ mm_item_counts, -+ allow_missing=True, -+ ) ++# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip ++class SiglipMLP(nn.Module): + -+ mm_missing_repls = dict[str, list[_BoundPromptReplacement]]() -+ for modality, missing_repl_count in mm_missing_repl_counts.items(): -+ if missing_repl_count == 0: -+ mm_missing_repls[modality] = [] -+ elif missing_repl_count == mm_item_counts.get(modality, 0): -+ mm_missing_repls[modality] = mm_prompt_repls[modality] -+ else: -+ raise ValueError("Partial prompt replacement within " -+ f"{modality=} is not supported") - - # If HF processor already inserts placeholder tokens, - # there is no need for us to insert them -- mm_item_counts = mm_items.get_item_counts() -- all_placeholders = self._find_placeholders(all_prompt_repls, -- prompt_ids, mm_item_counts) -- -- if all_placeholders: -+ if all(len(repls) == 0 for repls in mm_missing_repls.items()): - tokenizer = self._get_tokenizer() -- prompt_text = _decode(tokenizer, prompt_ids) -+ prompt_text = decode_tokens(tokenizer, prompt_ids) -+ mm_placeholders = hf_mm_placeholders - else: - ( - prompt_ids, - prompt_text, -- all_placeholders, -+ missing_mm_placeholders, - ) = self._apply_prompt_replacements( - prompt_ids, -- all_prompt_repls, -- mm_item_counts, -+ mm_missing_repls, -+ mm_missing_repl_counts, - ) - -- mm_placeholders = { -- modality: [item.to_range() for item in items] -- for modality, items in full_groupby_modality(all_placeholders) -+ mm_placeholders = {**hf_mm_placeholders, **missing_mm_placeholders} ++ def __init__(self, config): ++ super().__init__() ++ self.config = config ++ self.activation_fn = ACT2FN[config.hidden_act] ++ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) ++ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + -+ self._validate_mm_placeholders(mm_placeholders, mm_item_counts) ++ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: ++ hidden_states = self.fc1(hidden_states) ++ hidden_states = self.activation_fn(hidden_states) ++ hidden_states = self.fc2(hidden_states) ++ return hidden_states + -+ mm_placeholder_ranges = { -+ modality: [item.to_range() for item in placeholders] -+ for 
modality, placeholders in mm_placeholders.items() - } - - return MultiModalInputsV2( -@@ -831,47 +1122,56 @@ class BaseMultiModalProcessor(ABC): - prompt=prompt_text, - prompt_token_ids=prompt_ids, - mm_kwargs=mm_kwargs, -- mm_placeholders=mm_placeholders, -+ mm_placeholders=mm_placeholder_ranges, - ) - -- @abstractmethod - def _get_dummy_mm_inputs( -- self, -- mm_counts: Mapping[str, int], -- ) -> ProcessorInputs: -- """ -- Build the multi-modal portion of the input which, after processing, -- results in `mm_max_tokens` in :meth:`get_dummy_data`. -- """ -- raise NotImplementedError -- -- def get_dummy_data( - self, - seq_len: int, - mm_counts: Mapping[str, int], -- mm_max_tokens: Mapping[str, int], -- ) -> DummyData: -+ ) -> MultiModalInputsV2: -+ profiling = self.profiling_info -+ processor_inputs = profiling.get_dummy_processor_inputs( -+ seq_len, mm_counts) -+ -+ return self.apply( -+ prompt_text=processor_inputs.prompt_text, -+ mm_data=processor_inputs.mm_data, -+ hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, -+ ) + -+ def get_dummy_data(self, seq_len: int) -> DummyData: - # Avoid circular import - from vllm.sequence import SequenceData - -- processor_inputs = self._get_dummy_mm_inputs(mm_counts) -- mm_inputs = self.apply(*processor_inputs) -- -+ profiling = self.profiling_info -+ mm_counts = profiling.get_mm_limits() -+ mm_max_tokens_per_item = profiling.get_mm_max_tokens_per_item(seq_len) -+ if mm_counts.keys() != mm_max_tokens_per_item.keys(): -+ raise AssertionError( -+ "The keys returned by `get_supported_mm_limits`" -+ f"({set(mm_counts.keys())}) should be the same as those " -+ "returned by `get_mm_max_tokens_per_item` " -+ f"({set(mm_max_tokens_per_item.keys())})") -+ -+ mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts) - prompt_token_ids = mm_inputs["prompt_token_ids"] - placeholders_by_modality = mm_inputs["mm_placeholders"] - -- total_placeholders_by_modality = dict[str, int]() -- for modality, placeholders in placeholders_by_modality.items(): -- num_placeholders = sum(item["length"] for item in placeholders) -- max_tokens = mm_max_tokens[modality] -- -- if num_placeholders != max_tokens: -- logger.warning( -- "The processed dummy data has a total of %d placeholder " -- "tokens for the '%s' modality, which is not the expected " -- "%d tokens.", num_placeholders, modality, max_tokens) -- -- total_placeholders_by_modality[modality] = num_placeholders -+ total_placeholders_by_modality = { -+ modality: sum(item["length"] for item in placeholders) -+ for modality, placeholders in placeholders_by_modality.items() -+ } -+ expected_placeholders_by_modality = { -+ modality: mm_max_tokens_per_item[modality] * mm_counts[modality] -+ for modality in placeholders_by_modality -+ } -+ if total_placeholders_by_modality != expected_placeholders_by_modality: -+ raise AssertionError( -+ f"The processed dummy data has a total of " -+ f"{total_placeholders_by_modality} placeholder tokens, which " -+ f"is not the expected {expected_placeholders_by_modality} " -+ "tokens.") - - total_len = len(prompt_token_ids) - if total_len > seq_len: -@@ -885,6 +1185,12 @@ class BaseMultiModalProcessor(ABC): - "reduce `max_num_seqs`, and/or reduce `mm_counts`.", seq_len, - total_len, total_placeholders_by_modality) - -+ return DummyData( -+ seq_data=SequenceData.from_prompt_token_counts((0, seq_len)), -+ multi_modal_data=None, -+ multi_modal_placeholders=None, -+ ) ++# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer ++# with CLIP->Siglip ++class 
SiglipEncoderLayer(nn.Module): + - prompt_token_ids.extend([0] * (seq_len - len(prompt_token_ids))) - - return DummyData( -diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py -new file mode 100644 -index 000000000..2ecf0db1a ---- /dev/null -+++ b/vllm/multimodal/profiling.py -@@ -0,0 +1,121 @@ -+from abc import ABC, abstractmethod -+from collections.abc import Mapping -+from dataclasses import dataclass, field -+from typing import Optional ++ def __init__(self, config: SiglipVisionConfig): ++ super().__init__() ++ self.embed_dim = config.hidden_size ++ self._use_flash_attention_2 = ( ++ config._attn_implementation == "flash_attention_2") ++ self.self_attn = (SiglipAttention(config) ++ if not self._use_flash_attention_2 else ++ SiglipFlashAttention2(config)) ++ self.layer_norm1 = nn.LayerNorm(self.embed_dim, ++ eps=config.layer_norm_eps) ++ self.mlp = SiglipMLP(config) ++ self.layer_norm2 = nn.LayerNorm(self.embed_dim, ++ eps=config.layer_norm_eps) + -+import numpy as np -+import numpy.typing as npt -+from PIL import Image ++ def forward( ++ self, ++ hidden_states: torch.Tensor, ++ attention_mask: torch.Tensor, ++ output_attentions: Optional[bool] = False, ++ ) -> Tuple[torch.FloatTensor]: ++ residual = hidden_states + -+from vllm.inputs import InputProcessingContext -+from vllm.logger import init_logger ++ hidden_states = self.layer_norm1(hidden_states) ++ hidden_states, attn_weights = self.self_attn( ++ hidden_states=hidden_states, ++ attention_mask=attention_mask, ++ output_attentions=output_attentions, ++ ) ++ hidden_states = residual + hidden_states ++ ++ residual = hidden_states ++ hidden_states = self.layer_norm2(hidden_states) ++ hidden_states = self.mlp(hidden_states) ++ hidden_states = residual + hidden_states + -+from .inputs import MultiModalDataDict ++ outputs = (hidden_states, ) + -+logger = init_logger(__name__) ++ if output_attentions: ++ outputs += (attn_weights, ) + ++ return outputs + -+@dataclass -+class ProcessorInputs: -+ """Keyword arguments to :meth:`BaseMultiModalProcessor`.""" -+ prompt_text: str -+ mm_data: MultiModalDataDict -+ hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict) + ++class SiglipPreTrainedModel(PreTrainedModel): ++ config_class = SiglipVisionConfig ++ base_model_prefix = "siglip" ++ supports_gradient_checkpointing = True + -+class BaseProfilingInfo(ABC): -+ """ -+ Abstract base class that provides the information necessary to profile -+ multi-modal models. 
-+ """ ++ def _init_weights(self, module): ++ """Initialize the weights""" + -+ def __init__(self, ctx: InputProcessingContext) -> None: -+ super().__init__() ++ if isinstance(module, SiglipVisionEmbeddings): ++ width = self.config.hidden_size ++ nn.init.normal_(module.position_embedding.weight, ++ std=1 / np.sqrt(width)) ++ elif isinstance(module, nn.Embedding): ++ default_flax_embed_init(module.weight) ++ elif isinstance(module, SiglipAttention): ++ nn.init.normal_(module.q_proj.weight) ++ nn.init.normal_(module.k_proj.weight) ++ nn.init.normal_(module.v_proj.weight) ++ nn.init.normal_(module.out_proj.weight) ++ nn.init.zeros_(module.q_proj.bias) ++ nn.init.zeros_(module.k_proj.bias) ++ nn.init.zeros_(module.v_proj.bias) ++ nn.init.zeros_(module.out_proj.bias) ++ elif isinstance(module, SiglipMLP): ++ nn.init.normal_(module.fc1.weight) ++ nn.init.normal_(module.fc2.weight) ++ nn.init.normal_(module.fc1.bias, std=1e-6) ++ nn.init.normal_(module.fc2.bias, std=1e-6) ++ elif isinstance(module, (nn.Linear, nn.Conv2d)): ++ lecun_normal_(module.weight) ++ if module.bias is not None: ++ nn.init.zeros_(module.bias) ++ elif isinstance(module, nn.LayerNorm): ++ module.bias.data.zero_() ++ module.weight.data.fill_(1.0) + -+ self.ctx = ctx + -+ @abstractmethod -+ def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: -+ """ -+ Return the maximum supported number of items for each modality. ++# Copied from transformers.models.clip.modeling_clip.CLIPEncoder ++# with CLIP->Siglip ++class SiglipEncoder(nn.Module): + -+ A value of `None` means unlimited number of items. ++ def __init__(self, config: SiglipVisionConfig): ++ super().__init__() ++ self.config = config ++ self.layers = nn.ModuleList([ ++ SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers) ++ ]) ++ self.gradient_checkpointing = False + -+ Omitting a modality from the returned dictionary means that -+ it is not supported at all. -+ """ -+ raise NotImplementedError ++ # Ignore copy ++ def forward( ++ self, ++ inputs_embeds, ++ attention_mask: Optional[torch.Tensor] = None, ++ output_attentions: Optional[bool] = None, ++ output_hidden_states: Optional[bool] = None, ++ return_dict: Optional[bool] = None, ++ ) -> Union[Tuple, BaseModelOutput]: ++ output_attentions = output_attentions if output_attentions is not None \ ++ else self.config.output_attentions ++ output_hidden_states = (output_hidden_states ++ if output_hidden_states is not None else ++ self.config.output_hidden_states) ++ return_dict = return_dict if return_dict is not None \ ++ else self.config.use_return_dict + -+ @abstractmethod -+ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: -+ """ -+ Get the maximum possible number of tokens per data item -+ for each modality. ++ encoder_states = () if output_hidden_states else None ++ all_attentions = () if output_attentions else None + -+ The dictionary returned by this method should have the same -+ keys as that returned by :meth:`get_supported_mm_limits`. 
-+ """ -+ raise NotImplementedError ++ hidden_states = inputs_embeds ++ for encoder_layer in self.layers: ++ if output_hidden_states: ++ encoder_states = encoder_states + (hidden_states, ) ++ if self.gradient_checkpointing and self.training: ++ layer_outputs = self._gradient_checkpointing_func( ++ encoder_layer.__call__, ++ hidden_states, ++ attention_mask, ++ output_attentions, ++ ) ++ else: ++ layer_outputs = encoder_layer( ++ hidden_states, ++ attention_mask, ++ output_attentions=output_attentions, ++ ) + -+ @abstractmethod -+ def get_dummy_processor_inputs( -+ self, -+ seq_len: int, -+ mm_counts: Mapping[str, int], -+ ) -> ProcessorInputs: -+ """ -+ Build the multi-modal portion of the input which, after processing, -+ results in `mm_max_tokens` in :meth:`get_mm_max_tokens_per_item`. -+ """ -+ raise NotImplementedError ++ hidden_states = layer_outputs[0] + -+ def _get_dummy_audios( -+ self, -+ *, -+ length: int, -+ num_audios: int, -+ ) -> list[npt.NDArray]: -+ audio = np.zeros((length, )) -+ return [audio] * num_audios -+ -+ def _get_dummy_images( -+ self, -+ *, -+ width: int, -+ height: int, -+ num_images: int, -+ ) -> list[Image.Image]: -+ image = Image.new("RGB", (width, height), color=0) -+ return [image] * num_images -+ -+ def _get_dummy_videos( -+ self, -+ *, -+ width: int, -+ height: int, -+ num_frames: int, -+ num_videos: int, -+ ) -> list[npt.NDArray]: -+ video = np.zeros((num_frames, width, height, 3)) -+ return [video] * num_videos -+ -+ def get_mm_limits(self) -> Mapping[str, int]: -+ mm_config = self.ctx.get_mm_config() -+ mm_limit_per_prompt = mm_config.limit_per_prompt -+ -+ supported_mm_limits = self.get_supported_mm_limits() -+ -+ mm_limits = { -+ modality: mm_limit_per_prompt.get(modality, 1) -+ for modality in supported_mm_limits -+ } ++ if output_attentions: ++ all_attentions = all_attentions + (layer_outputs[1], ) + -+ for modality, supported_limit in supported_mm_limits.items(): -+ limit = mm_limits[modality] -+ if supported_limit is not None and supported_limit < limit: -+ raise ValueError( -+ f"You set {modality}={limit} (or defaulted to 1) in " -+ f"`--limit-mm-per-prompt`, but this model only supports " -+ f"at most {supported_limit} {modality} items.") -+ -+ return mm_limits -diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py -index ded45a718..f75a594a4 100644 ---- a/vllm/multimodal/registry.py -+++ b/vllm/multimodal/registry.py -@@ -1,10 +1,9 @@ - import functools - from collections import UserDict --from typing import (TYPE_CHECKING, Any, Callable, Dict, Mapping, Optional, -+from typing import (TYPE_CHECKING, Any, Dict, Mapping, Optional, Protocol, - Sequence, Type, TypeVar) - - import torch.nn as nn --from typing_extensions import TypeAlias - - from vllm.inputs import InputProcessingContext - from vllm.logger import init_logger -@@ -15,7 +14,8 @@ from .audio import AudioPlugin - from .base import MultiModalInputMapper, MultiModalPlugin, MultiModalTokensCalc - from .image import ImagePlugin - from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors --from .processing import BaseMultiModalProcessor -+from .processing import BaseMultiModalProcessor, ProcessingCache -+from .utils import cached_get_tokenizer - from .video import VideoPlugin - - if TYPE_CHECKING: -@@ -23,15 +23,22 @@ if TYPE_CHECKING: - - logger = init_logger(__name__) - -+# TODO: Tune the MM cache size -+MM_CACHE_SIZE = 256 ++ if output_hidden_states: ++ encoder_states = encoder_states + (hidden_states, ) + - N = TypeVar("N", bound=Type[nn.Module]) - 
--MultiModalProcessorFactory: TypeAlias = Callable[[InputProcessingContext], -- BaseMultiModalProcessor] --""" --Constructs a :class:`MultiModalProcessor` instance from the context. - --The processing metadata should be derived from the context. --""" -+class MultiModalProcessorFactory(Protocol): -+ """Constructs a :class:`MultiModalProcessor` instance from the context.""" ++ if not return_dict: ++ return tuple( ++ v for v in [hidden_states, encoder_states, all_attentions] ++ if v is not None) ++ return BaseModelOutput(last_hidden_state=hidden_states, ++ hidden_states=encoder_states, ++ attentions=all_attentions) + -+ def __call__( -+ self, -+ ctx: InputProcessingContext, -+ *, -+ cache: Optional[ProcessingCache] = None, -+ ) -> BaseMultiModalProcessor: -+ ... - - - class _MultiModalLimits(UserDict["ModelConfig", Dict[str, int]]): -@@ -71,6 +78,8 @@ class MultiModalRegistry: - - self._limits_by_model = _MultiModalLimits() - -+ self._processing_cache = ProcessingCache(MM_CACHE_SIZE) + - def register_plugin(self, plugin: MultiModalPlugin) -> None: - """ - Register a multi-modal plugin so it can be recognized by vLLM. -@@ -211,6 +220,11 @@ class MultiModalRegistry: - Note: - This is currently directly used only in V1. - """ -+ if self.has_processor(model_config): -+ tokenizer = cached_get_tokenizer(model_config.tokenizer) -+ processor = self.create_processor(model_config, tokenizer) -+ seq_len = model_config.max_model_len -+ return processor.profiling_info.get_mm_max_tokens_per_item(seq_len) - - return { - key: plugin.get_max_multimodal_tokens(model_config) -@@ -328,15 +342,18 @@ class MultiModalRegistry: - - return wrapper - -- def has_processor(self, model_config: "ModelConfig") -> bool: -- """ -- Test whether a multi-modal processor is defined for a specific model. -- """ -+ def _get_model_cls(self, model_config: "ModelConfig"): - # Avoid circular import - from vllm.model_executor.model_loader import get_model_architecture - - model_cls, _ = get_model_architecture(model_config) -- return model_cls in self._processor_factories -+ return model_cls ++class SiglipVisionTransformer(SiglipPreTrainedModel): ++ config_class = SiglipVisionConfig ++ main_input_name = "pixel_values" ++ _supports_flash_attn_2 = True + -+ def has_processor(self, model_config: "ModelConfig") -> bool: -+ """ -+ Test whether a multi-modal processor is defined for a specific model. -+ """ -+ return self._get_model_cls(model_config) in self._processor_factories - - def create_processor( - self, -@@ -346,12 +363,11 @@ class MultiModalRegistry: - """ - Create a multi-modal processor for a specific model and tokenizer. 
- """ -- -- # Avoid circular import -- from vllm.model_executor.model_loader import get_model_architecture -- -- model_cls, _ = get_model_architecture(model_config) -+ model_cls = self._get_model_cls(model_config) - processor_factory = self._processor_factories[model_cls] - - ctx = InputProcessingContext(model_config, tokenizer) -- return processor_factory(ctx) -+ cache = (None if model_config.disable_mm_preprocessor_cache else -+ self._processing_cache) ++ def __init__(self, config: SiglipVisionConfig): ++ super().__init__(config) ++ self.config = config ++ embed_dim = config.hidden_size ++ ++ self.embeddings = SiglipVisionEmbeddings(config) ++ self.encoder = SiglipEncoder(config) ++ self.post_layernorm = nn.LayerNorm(embed_dim, ++ eps=config.layer_norm_eps) ++ self._use_flash_attention_2 = ( ++ config._attn_implementation == "flash_attention_2") + -+ return processor_factory(ctx, cache=cache) -diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py -index a49da2bde..589b1a447 100644 ---- a/vllm/multimodal/utils.py -+++ b/vllm/multimodal/utils.py -@@ -1,8 +1,7 @@ --import base64 --import os - from functools import lru_cache --from io import BytesIO --from typing import List, Optional, Tuple, TypeVar, Union -+from pathlib import Path -+from typing import Optional, TypeVar, Union -+from urllib.parse import ParseResult, urlparse - - import numpy as np - import numpy.typing as npt -@@ -10,283 +9,255 @@ import torch - from PIL import Image - - import vllm.envs as envs --from vllm.connections import global_http_connection -+from vllm.connections import HTTPConnection, global_http_connection - from vllm.logger import init_logger - from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer --from vllm.utils import PlaceholderModule - --from .inputs import MultiModalDataDict, PlaceholderRange -- --try: -- import decord --except ImportError: -- decord = PlaceholderModule("decord") # type: ignore[assignment] -- --try: -- import librosa --except ImportError: -- librosa = PlaceholderModule("librosa") # type: ignore[assignment] -- --try: -- import soundfile --except ImportError: -- soundfile = PlaceholderModule("soundfile") # type: ignore[assignment] -+from .audio import AudioMediaIO -+from .base import MediaIO -+from .image import ImageMediaIO -+from .inputs import PlaceholderRange -+from .video import VideoMediaIO - - logger = init_logger(__name__) - - cached_get_tokenizer = lru_cache(get_tokenizer) - -+_M = TypeVar("_M") - --def _load_image_from_bytes(b: bytes) -> Image.Image: -- image = Image.open(BytesIO(b)) -- image.load() -- return image - -+class MediaConnector: - --def _is_subpath(image_path: str, allowed_local_media_path: str) -> bool: -- # Get the common path -- common_path = os.path.commonpath([ -- os.path.abspath(image_path), -- os.path.abspath(allowed_local_media_path) -- ]) -- # Check if the common path is the same as allowed_local_media_path -- return common_path == os.path.abspath(allowed_local_media_path) -- -- --def _load_image_from_file(image_url: str, -- allowed_local_media_path: str) -> Image.Image: -- if not allowed_local_media_path: -- raise ValueError("Invalid 'image_url': Cannot load local files without" -- "'--allowed-local-media-path'.") -- if allowed_local_media_path: -- if not os.path.exists(allowed_local_media_path): -- raise ValueError( -- "Invalid '--allowed-local-media-path': " -- f"The path {allowed_local_media_path} does not exist.") -- if not os.path.isdir(allowed_local_media_path): -+ def __init__( ++ # Initialize weights and apply final 
processing ++ self.post_init() ++ ++ def get_input_embeddings(self) -> nn.Module: ++ return self.embeddings.patch_embedding ++ ++ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, ++ config_class=SiglipVisionConfig) ++ def forward( + self, -+ connection: HTTPConnection = global_http_connection, -+ *, -+ allowed_local_media_path: str = "", -+ ) -> None: -+ super().__init__() ++ pixel_values, ++ patch_attention_mask: Optional[torch.BoolTensor] = None, ++ tgt_sizes: Optional[torch.IntTensor] = None, ++ output_attentions: Optional[bool] = None, ++ output_hidden_states: Optional[bool] = None, ++ return_dict: Optional[bool] = None, ++ ) -> Union[Tuple, BaseModelOutputWithPooling]: ++ r""" ++ Returns: ++ """ ++ output_attentions = output_attentions if output_attentions is not None \ ++ else self.config.output_attentions ++ output_hidden_states = (output_hidden_states ++ if output_hidden_states is not None else ++ self.config.output_hidden_states) ++ return_dict = return_dict if return_dict is not None \ ++ else self.config.use_return_dict + -+ self.connection = connection ++ batch_size = pixel_values.size(0) ++ if patch_attention_mask is None: ++ patch_attention_mask = torch.ones( ++ size=( ++ batch_size, ++ pixel_values.size(2) // self.config.patch_size, ++ pixel_values.size(3) // self.config.patch_size, ++ ), ++ dtype=torch.bool, ++ device=pixel_values.device, ++ ) + -+ if allowed_local_media_path: -+ allowed_local_media_path_ = Path(allowed_local_media_path) ++ hidden_states = self.embeddings( ++ pixel_values=pixel_values, ++ patch_attention_mask=patch_attention_mask, ++ tgt_sizes=tgt_sizes) + -+ if not allowed_local_media_path_.exists(): -+ raise ValueError( -+ "Invalid `--allowed-local-media-path`: The path " -+ f"{allowed_local_media_path_} does not exist.") -+ if not allowed_local_media_path_.is_dir(): -+ raise ValueError( -+ "Invalid `--allowed-local-media-path`: The path " -+ f"{allowed_local_media_path_} must be a directory.") ++ patch_attention_mask = patch_attention_mask.view(batch_size, -1) ++ # The call to `_upad_input` in `_flash_attention_forward` is expensive ++ # So when the `patch_attention_mask` is full of 1s ++ # (i.e. attending to the whole sequence), ++ # avoiding passing the attention_mask, ++ # which is equivalent to attending to the full sequence ++ if not torch.any(~patch_attention_mask): ++ attention_mask = None + else: -+ allowed_local_media_path_ = None -+ -+ self.allowed_local_media_path = allowed_local_media_path_ ++ attention_mask = (_prepare_4d_attention_mask( ++ patch_attention_mask, hidden_states.dtype) ++ if not self._use_flash_attention_2 else ++ patch_attention_mask) + -+ def _load_data_url( -+ self, -+ url_spec: ParseResult, -+ media_io: MediaIO[_M], -+ ) -> _M: -+ data_spec, data = url_spec.path.split(",", 1) -+ media_type, data_type = data_spec.split(";", 1) ++ encoder_outputs = self.encoder( ++ inputs_embeds=hidden_states, ++ attention_mask=attention_mask, ++ output_attentions=output_attentions, ++ output_hidden_states=output_hidden_states, ++ return_dict=return_dict, ++ ) + -+ if data_type != "base64": -+ msg = "Only base64 data URLs are supported for now." 
-+ raise NotImplementedError(msg) ++ last_hidden_state = encoder_outputs[0] ++ last_hidden_state = self.post_layernorm(last_hidden_state) + -+ return media_io.load_base64(media_type, data) ++ if not return_dict: ++ return (last_hidden_state, None) + encoder_outputs[1:] + -+ def _load_file_url( -+ self, -+ url_spec: ParseResult, -+ media_io: MediaIO[_M], -+ ) -> _M: -+ allowed_local_media_path = self.allowed_local_media_path -+ if allowed_local_media_path is None: -+ raise RuntimeError("Cannot load local files without " -+ "`--allowed-local-media-path`.") -+ -+ filepath = Path(url_spec.path) -+ if allowed_local_media_path not in filepath.resolve().parents: - raise ValueError( -- "Invalid '--allowed-local-media-path': " -- f"The path {allowed_local_media_path} must be a directory.") -- -- # Only split once and assume the second part is the image path -- _, image_path = image_url.split("file://", 1) -- if not _is_subpath(image_path, allowed_local_media_path): -- raise ValueError( -- f"Invalid 'image_url': The file path {image_path} must" -- " be a subpath of '--allowed-local-media-path'" -- f" '{allowed_local_media_path}'.") -- -- image = Image.open(image_path) -- image.load() -- return image -- -- --def _load_image_from_data_url(image_url: str) -> Image.Image: -- # Only split once and assume the second part is the base64 encoded image -- _, image_base64 = image_url.split(",", 1) -- return load_image_from_base64(image_base64) -- -- --def fetch_image(image_url: str, -- *, -- image_mode: str = "RGB", -- allowed_local_media_path: str = "") -> Image.Image: -- """ -- Load a PIL image from a HTTP or base64 data URL. -- -- By default, the image is converted into RGB format. -- """ -- if image_url.startswith('http'): -- image_raw = global_http_connection.get_bytes( -- image_url, -- timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT, -- ) -- image = _load_image_from_bytes(image_raw) -- -- elif image_url.startswith('data:image'): -- image = _load_image_from_data_url(image_url) -- elif image_url.startswith('file://'): -- image = _load_image_from_file(image_url, allowed_local_media_path) -- else: -- raise ValueError("Invalid 'image_url': A valid 'image_url' must start " -- "with either 'data:image', 'file://' or 'http'.") -+ f"The file path {filepath} must be a subpath " -+ f"of `--allowed-local-media-path` {allowed_local_media_path}.") ++ return BaseModelOutputWithPooling( ++ last_hidden_state=last_hidden_state, ++ pooler_output=None, ++ hidden_states=encoder_outputs.hidden_states, ++ attentions=encoder_outputs.attentions, ++ ) +diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py +index c4d02e5dd..2831a5a12 100644 +--- a/vllm/model_executor/models/qwen2.py ++++ b/vllm/model_executor/models/qwen2.py +@@ -263,7 +263,11 @@ class Qwen2DecoderLayer(nn.Module): + }) + class Qwen2Model(nn.Module): + +- def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ++ def __init__(self, ++ *, ++ vllm_config: VllmConfig, ++ prefix: str = "", ++ decoder_layer_type: type[nn.Module] = Qwen2DecoderLayer): + super().__init__() -- return image.convert(image_mode) -+ return media_io.load_file(filepath) + config = vllm_config.model_config.hf_config +@@ -297,12 +301,14 @@ class Qwen2Model(nn.Module): + else: + self.embed_tokens = PPMissingLayer() -+ def load_from_url( -+ self, -+ url: str, -+ media_io: MediaIO[_M], -+ *, -+ fetch_timeout: Optional[int] = None, -+ ) -> _M: -+ url_spec = urlparse(url) ++ # Use the provided decoder layer type or default to Qwen2DecoderLayer ++ decoder_layer_type = 
decoder_layer_type or Qwen2DecoderLayer + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, +- lambda prefix: Qwen2DecoderLayer(config=config, +- cache_config=cache_config, +- quant_config=quant_config, +- prefix=prefix), ++ lambda prefix: decoder_layer_type(config=config, ++ cache_config=cache_config, ++ quant_config=quant_config, ++ prefix=prefix), + prefix=f"{prefix}.layers", + ) + +diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py +index 1e6ff1fec..e2480326a 100644 +--- a/vllm/model_executor/models/qwen2_5_vl.py ++++ b/vllm/model_executor/models/qwen2_5_vl.py +@@ -304,6 +304,10 @@ class Qwen2_5_VisionAttention(nn.Module): + elif self.attn_backend == _Backend.TORCH_SDPA: + # Execute attention entry by entry for speed & less VRAM. + outputs = [] ++ head_dim = q.shape[-1] ++ import math ++ import xe_addons ++ scale = 1 / math.sqrt(head_dim) + for i in range(1, len(cu_seqlens)): + start_idx = cu_seqlens[i - 1] + end_idx = cu_seqlens[i] +@@ -312,10 +316,16 @@ class Qwen2_5_VisionAttention(nn.Module): + v_i = v[:, start_idx:end_idx] + q_i, k_i, v_i = (rearrange(x, "b s h d -> b h s d") + for x in [q_i, k_i, v_i]) +- output_i = F.scaled_dot_product_attention(q_i, +- k_i, +- v_i, +- dropout_p=0.0) ++ # output_i = F.scaled_dot_product_attention(q_i, ++ # k_i, ++ # v_i, ++ # dropout_p=0.0) ++ output_i = xe_addons.sdp_non_causal( ++ q_i.contiguous(), ++ k_i.contiguous(), ++ v_i.contiguous(), ++ None, ++ scale) + output_i = rearrange(output_i, "b h s d -> b s h d ") + outputs.append(output_i) + context_layer = torch.cat(outputs, dim=1) +diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py +index a7800d415..26af87512 100644 +--- a/vllm/model_executor/models/qwen2_vl.py ++++ b/vllm/model_executor/models/qwen2_vl.py +@@ -273,12 +273,37 @@ class Qwen2VisionAttention(nn.Module): + prefix=f"{prefix}.proj") + + # Detect attention implementation. +- self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) +- if self.attn_backend not in { +- _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS +- }: +- raise RuntimeError( +- f"Qwen2-VL does not support {self.attn_backend} backend now.") ++ # selected_backend: Optional[_Backend] = get_global_forced_attn_backend() ++ # if selected_backend is None: ++ # backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND ++ # if backend_by_env_var is not None: ++ # selected_backend = backend_name_to_enum(backend_by_env_var) ++ # if selected_backend is None: ++ # # For Volta and Turing GPUs, use xformers instead. ++ # device_available = current_platform.get_device_capability()[0] >= 8 ++ # if device_available: ++ # from transformers.utils import is_flash_attn_2_available ++ ++ # if is_flash_attn_2_available(): ++ # self._use_flash_attn = True ++ # else: ++ # logger.warning( ++ # "Current Qwen2-VL implementation has a bug with " ++ # "`vllm-flash-attn` inside vision module, so we use " ++ # "xformers backend instead. You can run `pip install " ++ # "flash-attn to use flash-attention backend.") ++ # self._use_flash_attn = False ++ # else: ++ # self._use_flash_attn = False ++ # else: ++ # if selected_backend == _Backend.FLASH_ATTN: ++ # self._use_flash_attn = True ++ # elif selected_backend == _Backend.XFORMERS: ++ # self._use_flash_attn = False ++ # else: ++ # raise RuntimeError( ++ # f"Qwen2-VL does not support {selected_backend} backend now." 
++ # ) --async def async_fetch_image(image_url: str, -- *, -- image_mode: str = "RGB", -- allowed_local_media_path: str = "") -> Image.Image: -- """ -- Asynchronously load a PIL image from a HTTP or base64 data URL. + def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: + # [s, b, 3 * head * head_dim] +@@ -311,8 +336,9 @@ class Qwen2VisionAttention(nn.Module): + max_seqlen: Optional[int] = None, # Only used for Flash Attention + seqlens: Optional[list[int]] = None, # Only used for xFormers + ) -> torch.Tensor: - -- By default, the image is converted into RGB format. -- """ -- if image_url.startswith('http'): -- image_raw = await global_http_connection.async_get_bytes( -- image_url, -- timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT, -- ) -- image = _load_image_from_bytes(image_raw) +- # [s, b, c] --> [s, b, 3 * head * head_dim] ++ # TODO(xiangyu): Check logic here. It differs from current v0.8.1 version. ++ seq_length = x.shape[0] ++ # [s, b, c] --> [s, b, head * 3 * head_dim] + x, _ = self.qkv(x) + + # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim] +@@ -324,59 +350,61 @@ class Qwen2VisionAttention(nn.Module): + if rotary_pos_emb is not None: + q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) + k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) - -- elif image_url.startswith('data:image'): -- image = _load_image_from_data_url(image_url) -- elif image_url.startswith('file://'): -- image = _load_image_from_file(image_url, allowed_local_media_path) -- else: -- raise ValueError("Invalid 'image_url': A valid 'image_url' must start " -- "with either 'data:image', 'file://' or 'http'.") +- if self.attn_backend == _Backend.FLASH_ATTN: +- # from vllm_flash_attn.flash_attn_interface import ( +- # flash_attn_varlen_func) +- from flash_attn import flash_attn_varlen_func - -- return image.convert(image_mode) +- q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) - +- output = flash_attn_varlen_func(q, +- k, +- v, +- cu_seqlens_q=cu_seqlens, +- cu_seqlens_k=cu_seqlens, +- max_seqlen_q=max_seqlen, +- max_seqlen_k=max_seqlen, +- dropout_p=0, +- causal=False) - --def _load_video_from_bytes(b: bytes, num_frames: int = 32) -> npt.NDArray: -- video_path = BytesIO(b) -- vr = decord.VideoReader(video_path, num_threads=1) -- total_frame_num = len(vr) +- context_layer = rearrange(output, +- "(b s) ... -> b s ...", +- b=batch_size) +- elif self.attn_backend == _Backend.TORCH_SDPA: +- # Execute attention entry by entry for speed & less VRAM. 
+- outputs = [] +- for i in range(1, len(cu_seqlens)): +- start_idx = cu_seqlens[i - 1] +- end_idx = cu_seqlens[i] +- q_i = q[:, start_idx:end_idx] +- k_i = k[:, start_idx:end_idx] +- v_i = v[:, start_idx:end_idx] +- q_i, k_i, v_i = (rearrange(x, "b s h d -> b h s d") +- for x in [q_i, k_i, v_i]) +- output_i = F.scaled_dot_product_attention(q_i, +- k_i, +- v_i, +- dropout_p=0.0) +- output_i = rearrange(output_i, "b h s d -> b s h d ") +- outputs.append(output_i) +- context_layer = torch.cat(outputs, dim=1) +- elif self.attn_backend == _Backend.XFORMERS: +- from xformers import ops as xops +- from xformers.ops.fmha.attn_bias import BlockDiagonalMask - -- if total_frame_num > num_frames: -- uniform_sampled_frames = np.linspace(0, -- total_frame_num - 1, -- num_frames, -- dtype=int) -- frame_idx = uniform_sampled_frames.tolist() -- else: -- frame_idx = [i for i in range(0, total_frame_num)] -- frames = vr.get_batch(frame_idx).asnumpy() +- attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens, +- kv_seqlen=None, +- device=q.device) - -- return frames +- context_layer = xops.memory_efficient_attention_forward( +- q, k, v, attn_bias=attn_bias, p=0, scale=None) +- context_layer = rearrange(context_layer, +- "b s h d -> s b (h d)").contiguous() - -+ if url_spec.scheme.startswith("http"): -+ connection = self.connection -+ data = connection.get_bytes(url, timeout=fetch_timeout) - --def _load_video_from_data_url(video_url: str) -> npt.NDArray: -- # Only split once and assume the second part is the base64 encoded video -- _, video_base64 = video_url.split(",", 1) -+ return media_io.load_bytes(data) +- output, _ = self.proj(context_layer) ++ query = q.movedim(1, 2) ++ key = k.movedim(1, 2) ++ value = v.movedim(1, 2) ++ head_dim = query.shape[-1] ++ # if len(cu_seqlens) == 2 and cu_seqlens.tolist() == [0, seq_length]: ++ # attention_mask = None ++ # else: ++ # attention_mask = torch.full( ++ # [1, seq_length, seq_length], torch.finfo(q.dtype).min, device=q.device, dtype=q.dtype ++ # ) ++ # for i in range(1, len(cu_seqlens)): ++ # attention_mask[..., cu_seqlens[i - 1]:cu_seqlens[i], ++ # cu_seqlens[i - 1]:cu_seqlens[i]] = 0 ++ #from ipex_llm.transformers.models.common import attention_softmax ++ from ipex_llm.transformers.models.utils import use_sdp_non_causal ++ import math ++ seq_lens = [] ++ for i in range(1, len(cu_seqlens)): ++ seq_lens.append(cu_seqlens[i]-cu_seqlens[i-1]) ++ att_masks = [None] * len(seq_lens) ++ ++ num_tokens = q.shape[0] * q.shape[1] ++ attn_output = torch.empty( ++ (num_tokens, self.num_attention_heads_per_partition, self.hidden_size_per_attention_head), ++ dtype=query.dtype, device=query.device) ++ start = 0 ++ for seq_len, mask in zip(seq_lens, ++ att_masks): ++ end = start + seq_len ++ if use_sdp_non_causal(head_dim, q.device, q.dtype): ++ import xe_addons ++ scale = 1 / math.sqrt(head_dim) ++ if mask is not None: ++ mask = mask.unsqueeze(0) ++ sub_out = xe_addons.sdp_non_causal( ++ query[:, :, start:end, :].contiguous(), ++ key[:, :, start:end, :].contiguous(), ++ value[:, :, start:end, :].contiguous(), ++ mask, ++ scale).squeeze(0).movedim(0, 1) ++ else: ++ sub_out = torch.nn.functional.scaled_dot_product_attention( ++ query[:, :, start:end, :], ++ key[:, :, start:end, :], ++ value[:, :, start:end, :], ++ attn_mask=mask, ++ dropout_p=0.0, ++ is_causal=False, ++ scale= self.hidden_size_per_attention_head**-0.5).squeeze(0).movedim( ++ 0, 1) ++ attn_output[start:end, :, :] = sub_out ++ start = end ++ output = attn_output.reshape(-1, batch_size, 
self.hidden_size_per_attention_head * self.num_attention_heads_per_partition) ++ ++ output, _ = self.proj(output) + return output -- if video_url.startswith("data:video/jpeg;"): -- return np.stack([ -- np.array(load_image_from_base64(frame_base64)) -- for frame_base64 in video_base64.split(",") -- ]) -+ if url_spec.scheme == "data": -+ return self._load_data_url(url_spec, media_io) -- return load_video_from_base64(video_base64) -+ if url_spec.scheme == "file": -+ return self._load_file_url(url_spec, media_io) +@@ -633,9 +661,7 @@ class Qwen2VisionTransformer(nn.Module): + grid_thw: torch.Tensor, + ) -> torch.Tensor: + # patchify +- x = x.to(device=self.device, dtype=self.dtype) + x = self.patch_embed(x) +- + # compute position embedding + rotary_pos_emb = self.rot_pos_emb(grid_thw) -+ msg = "The URL must be either a HTTP, data or file URL." -+ raise ValueError(msg) +@@ -1231,7 +1257,8 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, + if image_input["type"] == "image_embeds": + image_embeds = image_input["image_embeds"].type(self.visual.dtype) + else: +- pixel_values = image_input["pixel_values"].type(self.visual.dtype) ++ # pixel_values = image_input["pixel_values"].type(self.visual.dtype) ++ pixel_values = image_input["pixel_values"].to(torch.float16) + image_embeds = self.visual(pixel_values, grid_thw=grid_thw) --def fetch_video(video_url: str, *, num_frames: int = 32) -> npt.NDArray: -- """ -- Load video from a HTTP or base64 data URL. -- """ -- if video_url.startswith('http') or video_url.startswith('https'): -- video_raw = global_http_connection.get_bytes( -- video_url, -- timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT, -- ) -- video = _load_video_from_bytes(video_raw, num_frames) -- elif video_url.startswith('data:video'): -- video = _load_video_from_data_url(video_url) -- else: -- raise ValueError("Invalid 'video_url': A valid 'video_url' must start " -- "with either 'data:video' or 'http'.") -- return video -- -- --async def async_fetch_video(video_url: str, -- *, -- num_frames: int = 32) -> npt.NDArray: -- """ -- Asynchronously load video from a HTTP or base64 data URL. -- -- By default, the image is converted into RGB format. -- """ -- if video_url.startswith('http') or video_url.startswith('https'): -- video_raw = await global_http_connection.async_get_bytes( -- video_url, -- timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT, -- ) -- video = _load_video_from_bytes(video_raw, num_frames) -- elif video_url.startswith('data:video'): -- video = _load_video_from_data_url(video_url) -- else: -- raise ValueError("Invalid 'video_url': A valid 'video_url' must start " -- "with either 'data:video' or 'http'.") -- return video -- -- --def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]: -- """ -- Load audio from a URL. -- """ -- if audio_url.startswith("http"): -- audio_bytes = global_http_connection.get_bytes( -+ async def load_from_url_async( -+ self, -+ url: str, -+ media_io: MediaIO[_M], -+ *, -+ fetch_timeout: Optional[int] = None, -+ ) -> _M: -+ url_spec = urlparse(url) -+ if url_spec.scheme.startswith("http"): -+ try: -+ import requests -+ image = Image.open(requests.get(url, stream=True).raw) -+ return image -+ except: -+ connection = self.connection -+ data = await connection.async_get_bytes(url, timeout=fetch_timeout) + # Split concatenated embeddings for each image item. 
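
The replacement attention path added in the Qwen2VisionAttention hunk above splits the packed vision sequence at cu_seqlens boundaries and runs non-causal SDPA on each chunk separately, so no block-diagonal mask has to be materialized. Below is a minimal, self-contained sketch of that pattern in plain PyTorch; torch.nn.functional.scaled_dot_product_attention stands in for the XPU-specific xe_addons.sdp_non_causal kernel, and the function name, tensor shapes and the example values are illustrative assumptions, not taken from the patch.

    import torch
    import torch.nn.functional as F

    def chunked_non_causal_sdpa(q, k, v, cu_seqlens):
        # q, k, v: [num_heads, total_tokens, head_dim] for one packed batch
        # cu_seqlens: cumulative sequence lengths, e.g. tensor([0, 64, 160, 256])
        num_heads, total_tokens, head_dim = q.shape
        out = torch.empty(total_tokens, num_heads, head_dim,
                          dtype=q.dtype, device=q.device)
        for start, end in zip(cu_seqlens[:-1].tolist(), cu_seqlens[1:].tolist()):
            # Each image/video grid attends only within itself; is_causal=False
            # because vision tokens are not autoregressive.
            sub = F.scaled_dot_product_attention(
                q[:, start:end].unsqueeze(0),
                k[:, start:end].unsqueeze(0),
                v[:, start:end].unsqueeze(0),
                is_causal=False,
            )  # [1, num_heads, seq_len, head_dim]
            # -> [seq_len, num_heads, head_dim], written into the packed output
            out[start:end] = sub.squeeze(0).movedim(0, 1)
        return out.reshape(total_tokens, num_heads * head_dim)

    # Usage example: three variable-length vision sequences packed together.
    cu = torch.tensor([0, 4, 10, 16])
    q = torch.randn(8, 16, 32)
    print(chunked_non_causal_sdpa(q, q, q, cu).shape)  # torch.Size([16, 256])
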
+diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py +new file mode 100644 +index 000000000..9c14038e6 +--- /dev/null ++++ b/vllm/model_executor/models/qwen3.py +@@ -0,0 +1,329 @@ ++# SPDX-License-Identifier: Apache-2.0 + -+ return media_io.load_bytes(data) ++# Copyright 2024 The Qwen team. ++# Copyright 2023 The vLLM team. ++# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. ++# ++# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX ++# and OPT implementations in this library. It has been modified from its ++# original forms to accommodate minor architectural differences compared ++# to GPT-NeoX and OPT used by the Meta AI team that trained the model. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++"""Inference-only Qwen3 model compatible with HuggingFace weights.""" ++from typing import Iterable, Optional, Set, Tuple, Union + -+ if url_spec.scheme == "data": -+ return self._load_data_url(url_spec, media_io) ++import torch ++from torch import nn ++from transformers import Qwen3Config + -+ if url_spec.scheme == "file": -+ return self._load_file_url(url_spec, media_io) -+ -+ import os -+ if url_spec.scheme == "" and os.path.exists(url): -+ image = Image.open(url).convert('RGB') -+ return image ++from vllm.attention import Attention, AttentionType ++from vllm.compilation.decorators import support_torch_compile ++from vllm.config import CacheConfig, VllmConfig ++from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size ++from vllm.logger import init_logger ++from vllm.model_executor.layers.layernorm import RMSNorm ++from vllm.model_executor.layers.linear import (QKVParallelLinear, ++ RowParallelLinear) ++from vllm.model_executor.layers.logits_processor import LogitsProcessor ++from vllm.model_executor.layers.quantization import QuantizationConfig ++from vllm.model_executor.layers.rotary_embedding import get_rope ++from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler ++from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead ++from vllm.model_executor.sampling_metadata import SamplingMetadata ++from vllm.sequence import IntermediateTensors + -+ msg = "The URL must be either a HTTP, data, file URL or a exist path." -+ raise ValueError(msg) ++from .interfaces import SupportsLoRA, SupportsPP ++from .qwen2 import Qwen2MLP as Qwen3MLP ++from .qwen2 import Qwen2Model ++from .utils import AutoWeightsLoader, PPMissingLayer, maybe_prefix + -+ def fetch_audio( -+ self, -+ audio_url: str, -+ ) -> tuple[np.ndarray, Union[int, float]]: -+ """ -+ Load audio from a URL. 
-+ """ -+ audio_io = AudioMediaIO() ++logger = init_logger(__name__) + -+ return self.load_from_url( - audio_url, -- timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT, -+ audio_io, -+ fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT, - ) -- elif audio_url.startswith("data:audio"): -- _, audio_base64 = audio_url.split(",", 1) -- audio_bytes = base64.b64decode(audio_base64) -- else: -- raise ValueError("Invalid 'audio_url': A valid 'audio_url' must start " -- "with either 'data:audio' or 'http'.") -- -- return librosa.load(BytesIO(audio_bytes), sr=None) - -+ async def fetch_audio_async( -+ self, -+ audio_url: str, -+ ) -> tuple[np.ndarray, Union[int, float]]: -+ """ -+ Asynchronously fetch audio from a URL. -+ """ -+ audio_io = AudioMediaIO() - --async def async_fetch_audio( -- audio_url: str) -> Tuple[np.ndarray, Union[int, float]]: -- """ -- Asynchronously fetch audio from a URL. -- """ -- if audio_url.startswith("http"): -- audio_bytes = await global_http_connection.async_get_bytes( -+ return await self.load_from_url_async( - audio_url, -- timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT, -+ audio_io, -+ fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT, - ) -- elif audio_url.startswith("data:audio"): -- _, audio_base64 = audio_url.split(",", 1) -- audio_bytes = base64.b64decode(audio_base64) -- else: -- raise ValueError("Invalid 'audio_url': A valid 'audio_url' must start " -- "with either 'data:audio' or 'http'.") -- -- return librosa.load(BytesIO(audio_bytes), sr=None) - -+ def fetch_image( -+ self, -+ image_url: str, -+ *, -+ image_mode: str = "RGB", -+ ) -> Image.Image: -+ """ -+ Load a PIL image from a HTTP or base64 data URL. - --def get_and_parse_audio(audio_url: str) -> MultiModalDataDict: -- audio, sr = fetch_audio(audio_url) -- return {"audio": (audio, sr)} -+ By default, the image is converted into RGB format. -+ """ -+ image_io = ImageMediaIO(image_mode=image_mode) - -+ return self.load_from_url( -+ image_url, -+ image_io, -+ fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT, -+ ) - --def get_and_parse_image( -+ async def fetch_image_async( -+ self, - image_url: str, - *, -- allowed_local_media_path: str = "") -> MultiModalDataDict: -- image = fetch_image(image_url, -- allowed_local_media_path=allowed_local_media_path) -- return {"image": image} -- -- --def get_and_parse_video(video_url: str) -> MultiModalDataDict: -- video = fetch_video(video_url) -- return {"video": video} -+ image_mode: str = "RGB", -+ ) -> Image.Image: -+ """ -+ Asynchronously load a PIL image from a HTTP or base64 data URL. - -+ By default, the image is converted into RGB format. -+ """ -+ image_io = ImageMediaIO(image_mode=image_mode) - --async def async_get_and_parse_audio(audio_url: str) -> MultiModalDataDict: -- audio, sr = await async_fetch_audio(audio_url) -- return {"audio": (audio, sr)} -+ return await self.load_from_url_async( -+ image_url, -+ image_io, -+ fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT, -+ ) - -+ def fetch_video( -+ self, -+ video_url: str, -+ *, -+ image_mode: str = "RGB", -+ num_frames: int = 32, -+ ) -> npt.NDArray: -+ """ -+ Load video from a HTTP or base64 data URL. 
-+ """ -+ image_io = ImageMediaIO(image_mode=image_mode) -+ video_io = VideoMediaIO(image_io, num_frames=num_frames) + -+ return self.load_from_url( -+ video_url, -+ video_io, -+ fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT, -+ ) - --async def async_get_and_parse_image( -- image_url: str, -+ async def fetch_video_async( -+ self, -+ video_url: str, - *, -- allowed_local_media_path: str = "") -> MultiModalDataDict: -- image = await async_fetch_image( -- image_url, allowed_local_media_path=allowed_local_media_path) -- return {"image": image} -+ image_mode: str = "RGB", -+ num_frames: int = 32, -+ ) -> npt.NDArray: -+ """ -+ Asynchronously load video from a HTTP or base64 data URL. ++class Qwen3Attention(nn.Module): + -+ By default, the image is converted into RGB format. -+ """ -+ image_io = ImageMediaIO(image_mode=image_mode) -+ video_io = VideoMediaIO(image_io, num_frames=num_frames) ++ def __init__(self, ++ hidden_size: int, ++ num_heads: int, ++ num_kv_heads: int, ++ max_position: int = 4096 * 32, ++ head_dim: Optional[int] = None, ++ rms_norm_eps: float = 1e-06, ++ qkv_bias: bool = False, ++ rope_theta: float = 10000, ++ cache_config: Optional[CacheConfig] = None, ++ quant_config: Optional[QuantizationConfig] = None, ++ rope_scaling: Optional[Tuple] = None, ++ prefix: str = "", ++ attn_type: str = AttentionType.DECODER) -> None: ++ super().__init__() ++ self.hidden_size = hidden_size ++ tp_size = get_tensor_model_parallel_world_size() ++ self.total_num_heads = num_heads ++ assert self.total_num_heads % tp_size == 0 ++ self.num_heads = self.total_num_heads // tp_size ++ self.total_num_kv_heads = num_kv_heads ++ if self.total_num_kv_heads >= tp_size: ++ # Number of KV heads is greater than TP size, so we partition ++ # the KV heads across multiple tensor parallel GPUs. ++ assert self.total_num_kv_heads % tp_size == 0 ++ else: ++ # Number of KV heads is less than TP size, so we replicate ++ # the KV heads across multiple tensor parallel GPUs. 
++ assert tp_size % self.total_num_kv_heads == 0 ++ self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) ++ self.head_dim = head_dim or hidden_size // self.total_num_heads ++ self.q_size = self.num_heads * self.head_dim ++ self.kv_size = self.num_kv_heads * self.head_dim ++ self.scaling = self.head_dim**-0.5 ++ self.rope_theta = rope_theta + -+ return await self.load_from_url_async( -+ video_url, -+ video_io, -+ fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT, ++ self.qkv_proj = QKVParallelLinear( ++ hidden_size, ++ self.head_dim, ++ self.total_num_heads, ++ self.total_num_kv_heads, ++ bias=qkv_bias, ++ quant_config=quant_config, ++ prefix=f"{prefix}.qkv_proj", ++ ) ++ self.o_proj = RowParallelLinear( ++ self.total_num_heads * self.head_dim, ++ hidden_size, ++ bias=False, ++ quant_config=quant_config, ++ prefix=f"{prefix}.o_proj", + ) - - --async def async_get_and_parse_video(video_url: str) -> MultiModalDataDict: -- video = await async_fetch_video(video_url) -- return {"video": video} -+global_media_connector = MediaConnector() -+"""The global :class:`MediaConnector` instance used by vLLM.""" -+ -+fetch_audio = global_media_connector.fetch_audio -+fetch_image = global_media_connector.fetch_image -+fetch_video = global_media_connector.fetch_video - - - def encode_audio_base64( -@@ -294,10 +265,8 @@ def encode_audio_base64( - sampling_rate: int, - ) -> str: - """Encode audio as base64.""" -- buffered = BytesIO() -- soundfile.write(buffered, audio, sampling_rate, format="WAV") -- -- return base64.b64encode(buffered.getvalue()).decode('utf-8') -+ audio_io = AudioMediaIO() -+ return audio_io.encode_base64((audio, sampling_rate)) - - - def encode_image_base64( -@@ -311,29 +280,14 @@ def encode_image_base64( - - By default, the image is converted into RGB format before being encoded. 
- """ -- buffered = BytesIO() -- image = image.convert(image_mode) -- image.save(buffered, format) -- return base64.b64encode(buffered.getvalue()).decode('utf-8') -- -- --def load_image_from_base64(image: Union[bytes, str]) -> Image.Image: -- """Load image from base64 format.""" -- return _load_image_from_bytes(base64.b64decode(image)) -+ image_io = ImageMediaIO(image_mode=image_mode) -+ return image_io.encode_base64(image, image_format=format) - - - def encode_video_base64(frames: npt.NDArray) -> str: -- base64_frames = [] -- frames_list = [frames[i] for i in range(frames.shape[0])] -- for frame in frames_list: -- img_base64 = encode_image_base64(Image.fromarray(frame)) -- base64_frames.append(img_base64) -- return ",".join(base64_frames) -- -- --def load_video_from_base64(video: Union[bytes, str]) -> npt.NDArray: -- """Load video from base64 format.""" -- return _load_video_from_bytes(base64.b64decode(video)) -+ image_io = ImageMediaIO() -+ video_io = VideoMediaIO(image_io) -+ return video_io.encode_base64(frames) - - - def resolve_visual_encoder_outputs( -@@ -389,7 +343,7 @@ def repeat_and_pad_token( - repeat_count: int = 1, - pad_token_left: Optional[_T] = None, - pad_token_right: Optional[_T] = None, --) -> List[_T]: -+) -> list[_T]: - replacement = [token] * repeat_count - if pad_token_left is not None: - replacement = [pad_token_left] + replacement -@@ -402,13 +356,13 @@ def repeat_and_pad_token( - def repeat_and_pad_placeholder_tokens( - tokenizer: AnyTokenizer, - prompt: Optional[str], -- prompt_token_ids: List[int], -+ prompt_token_ids: list[int], - *, - placeholder_token_id: int, -- repeat_count: Union[int, List[int]], -+ repeat_count: Union[int, list[int]], - pad_token_left: Optional[int] = None, - pad_token_right: Optional[int] = None, --) -> Tuple[Optional[str], List[int], List[PlaceholderRange]]: -+) -> tuple[Optional[str], list[int], list[PlaceholderRange]]: - if isinstance(repeat_count, int): - repeat_count = [repeat_count] - -@@ -450,20 +404,24 @@ def repeat_and_pad_placeholder_tokens( - new_prompt += prompt_parts[i] + replacement_str - new_prompt += prompt_parts[-1] - -- new_token_ids: List[int] = [] -- placeholder_ranges: List[PlaceholderRange] = [] -+ new_token_ids = list[int]() -+ placeholder_ranges = list[PlaceholderRange]() - placeholder_token_idx = 0 - for i, token in enumerate(prompt_token_ids): - if token == placeholder_token_id: -+ curr_repeat_count = repeat_count[placeholder_token_idx] - replacement_ids = repeat_and_pad_token( - placeholder_token_id, -- repeat_count=repeat_count[placeholder_token_idx], -+ repeat_count=curr_repeat_count, - pad_token_left=pad_token_left, - pad_token_right=pad_token_right, - ) -+ offset = len(new_token_ids) -+ if pad_token_left is not None: -+ offset += 1 - placeholder_ranges.append({ -- "offset": len(new_token_ids), -- "length": len(replacement_ids) -+ "offset": offset, -+ "length": curr_repeat_count, - }) - new_token_ids.extend(replacement_ids) - placeholder_token_idx += 1 -@@ -481,10 +439,10 @@ def repeat_and_pad_placeholder_tokens( - def consecutive_placeholder_ranges( - num_items: int, - item_size: int, -- initial_offset: int = 0) -> List[PlaceholderRange]: -+ initial_offset: int = 0) -> list[PlaceholderRange]: - """Returns a list of consecutive PlaceholderRanges of a fixed size""" - - return [ - PlaceholderRange(offset=initial_offset + i * item_size, - length=item_size) for i in range(num_items) -- ] -+ ] -\ No newline at end of file -diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py -index 
c4be10056..1ad1f5abc 100644 ---- a/vllm/multimodal/video.py -+++ b/vllm/multimodal/video.py -@@ -1,23 +1,32 @@ --from functools import lru_cache -+import base64 -+from functools import lru_cache, partial -+from io import BytesIO -+from pathlib import Path - from typing import TYPE_CHECKING, Any, Dict, Optional - - import cv2 - import numpy as np - import numpy.typing as npt -+from PIL import Image - - from vllm.inputs.registry import InputContext - from vllm.logger import init_logger - from vllm.transformers_utils.processor import get_video_processor - from vllm.transformers_utils.tokenizer import get_tokenizer --from vllm.utils import is_list_of -+from vllm.utils import PlaceholderModule, is_list_of - --from .base import MultiModalData --from .image import ImagePlugin -+from .base import MediaIO, ModalityData -+from .image import ImageMediaIO, ImagePlugin - from .inputs import MultiModalKwargs, VideoItem - - if TYPE_CHECKING: - from vllm.config import ModelConfig - -+try: -+ import decord -+except ImportError: -+ decord = PlaceholderModule("decord") # type: ignore[assignment] + - logger = init_logger(__name__) - - cached_get_video_processor = lru_cache(get_video_processor) -@@ -45,7 +54,7 @@ class VideoPlugin(ImagePlugin): - def _default_input_mapper( - self, - ctx: InputContext, -- data: MultiModalData[VideoItem], -+ data: ModalityData[VideoItem], - **mm_processor_kwargs, - ) -> MultiModalKwargs: - model_config = ctx.model_config -@@ -107,3 +116,73 @@ def sample_frames_from_video(frames: npt.NDArray, - frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int) - sampled_frames = frames[frame_indices, ...] - return sampled_frames ++ self.rotary_emb = get_rope( ++ self.head_dim, ++ rotary_dim=self.head_dim, ++ max_position=max_position, ++ base=self.rope_theta, ++ rope_scaling=rope_scaling, ++ ) ++ self.attn = Attention(self.num_heads, ++ self.head_dim, ++ self.scaling, ++ num_kv_heads=self.num_kv_heads, ++ cache_config=cache_config, ++ quant_config=quant_config, ++ prefix=f"{prefix}.attn", ++ attn_type=attn_type) ++ self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) ++ self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) ++ ++ def forward( ++ self, ++ positions: torch.Tensor, ++ hidden_states: torch.Tensor, ++ ) -> torch.Tensor: ++ qkv, _ = self.qkv_proj(hidden_states) ++ q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) ++ # Add qk-norm ++ q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, ++ self.head_dim) ++ q_by_head = self.q_norm.forward_native(q_by_head) ++ q = q_by_head.view(q.shape) ++ k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, ++ self.head_dim) ++ k_by_head = self.k_norm.forward_native(k_by_head) ++ k = k_by_head.view(k.shape) ++ q, k = self.rotary_emb(positions, q, k) ++ attn_output = self.attn(q, k, v) ++ output, _ = self.o_proj(attn_output) ++ return output + + -+class VideoMediaIO(MediaIO[npt.NDArray]): ++class Qwen3DecoderLayer(nn.Module): + + def __init__( + self, -+ image_io: ImageMediaIO, -+ *, -+ num_frames: int = 32, ++ config: Qwen3Config, ++ cache_config: Optional[CacheConfig] = None, ++ quant_config: Optional[QuantizationConfig] = None, ++ prefix: str = "", + ) -> None: + super().__init__() -+ -+ self.image_io = image_io -+ self.num_frames = num_frames -+ -+ def load_bytes(self, data: bytes) -> npt.NDArray: -+ vr = decord.VideoReader(BytesIO(data), num_threads=1) -+ total_frame_num = len(vr) -+ -+ num_frames = self.num_frames -+ if total_frame_num > num_frames: -+ uniform_sampled_frames = 
np.linspace(0, -+ total_frame_num - 1, -+ num_frames, -+ dtype=int) -+ frame_idx = uniform_sampled_frames.tolist() ++ self.hidden_size = config.hidden_size ++ # Requires transformers > 4.32.0 ++ rope_theta = getattr(config, "rope_theta", 1000000) ++ rope_scaling = getattr(config, "rope_scaling", None) ++ ++ # By default, Qwen3 uses causal attention as it is a decoder-only model. ++ # You can override the HF config with `is_causal=False` to enable ++ # bidirectional attention, which is used in some embedding models ++ # (e.g. Alibaba-NLP/gte-Qwen3-7B-instruct) ++ if getattr(config, "is_causal", True): ++ attn_type = AttentionType.DECODER + else: -+ frame_idx = list(range(0, total_frame_num)) -+ -+ return vr.get_batch(frame_idx).asnumpy() ++ attn_type = AttentionType.ENCODER_ONLY ++ ++ self.self_attn = Qwen3Attention( ++ hidden_size=self.hidden_size, ++ num_heads=config.num_attention_heads, ++ max_position=config.max_position_embeddings, ++ num_kv_heads=config.num_key_value_heads, ++ rope_theta=rope_theta, ++ rms_norm_eps=config.rms_norm_eps, ++ qkv_bias=getattr(config, 'attention_bias', False), ++ head_dim=getattr(config, 'head_dim', None), ++ cache_config=cache_config, ++ quant_config=quant_config, ++ rope_scaling=rope_scaling, ++ prefix=f"{prefix}.self_attn", ++ attn_type=attn_type, ++ ) ++ self.mlp = Qwen3MLP( ++ hidden_size=self.hidden_size, ++ intermediate_size=config.intermediate_size, ++ hidden_act=config.hidden_act, ++ quant_config=quant_config, ++ prefix=f"{prefix}.mlp", ++ ) ++ self.input_layernorm = RMSNorm(config.hidden_size, ++ eps=config.rms_norm_eps) ++ self.post_attention_layernorm = RMSNorm(config.hidden_size, ++ eps=config.rms_norm_eps) + -+ def load_base64(self, media_type: str, data: str) -> npt.NDArray: -+ if media_type.lower() == "video/jpeg": -+ load_frame = partial( -+ self.image_io.load_base64, -+ "image/jpeg", -+ ) ++ def forward( ++ self, ++ positions: torch.Tensor, ++ hidden_states: torch.Tensor, ++ residual: Optional[torch.Tensor], ++ ) -> Tuple[torch.Tensor, torch.Tensor]: ++ # Self Attention ++ if residual is None: ++ residual = hidden_states ++ hidden_states = self.input_layernorm(hidden_states) ++ else: ++ hidden_states, residual = self.input_layernorm( ++ hidden_states, residual) ++ hidden_states = self.self_attn( ++ positions=positions, ++ hidden_states=hidden_states, ++ ) + -+ return np.stack([ -+ np.array(load_frame(frame_data)) -+ for frame_data in data.split(",") -+ ]) ++ # Fully Connected ++ hidden_states, residual = self.post_attention_layernorm( ++ hidden_states, residual) ++ hidden_states = self.mlp(hidden_states) ++ return hidden_states, residual + -+ return self.load_bytes(base64.b64decode(data)) + -+ def load_file(self, filepath: Path) -> npt.NDArray: -+ with filepath.open("rb") as f: -+ data = f.read() ++ALL_DECODER_LAYER_TYPES = { ++ "attention": Qwen3DecoderLayer, ++} + -+ return self.load_bytes(data) + -+ def encode_base64( -+ self, -+ media: npt.NDArray, -+ *, -+ video_format: str = "JPEG", -+ ) -> str: -+ video = media -+ -+ if video_format == "JPEG": -+ encode_frame = partial( -+ self.image_io.encode_base64, -+ image_format=video_format, -+ ) ++@support_torch_compile( ++ dynamic_arg_dims={ ++ "input_ids": 0, ++ # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl, ++ # otherwise (seq_len, ). 
++ "positions": -1, ++ "intermediate_tensors": 0, ++ "inputs_embeds": 0, ++ }) ++class Qwen3Model(Qwen2Model): + -+ return ",".join( -+ encode_frame(Image.fromarray(frame)) for frame in video) ++ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ++ super().__init__(vllm_config=vllm_config, ++ prefix=prefix, ++ decoder_layer_type=Qwen3DecoderLayer) + -+ msg = "Only JPEG format is supported for now." -+ raise NotImplementedError(msg) -diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py -index 419237c25..c37654a9d 100644 ---- a/vllm/platforms/__init__.py -+++ b/vllm/platforms/__init__.py -@@ -1,123 +1,224 @@ -+import logging -+import traceback -+from itertools import chain -+from typing import TYPE_CHECKING, Optional + -+from vllm.plugins import load_plugins_by_group -+from vllm.utils import resolve_obj_by_qualname ++class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ++ packed_modules_mapping = { ++ "qkv_proj": [ ++ "q_proj", ++ "k_proj", ++ "v_proj", ++ ], ++ "gate_up_proj": [ ++ "gate_proj", ++ "up_proj", ++ ], ++ } + - from .interface import _Backend # noqa: F401 --from .interface import CpuArchEnum, Platform, PlatformEnum, UnspecifiedPlatform -+from .interface import CpuArchEnum, Platform, PlatformEnum - --current_platform: Platform -+logger = logging.getLogger(__name__) - --# NOTE: we don't use `torch.version.cuda` / `torch.version.hip` because --# they only indicate the build configuration, not the runtime environment. --# For example, people can install a cuda build of pytorch but run on tpu. - --is_tpu = False --try: -- # While it's technically possible to install libtpu on a non-TPU machine, -- # this is a very uncommon scenario. Therefore, we assume that libtpu is -- # installed if and only if the machine has TPUs. -- import libtpu # noqa: F401 -- is_tpu = True --except Exception: -- pass -+def tpu_platform_plugin() -> Optional[str]: -+ is_tpu = False -+ try: -+ # While it's technically possible to install libtpu on a -+ # non-TPU machine, this is a very uncommon scenario. Therefore, -+ # we assume that libtpu is installed if and only if the machine -+ # has TPUs. -+ import libtpu # noqa: F401 -+ is_tpu = True -+ except Exception: -+ pass ++ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ++ super().__init__() ++ config = vllm_config.model_config.hf_config ++ quant_config = vllm_config.quant_config ++ lora_config = vllm_config.lora_config + -+ return "vllm.platforms.tpu.TpuPlatform" if is_tpu else None - --is_cuda = False - --try: -- import pynvml -- pynvml.nvmlInit() -+def cuda_platform_plugin() -> Optional[str]: -+ is_cuda = False -+ - try: -- if pynvml.nvmlDeviceGetCount() > 0: -+ import pynvml -+ pynvml.nvmlInit() -+ try: -+ if pynvml.nvmlDeviceGetCount() > 0: -+ is_cuda = True -+ finally: -+ pynvml.nvmlShutdown() -+ except Exception: -+ # CUDA is supported on Jetson, but NVML may not be. -+ import os ++ self.config = config ++ self.lora_config = lora_config + -+ def cuda_is_jetson() -> bool: -+ return os.path.isfile("/etc/nv_tegra_release") \ -+ or os.path.exists("/sys/class/tegra-firmware") -+ -+ if cuda_is_jetson(): - is_cuda = True -- finally: -- pynvml.nvmlShutdown() --except Exception: -- # CUDA is supported on Jetson, but NVML may not be. 
-- import os - -- def cuda_is_jetson() -> bool: -- return os.path.isfile("/etc/nv_tegra_release") \ -- or os.path.exists("/sys/class/tegra-firmware") -+ return "vllm.platforms.cuda.CudaPlatform" if is_cuda else None ++ self.quant_config = quant_config ++ self.model = Qwen3Model(vllm_config=vllm_config, ++ prefix=maybe_prefix(prefix, "model")) + ++ if get_pp_group().is_last_rank: ++ if config.tie_word_embeddings: ++ self.lm_head = self.model.embed_tokens ++ else: ++ self.lm_head = ParallelLMHead(config.vocab_size, ++ config.hidden_size, ++ quant_config=quant_config, ++ prefix=maybe_prefix( ++ prefix, "lm_head")) ++ else: ++ self.lm_head = PPMissingLayer() + -+def rocm_platform_plugin() -> Optional[str]: -+ is_rocm = False ++ self.logits_processor = LogitsProcessor(config.vocab_size) ++ self.sampler = get_sampler() + -+ try: -+ import amdsmi -+ amdsmi.amdsmi_init() -+ try: -+ if len(amdsmi.amdsmi_get_processor_handles()) > 0: -+ is_rocm = True -+ finally: -+ amdsmi.amdsmi_shut_down() -+ except Exception: -+ pass ++ self.make_empty_intermediate_tensors = ( ++ self.model.make_empty_intermediate_tensors) + -+ return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None ++ def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: ++ return self.model.get_input_embeddings(input_ids) + ++ def forward( ++ self, ++ input_ids: torch.Tensor, ++ positions: torch.Tensor, ++ intermediate_tensors: Optional[IntermediateTensors] = None, ++ inputs_embeds: Optional[torch.Tensor] = None, ++ ) -> Union[torch.Tensor, IntermediateTensors]: ++ hidden_states = self.model(input_ids, positions, intermediate_tensors, ++ inputs_embeds) ++ return hidden_states + -+def hpu_platform_plugin() -> Optional[str]: -+ is_hpu = False -+ try: -+ from importlib import util -+ is_hpu = util.find_spec('habana_frameworks') is not None -+ except Exception: -+ pass ++ def compute_logits( ++ self, ++ hidden_states: torch.Tensor, ++ sampling_metadata: SamplingMetadata, ++ ) -> Optional[torch.Tensor]: ++ logits = self.logits_processor(self.lm_head, hidden_states, ++ sampling_metadata) ++ return logits + -+ return "vllm.platforms.hpu.HpuPlatform" if is_hpu else None ++ def sample( ++ self, ++ logits: torch.Tensor, ++ sampling_metadata: SamplingMetadata, ++ ) -> Optional[SamplerOutput]: ++ next_tokens = self.sampler(logits, sampling_metadata) ++ return next_tokens + ++ def load_weights(self, weights: Iterable[Tuple[str, ++ torch.Tensor]]) -> Set[str]: ++ loader = AutoWeightsLoader( ++ self, ++ skip_prefixes=(["lm_head."] ++ if self.config.tie_word_embeddings else None), ++ ) ++ return loader.load_weights(weights) +diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py +new file mode 100644 +index 000000000..390bb7adf +--- /dev/null ++++ b/vllm/model_executor/models/qwen3_moe.py +@@ -0,0 +1,531 @@ ++# SPDX-License-Identifier: Apache-2.0 + -+def xpu_platform_plugin() -> Optional[str]: -+ # TODO(gc): we will see if we can get another method to get over this... -+ is_xpu = True ++# Copyright 2024 The Qwen team. ++# Copyright 2023 The vLLM team. ++# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. ++# ++# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX ++# and OPT implementations in this library. It has been modified from its ++# original forms to accommodate minor architectural differences compared ++# to GPT-NeoX and OPT used by the Meta AI team that trained the model. 
++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++"""Inference-only Qwen3MoE model compatible with HuggingFace weights.""" ++from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union + -+ try: -+ # installed IPEX if the machine has XPUs. -+ import intel_extension_for_pytorch # noqa: F401 -+ import oneccl_bindings_for_pytorch # noqa: F401 -+ import torch -+ if hasattr(torch, 'xpu') and torch.xpu.is_available(): -+ is_xpu = True -+ except Exception: -+ pass ++import torch ++from torch import nn ++from transformers import PretrainedConfig + -+ return "vllm.platforms.xpu.XPUPlatform" if is_xpu else None ++from vllm.attention import Attention ++from vllm.compilation.decorators import support_torch_compile ++from vllm.config import CacheConfig, VllmConfig ++from vllm.distributed import (get_pp_group, ++ get_tensor_model_parallel_world_size, ++ tensor_model_parallel_all_reduce) ++from vllm.logger import init_logger ++from vllm.model_executor.layers.activation import SiluAndMul ++from vllm.model_executor.layers.fused_moe import FusedMoE ++from vllm.model_executor.layers.layernorm import RMSNorm ++from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, ++ QKVParallelLinear, ++ ReplicatedLinear, ++ RowParallelLinear) ++from vllm.model_executor.layers.logits_processor import LogitsProcessor ++from vllm.model_executor.layers.quantization import QuantizationConfig ++from vllm.model_executor.layers.rotary_embedding import get_rope ++from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler ++from vllm.model_executor.layers.vocab_parallel_embedding import ( ++ ParallelLMHead, VocabParallelEmbedding) ++from vllm.model_executor.model_loader.weight_utils import default_weight_loader ++from vllm.model_executor.sampling_metadata import SamplingMetadata ++from vllm.sequence import IntermediateTensors + ++from .interfaces import SupportsPP ++from .utils import (extract_layer_index, is_pp_missing_parameter, ++ make_empty_intermediate_tensors_factory, make_layers, ++ maybe_prefix) + -+def cpu_platform_plugin() -> Optional[str]: -+ is_cpu = False -+ try: -+ from importlib.metadata import version -+ is_cpu = "cpu" in version("vllm") -+ except Exception: -+ pass ++logger = init_logger(__name__) + -+ return "vllm.platforms.cpu.CpuPlatform" if is_cpu else None + ++class Qwen3MoeMLP(nn.Module): + -+def neuron_platform_plugin() -> Optional[str]: -+ is_neuron = False -+ try: -+ import transformers_neuronx # noqa: F401 -+ is_neuron = True -+ except ImportError: -+ pass - -- if cuda_is_jetson(): -- is_cuda = True -+ return "vllm.platforms.neuron.NeuronPlatform" if is_neuron else None - --is_rocm = False - --try: -- import amdsmi -- amdsmi.amdsmi_init() -+def openvino_platform_plugin() -> Optional[str]: -+ is_openvino = False - try: -- if len(amdsmi.amdsmi_get_processor_handles()) > 0: -- is_rocm = True -- finally: -- amdsmi.amdsmi_shut_down() --except Exception: -- pass -- --is_hpu = False --try: -- from importlib import util -- is_hpu = 
util.find_spec('habana_frameworks') is not None --except Exception: -- pass -- --is_xpu = False -- --try: -- # installed IPEX if the machine has XPUs. -- import intel_extension_for_pytorch # noqa: F401 -- import oneccl_bindings_for_pytorch # noqa: F401 -- import torch -- if hasattr(torch, 'xpu') and torch.xpu.is_available(): -- is_xpu = True --except Exception: -- pass -- --is_cpu = False --try: -- from importlib.metadata import version -- is_cpu = "cpu" in version("vllm") --except Exception: -- pass -- --is_neuron = False --try: -- import transformers_neuronx # noqa: F401 -- is_neuron = True --except ImportError: -- pass -- --is_openvino = False --try: -- from importlib.metadata import version -- is_openvino = "openvino" in version("vllm") --except Exception: -- pass -- --if is_tpu: -- # people might install pytorch built with cuda but run on tpu -- # so we need to check tpu first -- from .tpu import TpuPlatform -- current_platform = TpuPlatform() --elif is_cuda: -- from .cuda import CudaPlatform -- current_platform = CudaPlatform() --elif is_rocm: -- from .rocm import RocmPlatform -- current_platform = RocmPlatform() --elif is_hpu: -- from .hpu import HpuPlatform -- current_platform = HpuPlatform() --elif is_xpu: -- from .xpu import XPUPlatform -- current_platform = XPUPlatform() --elif is_cpu: -- from .cpu import CpuPlatform -- current_platform = CpuPlatform() --elif is_neuron: -- from .neuron import NeuronPlatform -- current_platform = NeuronPlatform() --elif is_openvino: -- from .openvino import OpenVinoPlatform -- current_platform = OpenVinoPlatform() --else: -- current_platform = UnspecifiedPlatform() -- --__all__ = ['Platform', 'PlatformEnum', 'current_platform', 'CpuArchEnum'] -+ from importlib.metadata import version -+ is_openvino = "openvino" in version("vllm") -+ except Exception: -+ pass ++ def __init__( ++ self, ++ hidden_size: int, ++ intermediate_size: int, ++ hidden_act: str, ++ quant_config: Optional[QuantizationConfig] = None, ++ reduce_results: bool = True, ++ prefix: str = "", ++ ) -> None: ++ super().__init__() ++ self.gate_up_proj = MergedColumnParallelLinear( ++ hidden_size, [intermediate_size] * 2, ++ bias=False, ++ quant_config=quant_config, ++ prefix=f"{prefix}.gate_up_proj") ++ self.down_proj = RowParallelLinear(intermediate_size, ++ hidden_size, ++ bias=False, ++ quant_config=quant_config, ++ reduce_results=reduce_results, ++ prefix=f"{prefix}.down_proj") ++ if hidden_act != "silu": ++ raise ValueError(f"Unsupported activation: {hidden_act}. 
" ++ "Only silu is supported for now.") ++ self.act_fn = SiluAndMul() ++ ++ def forward(self, x): ++ gate_up, _ = self.gate_up_proj(x) ++ x = self.act_fn(gate_up) ++ x, _ = self.down_proj(x) ++ return x + -+ return "vllm.platforms.openvino.OpenVinoPlatform" if is_openvino else None + ++class Qwen3MoeSparseMoeBlock(nn.Module): + -+builtin_platform_plugins = { -+ 'tpu': tpu_platform_plugin, -+ 'cuda': cuda_platform_plugin, -+ 'rocm': rocm_platform_plugin, -+ 'hpu': hpu_platform_plugin, -+ 'xpu': xpu_platform_plugin, -+ 'cpu': cpu_platform_plugin, -+ 'neuron': neuron_platform_plugin, -+ 'openvino': openvino_platform_plugin, -+} ++ def __init__( ++ self, ++ config: PretrainedConfig, ++ quant_config: Optional[QuantizationConfig] = None, ++ prefix: str = "", ++ ): ++ super().__init__() ++ self.tp_size = get_tensor_model_parallel_world_size() + ++ if self.tp_size > config.num_experts: ++ raise ValueError( ++ f"Tensor parallel size {self.tp_size} is greater than " ++ f"the number of experts {config.num_experts}.") ++ ++ self.experts = FusedMoE(num_experts=config.num_experts, ++ top_k=config.num_experts_per_tok, ++ hidden_size=config.hidden_size, ++ intermediate_size=config.moe_intermediate_size, ++ reduce_results=False, ++ renormalize=config.norm_topk_prob, ++ quant_config=quant_config, ++ prefix=f"{prefix}.experts") ++ ++ self.gate = ReplicatedLinear(config.hidden_size, ++ config.num_experts, ++ bias=False, ++ quant_config=None, ++ prefix=f"{prefix}.gate") + -+def resolve_current_platform_cls_qualname() -> str: -+ platform_plugins = load_plugins_by_group('vllm.platform_plugins') ++ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: ++ # NOTE: hidden_states can have either 1D or 2D shape. ++ orig_shape = hidden_states.shape ++ hidden_dim = hidden_states.shape[-1] ++ hidden_states = hidden_states.view(-1, hidden_dim) ++ ++ # router_logits: (num_tokens, n_experts) ++ router_logits, _ = self.gate(hidden_states) ++ final_hidden_states = self.experts(hidden_states=hidden_states, ++ router_logits=router_logits) ++ final_hidden_states = final_hidden_states ++ if self.tp_size > 1: ++ final_hidden_states = tensor_model_parallel_all_reduce( ++ final_hidden_states) + -+ activated_plugins = [] ++ return final_hidden_states.view(orig_shape) + -+ for name, func in chain(builtin_platform_plugins.items(), -+ platform_plugins.items()): -+ try: -+ assert callable(func) -+ platform_cls_qualname = func() -+ if platform_cls_qualname is not None: -+ activated_plugins.append(name) -+ except Exception: -+ pass -+ -+ activated_builtin_plugins = list( -+ set(activated_plugins) & set(builtin_platform_plugins.keys())) -+ activated_oot_plugins = list( -+ set(activated_plugins) & set(platform_plugins.keys())) -+ -+ if len(activated_oot_plugins) >= 2: -+ raise RuntimeError( -+ "Only one platform plugin can be activated, but got: " -+ f"{activated_oot_plugins}") -+ elif len(activated_oot_plugins) == 1: -+ platform_cls_qualname = platform_plugins[activated_oot_plugins[0]]() -+ logger.info("Platform plugin %s is activated", -+ activated_oot_plugins[0]) -+ elif len(activated_builtin_plugins) >= 2: -+ raise RuntimeError( -+ "Only one platform plugin can be activated, but got: " -+ f"{activated_builtin_plugins}") -+ elif len(activated_builtin_plugins) == 1: -+ platform_cls_qualname = builtin_platform_plugins[ -+ activated_builtin_plugins[0]]() -+ logger.info("Automatically detected platform %s.", -+ activated_builtin_plugins[0]) -+ else: -+ platform_cls_qualname = "vllm.interface.UnspecifiedPlatform" -+ logger.info( -+ 
"No platform detected, vLLM is running on UnspecifiedPlatform") -+ return platform_cls_qualname + ++class Qwen3MoeAttention(nn.Module): + -+_current_platform = None -+_init_trace: str = '' ++ def __init__( ++ self, ++ hidden_size: int, ++ num_heads: int, ++ num_kv_heads: int, ++ rope_theta: float = 10000, ++ rope_scaling: Optional[Dict[str, Any]] = None, ++ max_position_embeddings: int = 8192, ++ head_dim: Optional[int] = None, ++ rms_norm_eps: float = 1e-06, ++ qkv_bias: bool = False, ++ cache_config: Optional[CacheConfig] = None, ++ quant_config: Optional[QuantizationConfig] = None, ++ prefix: str = "", ++ ) -> None: ++ super().__init__() ++ self.hidden_size = hidden_size ++ tp_size = get_tensor_model_parallel_world_size() ++ self.total_num_heads = num_heads ++ assert self.total_num_heads % tp_size == 0 ++ self.num_heads = self.total_num_heads // tp_size ++ self.total_num_kv_heads = num_kv_heads ++ if self.total_num_kv_heads >= tp_size: ++ # Number of KV heads is greater than TP size, so we partition ++ # the KV heads across multiple tensor parallel GPUs. ++ assert self.total_num_kv_heads % tp_size == 0 ++ else: ++ # Number of KV heads is less than TP size, so we replicate ++ # the KV heads across multiple tensor parallel GPUs. ++ assert tp_size % self.total_num_kv_heads == 0 ++ self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) ++ self.head_dim = head_dim or (hidden_size // self.total_num_heads) ++ self.q_size = self.num_heads * self.head_dim ++ self.kv_size = self.num_kv_heads * self.head_dim ++ self.scaling = self.head_dim**-0.5 ++ self.rope_theta = rope_theta ++ self.max_position_embeddings = max_position_embeddings + -+if TYPE_CHECKING: -+ current_platform: Platform -+ -+ -+def __getattr__(name: str): -+ if name == 'current_platform': -+ # lazy init current_platform. -+ # 1. out-of-tree platform plugins need `from vllm.platforms import -+ # Platform` so that they can inherit `Platform` class. Therefore, -+ # we cannot resolve `current_platform` during the import of -+ # `vllm.platforms`. -+ # 2. when users use out-of-tree platform plugins, they might run -+ # `import vllm`, some vllm internal code might access -+ # `current_platform` during the import, and we need to make sure -+ # `current_platform` is only resolved after the plugins are loaded -+ # (we have tests for this, if any developer violate this, they will -+ # see the test failures). 
-+ global _current_platform -+ if _current_platform is None: -+ platform_cls_qualname = resolve_current_platform_cls_qualname() -+ _current_platform = resolve_obj_by_qualname( -+ platform_cls_qualname)() -+ global _init_trace -+ _init_trace = "".join(traceback.format_stack()) -+ return _current_platform -+ else: -+ return globals()[name] ++ self.qkv_proj = QKVParallelLinear(hidden_size, ++ self.head_dim, ++ self.total_num_heads, ++ self.total_num_kv_heads, ++ bias=qkv_bias, ++ quant_config=quant_config, ++ prefix=f"{prefix}.qkv_proj") ++ ++ self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim, ++ hidden_size, ++ bias=False, ++ quant_config=quant_config, ++ prefix=f"{prefix}.o_proj") ++ ++ self.rotary_emb = get_rope( ++ self.head_dim, ++ rotary_dim=self.head_dim, ++ max_position=max_position_embeddings, ++ base=rope_theta, ++ rope_scaling=rope_scaling, ++ ) ++ self.attn = Attention(self.num_heads, ++ self.head_dim, ++ self.scaling, ++ num_kv_heads=self.num_kv_heads, ++ cache_config=cache_config, ++ quant_config=quant_config, ++ prefix=f"{prefix}.attn") + ++ self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) ++ self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) + -+__all__ = [ -+ 'Platform', 'PlatformEnum', 'current_platform', 'CpuArchEnum', -+ "_init_trace" -+] -diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py -index 4150b0cdf..25724d927 100644 ---- a/vllm/platforms/interface.py -+++ b/vllm/platforms/interface.py -@@ -33,6 +33,7 @@ class _Backend(enum.Enum): - HPU_ATTN = enum.auto() - PALLAS = enum.auto() - IPEX = enum.auto() -+ IPEX_V1 = enum.auto() - NO_ATTENTION = enum.auto() - - -@@ -100,6 +101,9 @@ class Platform: - - def is_cpu(self) -> bool: - return self._enum == PlatformEnum.CPU -+ -+ def is_xpu(self) -> bool: -+ return self._enum == PlatformEnum.XPU - - def is_neuron(self) -> bool: - return self._enum == PlatformEnum.NEURON -@@ -199,6 +203,18 @@ class Platform: - """ - pass - -+ @classmethod -+ def verify_model_arch(cls, model_arch: str) -> None: -+ """ -+ Verify whether the current platform supports the specified model -+ architecture. ++ def forward( ++ self, ++ positions: torch.Tensor, ++ hidden_states: torch.Tensor, ++ ) -> torch.Tensor: ++ qkv, _ = self.qkv_proj(hidden_states) ++ q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) ++ # Add qk-norm ++ q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, ++ self.head_dim) ++ q_by_head = self.q_norm.forward_native(q_by_head) ++ q = q_by_head.view(q.shape) ++ ++ k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, ++ self.head_dim) ++ k_by_head = self.k_norm.forward_native(k_by_head) ++ k = k_by_head.view(k.shape) ++ q, k = self.rotary_emb(positions, q, k) ++ attn_output = self.attn(q, k, v) ++ output, _ = self.o_proj(attn_output) ++ return output + -+ - This will raise an Error or Warning based on the model support on -+ the current platform. -+ - By default all models are considered supported. 
-+ """ -+ pass + - @classmethod - def verify_quantization(cls, quant: str) -> None: - """ -diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py -index 7778b5653..aa779f265 100644 ---- a/vllm/platforms/rocm.py -+++ b/vllm/platforms/rocm.py -@@ -1,6 +1,6 @@ - import os - from functools import lru_cache --from typing import TYPE_CHECKING, Optional -+from typing import TYPE_CHECKING, Dict, List, Optional - - import torch - -@@ -33,6 +33,31 @@ if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) in ["fork", None]: - " `spawn` instead.") - os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - -+# Models not supported by ROCm. -+_ROCM_UNSUPPORTED_MODELS: List[str] = [] -+ -+# Models partially supported by ROCm. -+# Architecture -> Reason. -+_ROCM_SWA_REASON = ("Sliding window attention (SWA) is not yet supported in " -+ "Triton flash attention. For half-precision SWA support, " -+ "please use CK flash attention by setting " -+ "`VLLM_USE_TRITON_FLASH_ATTN=0`") -+_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = { -+ "Qwen2ForCausalLM": -+ _ROCM_SWA_REASON, -+ "MistralForCausalLM": -+ _ROCM_SWA_REASON, -+ "MixtralForCausalLM": -+ _ROCM_SWA_REASON, -+ "PaliGemmaForConditionalGeneration": -+ ("ROCm flash attention does not yet " -+ "fully support 32-bit precision on PaliGemma"), -+ "Phi3VForCausalLM": -+ ("ROCm Triton flash attention may run into compilation errors due to " -+ "excessive use of shared memory. If this happens, disable Triton FA " -+ "by setting `VLLM_USE_TRITON_FLASH_ATTN=0`") -+} ++class Qwen3MoeDecoderLayer(nn.Module): + - - class RocmPlatform(Platform): - _enum = PlatformEnum.ROCM -@@ -102,6 +127,18 @@ class RocmPlatform(Platform): - else: - parallel_config.worker_cls = "vllm.worker.worker.Worker" - -+ @classmethod -+ def verify_model_arch(cls, model_arch: str) -> None: -+ if model_arch in _ROCM_UNSUPPORTED_MODELS: -+ raise ValueError(f"Model architecture '{model_arch}' is not " -+ "supported by ROCm for now.") ++ def __init__( ++ self, ++ config: PretrainedConfig, ++ cache_config: Optional[CacheConfig] = None, ++ quant_config: Optional[QuantizationConfig] = None, ++ prefix: str = "", ++ ) -> None: ++ super().__init__() ++ self.hidden_size = config.hidden_size ++ rope_theta = getattr(config, "rope_theta", 10000) ++ rope_scaling = getattr(config, "rope_scaling", None) ++ max_position_embeddings = getattr(config, "max_position_embeddings", ++ 8192) ++ self.self_attn = Qwen3MoeAttention( ++ hidden_size=self.hidden_size, ++ num_heads=config.num_attention_heads, ++ num_kv_heads=config.num_key_value_heads, ++ rope_theta=rope_theta, ++ rope_scaling=rope_scaling, ++ max_position_embeddings=max_position_embeddings, ++ rms_norm_eps=config.rms_norm_eps, ++ qkv_bias=getattr(config, 'attention_bias', False), ++ head_dim=getattr(config, 'head_dim', None), ++ cache_config=cache_config, ++ quant_config=quant_config, ++ prefix=f"{prefix}.self_attn", ++ ) + -+ if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS: -+ msg = _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch] -+ logger.warning( -+ "Model architecture '%s' is partially " -+ "supported by ROCm: %s", model_arch, msg) ++ # `mlp_only_layers` in the config. 
++ layer_idx = extract_layer_index(prefix) ++ mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else ++ config.mlp_only_layers) ++ if (layer_idx not in mlp_only_layers) and ( ++ config.num_experts > 0 and ++ (layer_idx + 1) % config.decoder_sparse_step == 0): ++ self.mlp = Qwen3MoeSparseMoeBlock(config=config, ++ quant_config=quant_config, ++ prefix=f"{prefix}.mlp") ++ else: ++ self.mlp = Qwen3MoeMLP(hidden_size=config.hidden_size, ++ intermediate_size=config.intermediate_size, ++ hidden_act=config.hidden_act, ++ quant_config=quant_config, ++ prefix=f"{prefix}.mlp") ++ self.input_layernorm = RMSNorm(config.hidden_size, ++ eps=config.rms_norm_eps) ++ self.post_attention_layernorm = RMSNorm(config.hidden_size, ++ eps=config.rms_norm_eps) + - @classmethod - def verify_quantization(cls, quant: str) -> None: - super().verify_quantization(quant) -diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py -index 78e17c2af..2b56a6df6 100644 ---- a/vllm/platforms/xpu.py -+++ b/vllm/platforms/xpu.py -@@ -2,6 +2,7 @@ from typing import TYPE_CHECKING, Optional - - import torch - -+import vllm.envs as envs - from vllm.logger import init_logger - - from .interface import DeviceCapability, Platform, PlatformEnum, _Backend -@@ -24,7 +25,11 @@ class XPUPlatform(Platform): - def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: - if selected_backend != _Backend.IPEX: - logger.info("Cannot use %s backend on XPU.", selected_backend) -- return _Backend.IPEX -+ use_v1 = envs.VLLM_USE_V1 -+ if use_v1: -+ return _Backend.IPEX_V1 ++ def forward( ++ self, ++ positions: torch.Tensor, ++ hidden_states: torch.Tensor, ++ residual: Optional[torch.Tensor], ++ ) -> torch.Tensor: ++ # Self Attention ++ if residual is None: ++ residual = hidden_states ++ hidden_states = self.input_layernorm(hidden_states) + else: -+ return _Backend.IPEX - - @staticmethod - def get_device_capability(device_id: int = 0) -> DeviceCapability: -@@ -73,15 +78,19 @@ class XPUPlatform(Platform): - - # check and update parallel config - parallel_config = vllm_config.parallel_config -- if (parallel_config.distributed_executor_backend is not None -- and parallel_config.distributed_executor_backend != "ray"): -- logger.warning( -- "%s is not supported on XPU, fallback to ray distributed" -- " executor backend.", -- parallel_config.distributed_executor_backend) -- parallel_config.distributed_executor_backend = "ray" -+ # if (parallel_config.distributed_executor_backend is not None -+ # and parallel_config.distributed_executor_backend != "ray"): -+ # logger.warning( -+ # "%s is not supported on XPU, fallback to ray distributed" -+ # " executor backend.", -+ # parallel_config.distributed_executor_backend) -+ # parallel_config.distributed_executor_backend = "ray" - if parallel_config.worker_cls == "auto": -- parallel_config.worker_cls = "vllm.worker.xpu_worker.XPUWorker" -+ if envs.VLLM_USE_V1: -+ parallel_config.worker_cls = \ -+ "vllm.v1.worker.xpu_worker.XPUWorker" -+ else: -+ parallel_config.worker_cls = "vllm.worker.xpu_worker.XPUWorker" - - @classmethod - def is_pin_memory_available(cls): -diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py -index 17f604ea0..c50eb2cef 100644 ---- a/vllm/plugins/__init__.py -+++ b/vllm/plugins/__init__.py -@@ -1,10 +1,10 @@ - import logging - import os -+from typing import Callable, Dict - - import torch - - import vllm.envs as envs --from vllm.platforms import current_platform - - logger = logging.getLogger(__name__) - -@@ -12,6 +12,39 @@ logger = 
logging.getLogger(__name__) - plugins_loaded = False - - -+def load_plugins_by_group(group: str) -> Dict[str, Callable]: -+ import sys -+ if sys.version_info < (3, 10): -+ from importlib_metadata import entry_points -+ else: -+ from importlib.metadata import entry_points -+ -+ allowed_plugins = envs.VLLM_PLUGINS -+ -+ discovered_plugins = entry_points(group=group) -+ if len(discovered_plugins) == 0: -+ logger.debug("No plugins for group %s found.", group) -+ return {} -+ logger.info("Available plugins for group %s:", group) -+ for plugin in discovered_plugins: -+ logger.info("name=%s, value=%s", plugin.name, plugin.value) -+ if allowed_plugins is None: -+ logger.info("all available plugins for group %s will be loaded.", -+ group) -+ logger.info("set environment variable VLLM_PLUGINS to control" -+ " which plugins to load.") -+ plugins = {} -+ for plugin in discovered_plugins: -+ if allowed_plugins is None or plugin.name in allowed_plugins: -+ try: -+ func = plugin.load() -+ plugins[plugin.name] = func -+ logger.info("plugin %s loaded.", plugin.name) -+ except Exception: -+ logger.exception("Failed to load plugin %s", plugin.name) -+ return plugins -+ -+ - def load_general_plugins(): - """WARNING: plugins can be loaded for multiple times in different - processes. They should be designed in a way that they can be loaded -@@ -26,6 +59,9 @@ def load_general_plugins(): - os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1' - # see https://github.com/vllm-project/vllm/issues/10619 - torch._inductor.config.compile_threads = 1 -+ -+ from vllm.platforms import current_platform -+ - if current_platform.is_xpu(): - # see https://github.com/pytorch/pytorch/blob/8cada5cbe5450e17c26fb8b358116785324537b2/torch/_dynamo/config.py#L158 # noqa - os.environ['TORCH_COMPILE_DISABLE'] = 'True' -@@ -47,33 +83,7 @@ def load_general_plugins(): - if plugins_loaded: - return - plugins_loaded = True -- import sys -- if sys.version_info < (3, 10): -- from importlib_metadata import entry_points -- else: -- from importlib.metadata import entry_points -- -- allowed_plugins = envs.VLLM_PLUGINS -- -- discovered_plugins = entry_points(group='vllm.general_plugins') -- if len(discovered_plugins) == 0: -- logger.debug("No plugins found.") -- return -- logger.info("Available plugins:") -- for plugin in discovered_plugins: -- logger.info("name=%s, value=%s, group=%s", plugin.name, plugin.value, -- plugin.group) -- if allowed_plugins is None: -- logger.info("all available plugins will be loaded.") -- logger.info("set environment variable VLLM_PLUGINS to control" -- " which plugins to load.") -- else: -- logger.info("plugins to load: %s", allowed_plugins) -- for plugin in discovered_plugins: -- if allowed_plugins is None or plugin.name in allowed_plugins: -- try: -- func = plugin.load() -- func() -- logger.info("plugin %s loaded.", plugin.name) -- except Exception: -- logger.exception("Failed to load plugin %s", plugin.name) -+ plugins = load_plugins_by_group(group='vllm.general_plugins') -+ # general plugins, we only need to execute the loaded functions -+ for func in plugins.values(): -+ func() -diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py -index 473b87c89..8b2732923 100644 ---- a/vllm/prompt_adapter/utils.py -+++ b/vllm/prompt_adapter/utils.py -@@ -89,6 +89,7 @@ def load_peft_weights(model_id: str, - adapters_weights = safe_load_file(filename, device=device) - else: - adapters_weights = torch.load(filename, -- map_location=torch.device(device)) -+ map_location=torch.device(device), -+ weights_only=True) 
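
The load_peft_weights hunk above changes the torch.load fallback to pass weights_only=True, which restricts unpickling to tensors and primitive containers. A small illustrative sketch of that loading pattern follows; the helper name, the file-extension check and the device argument are assumptions for the example, not part of the patch.

    import torch

    def load_adapter_weights(path: str, device: str = "cpu") -> dict:
        # Prefer the safetensors format when the file uses it.
        if path.endswith(".safetensors"):
            from safetensors.torch import load_file
            return load_file(path, device=device)
        # weights_only=True rejects arbitrary pickled objects in the checkpoint.
        return torch.load(path,
                          map_location=torch.device(device),
                          weights_only=True)
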
- - return adapters_weights -diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py -index fc77f3ca5..605c09b8d 100644 ---- a/vllm/sampling_params.py -+++ b/vllm/sampling_params.py -@@ -450,15 +450,16 @@ class SamplingParams( - return self._all_stop_token_ids - - def clone(self) -> "SamplingParams": -- """Deep copy excluding LogitsProcessor objects. -+ """Deep copy, but maybe not the LogitsProcessor objects. - -- LogitsProcessor objects are excluded because they may contain an -- arbitrary, nontrivial amount of data. -+ LogitsProcessor objects may contain an arbitrary, nontrivial amount of -+ data that is expensive to copy. However, if not copied, the processor -+ needs to support parallel decoding for multiple sequences - See https://github.com/vllm-project/vllm/issues/3087 - """ - - logit_processor_refs = None if self.logits_processors is None else { -- id(lp): lp -+ id(lp): lp.clone() if hasattr(lp, 'clone') else lp - for lp in self.logits_processors - } - return copy.deepcopy(self, memo=logit_processor_refs) -diff --git a/vllm/sequence.py b/vllm/sequence.py -index cc3d96fc9..0157abbd2 100644 ---- a/vllm/sequence.py -+++ b/vllm/sequence.py -@@ -667,6 +667,7 @@ class SequenceGroup: - first_scheduled_time=None, - first_token_time=None, - time_in_queue=None) -+ self.last_token_latency = 0.0 - self.lora_request = lora_request - self.prompt_logprobs: Optional[PromptLogprobs] = None - self.state = SequenceGroupState() -@@ -709,15 +710,27 @@ class SequenceGroup: - - @property - def multi_modal_data(self) -> MultiModalDataDict: -- return self.first_seq.multi_modal_data -+ if self.first_seq.multi_modal_data: -+ return self.first_seq.multi_modal_data -+ elif self.encoder_seq is not None: -+ return self.encoder_seq.multi_modal_data -+ return {} - - @property - def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: -- return self.first_seq.multi_modal_placeholders -+ if self.first_seq.multi_modal_data: -+ return self.first_seq.multi_modal_placeholders -+ elif self.encoder_seq is not None: -+ return self.encoder_seq.multi_modal_placeholders -+ return {} - - @property - def mm_processor_kwargs(self) -> Dict[str, Any]: -- return self.first_seq.mm_processor_kwargs -+ if self.first_seq.multi_modal_data: -+ return self.first_seq.mm_processor_kwargs -+ elif self.encoder_seq is not None: -+ return self.encoder_seq.mm_processor_kwargs -+ return {} - - @property - def lora_int_id(self) -> int: -@@ -762,18 +775,21 @@ class SequenceGroup: - assert num_lookahead_slots + 1 == num_scheduler_steps or is_prefill - self.init_multi_step(num_steps=num_lookahead_slots + 1) - -- def get_last_latency(self, now: float) -> float: -+ def set_last_token_time(self, now: float) -> None: - """Sets the last token time for Request level timings.""" -- # If still in prefill phase, raise Error. -- if self.is_prefill(): -- raise ValueError( -- "seq_group.get_last_latency() should not be called " -- "if the seq_group is in prefill phase.") -- -- # Otherwise return token latency. -- latency = now - self.metrics.last_token_time -+ # If still in prefill phase, assertion fails. 
-+ assert not self.is_prefill(), ( -+ "seq_group.set_last_token_time() should not be called " -+ "if the seq_group is in prefill phase.") -+ self.last_token_latency = now - self.metrics.last_token_time - self.metrics.last_token_time = now -- return latency -+ -+ def get_last_token_latency(self) -> float: -+ """Returns the latency of the last token.""" -+ assert not self.is_prefill(), ( -+ "seq_group.get_last_token_latency() should not be called " -+ "if the seq_group is in prefill phase.") -+ return self.last_token_latency - - def maybe_set_first_token_time(self, time: float) -> None: - """Sets the first token time for Request level timings.""" -@@ -1368,7 +1384,7 @@ class ParallelSampleSequenceGroup(SequenceGroupBase): - @staticmethod - def add_request(request_id: str, engine, params, **kwargs): - original_params = params -- params = copy.deepcopy(original_params) -+ params = original_params.clone() - params.n = 1 - group = ParallelSampleSequenceGroup(request_id) - seqs = [] -diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py -index 03dc46600..d678f4578 100644 ---- a/vllm/spec_decode/metrics.py -+++ b/vllm/spec_decode/metrics.py -@@ -6,7 +6,6 @@ import torch - - from vllm.model_executor.layers.spec_decode_base_sampler import ( - SpecDecodeBaseSampler) --from vllm.platforms import current_platform - from vllm.utils import is_pin_memory_available - - -@@ -94,6 +93,7 @@ class AsyncMetricsCollector: - def maybe_collect_rejsample_metrics( - self, k: int) -> Optional[SpecDecodeWorkerMetrics]: - # currently using cuda.Event, skip for any non_cuda_alike platform -+ from vllm.platforms import current_platform - if not current_platform.is_cuda_alike(): - return None - -diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py -index 4529cf27e..58417980e 100644 ---- a/vllm/transformers_utils/config.py -+++ b/vllm/transformers_utils/config.py -@@ -22,9 +22,9 @@ from vllm.envs import VLLM_USE_MODELSCOPE - from vllm.logger import init_logger - # yapf conflicts with isort for this block - # yapf: disable --from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, -- EAGLEConfig, ExaoneConfig, -- H2OVLChatConfig, -+from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config, -+ DbrxConfig, EAGLEConfig, -+ ExaoneConfig, H2OVLChatConfig, - InternVLChatConfig, JAISConfig, - MedusaConfig, MllamaConfig, - MLPSpeculatorConfig, MPTConfig, -@@ -52,6 +52,7 @@ _CONFIG_REGISTRY_OVERRIDE_HF: Dict[str, Type[PretrainedConfig]] = { - - _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { - "chatglm": ChatGLMConfig, -+ "cohere2": Cohere2Config, - "dbrx": DbrxConfig, - "mpt": MPTConfig, - "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) -diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py -index c24433cd4..e247ffdcd 100644 ---- a/vllm/transformers_utils/configs/__init__.py -+++ b/vllm/transformers_utils/configs/__init__.py -@@ -1,4 +1,6 @@ - from vllm.transformers_utils.configs.chatglm import ChatGLMConfig -+from vllm.transformers_utils.configs.telechat2 import Telechat2Config -+from vllm.transformers_utils.configs.cohere2 import Cohere2Config - from vllm.transformers_utils.configs.dbrx import DbrxConfig - from vllm.transformers_utils.configs.eagle import EAGLEConfig - from vllm.transformers_utils.configs.exaone import ExaoneConfig -@@ -21,7 +23,9 @@ from vllm.transformers_utils.configs.telechat2 import Telechat2Config - from vllm.transformers_utils.configs.ultravox import 
UltravoxConfig - - __all__ = [ -+ "Telechat2Config", - "ChatGLMConfig", -+ "Cohere2Config", - "DbrxConfig", - "MPTConfig", - "RWConfig", -diff --git a/vllm/transformers_utils/configs/cohere2.py b/vllm/transformers_utils/configs/cohere2.py -new file mode 100644 -index 000000000..1509330fc ---- /dev/null -+++ b/vllm/transformers_utils/configs/cohere2.py -@@ -0,0 +1,192 @@ -+# ruff: noqa ++ hidden_states, residual = self.input_layernorm( ++ hidden_states, residual) ++ hidden_states = self.self_attn( ++ positions=positions, ++ hidden_states=hidden_states, ++ ) ++ ++ # Fully Connected ++ hidden_states, residual = self.post_attention_layernorm( ++ hidden_states, residual) ++ hidden_states = self.mlp(hidden_states) ++ return hidden_states, residual + -+# Adapted from -+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/cohere2/configuration_cohere2.py -+from transformers import PretrainedConfig -+from transformers.modeling_rope_utils import rope_config_validation + ++@support_torch_compile ++class Qwen3MoeModel(nn.Module): + -+class Cohere2Config(PretrainedConfig): -+ r""" -+ This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate an Cohere -+ model according to the specified arguments, defining the model architecture. ++ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ++ super().__init__() + -+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the -+ documentation from [`PretrainedConfig`] for more information. Instantiating a configuration -+ with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model. ++ config = vllm_config.model_config.hf_config ++ cache_config = vllm_config.cache_config ++ quant_config = vllm_config.quant_config + ++ self.padding_idx = config.pad_token_id ++ self.vocab_size = config.vocab_size + -+ Args: -+ vocab_size (`int`, *optional*, defaults to 256000): -+ Vocabulary size of the Cohere model. Defines the number of different tokens that can be represented by the -+ `inputs_ids` passed when calling [`CohereModel`] -+ hidden_size (`int`, *optional*, defaults to 8192): -+ Dimension of the hidden representations. -+ intermediate_size (`int`, *optional*, defaults to 22528): -+ Dimension of the MLP representations. -+ logit_scale (`float`, *optional*, defaults to 0.0625): -+ The scaling factor for the output logits. -+ num_hidden_layers (`int`, *optional*, defaults to 40): -+ Number of hidden layers in the Transformer decoder. -+ num_attention_heads (`int`, *optional*, defaults to 64): -+ Number of attention heads for each attention layer in the Transformer decoder. -+ num_key_value_heads (`int`, *optional*): -+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If -+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if -+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When -+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed -+ by meanpooling all the original heads within that group. For more details checkout [this -+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to -+ `num_attention_heads`. 
-+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): -+ The non-linear activation function (function or string) in the decoder. -+ max_position_embeddings (`int`, *optional*, defaults to 8192): -+ The maximum sequence length that this model might ever be used with. -+ initializer_range (`float`, *optional*, defaults to 0.02): -+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices. -+ layer_norm_eps (`float`, *optional*, defaults to 1e-05): -+ The epsilon used by the layer normalization. -+ use_cache (`bool`, *optional*, defaults to `True`): -+ Whether or not the model should return the last key/values attentions (not used by all models). Only -+ relevant if `config.is_decoder=True`. -+ pad_token_id (`int`, *optional*, defaults to 0): -+ Padding token id. -+ bos_token_id (`int`, *optional*, defaults to 5): -+ Beginning of stream token id. -+ eos_token_id (`int`, *optional*, defaults to 255001): -+ End of stream token id. -+ tie_word_embeddings (`bool`, *optional*, defaults to `True`): -+ Whether to tie weight embeddings -+ rope_theta (`float`, *optional*, defaults to 10000.0): -+ The base period of the RoPE embeddings. -+ rope_scaling (`Dict`, *optional*): -+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type -+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value -+ accordingly. -+ Expected contents: -+ `rope_type` (`str`): -+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', -+ 'llama3'], with 'default' being the original RoPE implementation. -+ `factor` (`float`, *optional*): -+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In -+ most scaling types, a `factor` of x will enable the model to handle sequences of length x * -+ original maximum pre-trained length. -+ `original_max_position_embeddings` (`int`, *optional*): -+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during -+ pretraining. -+ `attention_factor` (`float`, *optional*): -+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention -+ computation. If unspecified, it defaults to value recommended by the implementation, using the -+ `factor` field to infer the suggested value. -+ `beta_fast` (`float`, *optional*): -+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear -+ ramp function. If unspecified, it defaults to 32. -+ `beta_slow` (`float`, *optional*): -+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear -+ ramp function. If unspecified, it defaults to 1. -+ `short_factor` (`List[float]`, *optional*): -+ Only used with 'longrope'. The scaling factor to be applied to short contexts (< -+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden -+ size divided by the number of attention heads divided by 2 -+ `long_factor` (`List[float]`, *optional*): -+ Only used with 'longrope'. The scaling factor to be applied to long contexts (< -+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden -+ size divided by the number of attention heads divided by 2 -+ `low_freq_factor` (`float`, *optional*): -+ Only used with 'llama3'. 
Scaling factor applied to low frequency components of the RoPE -+ `high_freq_factor` (`float`, *optional*): -+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE -+ attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): -+ Whether to use a bias in the query, key, value and output projection layers during self-attention. -+ attention_dropout (`float`, *optional*, defaults to 0.0): -+ The dropout ratio for the attention probabilities. -+ sliding_window (`int`, *optional*, defaults to 4096): -+ Size of the sliding window attention context. -+ sliding_window_pattern (`int`, *optional*, defaults to 4): -+ Pattern for the sliding window attention. -+ cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`. -+ -+ ```python -+ >>> from transformers import Cohere2Model, Cohere2Config -+ -+ >>> # Initializing a Cohere Nextmodel configuration -+ >>> configuration = Cohere2Config() -+ -+ >>> # Initializing a model from the Cohere2 configuration -+ >>> model = Cohere2Model(configuration) # doctest: +SKIP -+ -+ >>> # Accessing the model configuration -+ >>> configuration = model.config # doctest: +SKIP -+ ``` -+ """ ++ self.embed_tokens = VocabParallelEmbedding( ++ config.vocab_size, ++ config.hidden_size, ++ prefix=f"{prefix}.embed_tokens") ++ self.start_layer, self.end_layer, self.layers = make_layers( ++ config.num_hidden_layers, ++ lambda prefix: Qwen3MoeDecoderLayer(config=config, ++ cache_config=cache_config, ++ quant_config=quant_config, ++ prefix=prefix), ++ prefix=f"{prefix}.layers", ++ ) ++ self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) ++ self.make_empty_intermediate_tensors = ( ++ make_empty_intermediate_tensors_factory( ++ ["hidden_states", "residual"], config.hidden_size)) + -+ model_type = "cohere2" -+ keys_to_ignore_at_inference = ["past_key_values"] ++ def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: ++ return self.embed_tokens(input_ids) + -+ def __init__( ++ def forward( + self, -+ vocab_size=256000, -+ hidden_size=8192, -+ intermediate_size=22528, -+ logit_scale=0.0625, -+ num_hidden_layers=40, -+ num_attention_heads=64, -+ num_key_value_heads=None, -+ hidden_act="silu", -+ max_position_embeddings=8192, -+ initializer_range=0.02, -+ layer_norm_eps=1e-5, -+ use_cache=True, -+ pad_token_id=0, -+ bos_token_id=5, -+ eos_token_id=255001, -+ tie_word_embeddings=True, -+ rope_theta=10000.0, -+ rope_scaling=None, -+ attention_bias=False, -+ attention_dropout=0.0, -+ sliding_window=4096, -+ sliding_window_pattern=4, -+ cache_implementation="hybrid", -+ **kwargs, -+ ): -+ self.vocab_size = vocab_size -+ self.max_position_embeddings = max_position_embeddings -+ self.hidden_size = hidden_size -+ self.logit_scale = logit_scale -+ self.intermediate_size = intermediate_size -+ self.num_hidden_layers = num_hidden_layers -+ self.num_attention_heads = num_attention_heads -+ -+ # for backward compatibility -+ if num_key_value_heads is None: -+ num_key_value_heads = num_attention_heads -+ -+ self.num_key_value_heads = num_key_value_heads -+ self.hidden_act = hidden_act -+ self.initializer_range = initializer_range -+ self.layer_norm_eps = layer_norm_eps -+ self.use_cache = use_cache -+ self.rope_theta = rope_theta -+ self.rope_scaling = rope_scaling -+ self.attention_bias = attention_bias -+ self.attention_dropout = attention_dropout -+ self.sliding_window = sliding_window -+ self.sliding_window_pattern = sliding_window_pattern -+ # Need to specify 
head_dim in the config so it can be used in the attention forward functions -+ self.head_dim = hidden_size // num_attention_heads -+ self.cache_implementation = cache_implementation -+ -+ # Validate the correctness of rotary position embeddings parameters -+ rope_config_validation(self) -+ -+ super().__init__( -+ pad_token_id=pad_token_id, -+ bos_token_id=bos_token_id, -+ eos_token_id=eos_token_id, -+ tie_word_embeddings=tie_word_embeddings, -+ **kwargs, -+ ) ++ input_ids: torch.Tensor, ++ positions: torch.Tensor, ++ intermediate_tensors: Optional[IntermediateTensors] = None, ++ inputs_embeds: Optional[torch.Tensor] = None, ++ ) -> Union[torch.Tensor, IntermediateTensors]: ++ if get_pp_group().is_first_rank: ++ if inputs_embeds is not None: ++ hidden_states = inputs_embeds ++ else: ++ hidden_states = self.get_input_embeddings(input_ids) ++ residual = None ++ else: ++ assert intermediate_tensors is not None ++ hidden_states = intermediate_tensors["hidden_states"] ++ residual = intermediate_tensors["residual"] ++ for i in range(self.start_layer, self.end_layer): ++ layer = self.layers[i] ++ hidden_states, residual = layer(positions, hidden_states, residual) ++ if not get_pp_group().is_last_rank: ++ return IntermediateTensors({ ++ "hidden_states": hidden_states, ++ "residual": residual ++ }) ++ hidden_states, _ = self.norm(hidden_states, residual) ++ return hidden_states + + -+__all__ = ["Cohere2Config"] -diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py -index f1523667b..b12cc83a2 100644 ---- a/vllm/transformers_utils/processor.py -+++ b/vllm/transformers_utils/processor.py -@@ -1,25 +1,31 @@ - from functools import lru_cache - from typing import Any, cast - -+from transformers.processing_utils import ProcessorMixin ++class Qwen3MoeForCausalLM(nn.Module, SupportsPP): + - - def get_processor( - processor_name: str, - *args: Any, - trust_remote_code: bool = False, -+ processor_cls: type[ProcessorMixin] = ProcessorMixin, - **kwargs: Any, - ): - """Load a processor for the given model name via HuggingFace.""" - # don't put this import at the top level - # it will call torch.cuda.device_count() - from transformers import AutoProcessor -- from transformers.processing_utils import ProcessorMixin -+ -+ processor_factory = (AutoProcessor -+ if processor_cls == ProcessorMixin else processor_cls) - - try: -- processor = AutoProcessor.from_pretrained( -+ processor = processor_factory.from_pretrained( - processor_name, - *args, - trust_remote_code=trust_remote_code, -- **kwargs) -+ **kwargs, -+ ) - except ValueError as e: - # If the error pertains to the processor class not existing or not - # currently being imported, suggest using the --trust-remote-code flag. -diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py -index e6701f4c4..97920f42e 100644 ---- a/vllm/transformers_utils/tokenizer.py -+++ b/vllm/transformers_utils/tokenizer.py -@@ -21,6 +21,38 @@ AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast, - MistralTokenizer] - - -+def decode_tokens( -+ tokenizer: AnyTokenizer, -+ token_ids: list[int], -+ *, -+ skip_special_tokens: bool = False, -+) -> str: -+ """ -+ Backend-agnostic equivalent of HF's -+ :code:`tokenizer.decode(token_ids, skip_special_tokens=...)`. 
-+ """ -+ return tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens) ++ fall_back_to_pt_during_load = False + ++ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ++ super().__init__() ++ config = vllm_config.model_config.hf_config ++ quant_config = vllm_config.quant_config ++ self.config = config ++ self.quant_config = quant_config ++ self.model = Qwen3MoeModel(vllm_config=vllm_config, ++ prefix=maybe_prefix(prefix, "model")) ++ self.lm_head = ParallelLMHead(config.vocab_size, ++ config.hidden_size, ++ quant_config=quant_config) ++ if self.config.tie_word_embeddings: ++ self.lm_head.weight = self.model.embed_tokens.weight ++ self.logits_processor = LogitsProcessor(config.vocab_size) ++ self.sampler = get_sampler() ++ self.make_empty_intermediate_tensors = ( ++ self.model.make_empty_intermediate_tensors) + -+def encode_tokens( -+ tokenizer: AnyTokenizer, -+ text: str, -+ *, -+ add_special_tokens: Optional[bool] = None, -+) -> list[int]: -+ """ -+ Backend-agnostic equivalent of HF's -+ :code:`tokenizer.encode(text, add_special_tokens=...)`. -+ """ -+ if isinstance(tokenizer, MistralTokenizer): -+ return tokenizer.tokenizer.encode(text, -+ bos=add_special_tokens, -+ eos=add_special_tokens) -+ elif add_special_tokens is not None: -+ return tokenizer.encode(text, add_special_tokens=add_special_tokens) -+ return tokenizer.encode(text) ++ def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: ++ return self.model.get_input_embeddings(input_ids) + ++ def forward( ++ self, ++ input_ids: torch.Tensor, ++ positions: torch.Tensor, ++ intermediate_tensors: Optional[IntermediateTensors] = None, ++ inputs_embeds: Optional[torch.Tensor] = None, ++ ) -> Union[torch.Tensor, IntermediateTensors]: ++ hidden_states = self.model(input_ids, positions, intermediate_tensors, ++ inputs_embeds) ++ return hidden_states + - def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer: - """Get tokenizer with cached properties. 
- -diff --git a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py -index 8f78ef65b..e6cc7cd4e 100644 ---- a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py -+++ b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py -@@ -32,7 +32,8 @@ class BaseTokenizerGroup(ABC): - def encode(self, - prompt: str, - request_id: Optional[str] = None, -- lora_request: Optional[LoRARequest] = None) -> List[int]: -+ lora_request: Optional[LoRARequest] = None, -+ add_special_tokens: Optional[bool] = None) -> List[int]: - """Encode a prompt using the tokenizer group.""" - pass - -@@ -41,7 +42,8 @@ class BaseTokenizerGroup(ABC): - self, - prompt: str, - request_id: Optional[str] = None, -- lora_request: Optional[LoRARequest] = None) -> List[int]: -+ lora_request: Optional[LoRARequest] = None, -+ add_special_tokens: Optional[bool] = None) -> List[int]: - """Encode a prompt using the tokenizer group.""" - pass - -diff --git a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py -index 9a999a0d6..3f7627e11 100644 ---- a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py -+++ b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py -@@ -112,7 +112,8 @@ class RayTokenizerGroupPool(BaseTokenizerGroup): - def encode(self, - prompt: str, - request_id: Optional[str] = None, -- lora_request: Optional[LoRARequest] = None) -> List[int]: -+ lora_request: Optional[LoRARequest] = None, -+ add_special_tokens: Optional[bool] = None) -> List[int]: - """Encode a prompt using the tokenizer group. - - We pick an idle actor and use it to encode the prompt. -@@ -132,7 +133,8 @@ class RayTokenizerGroupPool(BaseTokenizerGroup): - ret = ray.get( - actor.encode.remote(request_id=request_id, - prompt=prompt, -- lora_request=lora_request)) -+ lora_request=lora_request, -+ add_special_tokens=add_special_tokens)) - except ActorDiedError as e: - # If the actor is dead, we first try to reinitialize it. - logger.warning("%s died with ActorDiedError, reinitializing.", -@@ -143,7 +145,8 @@ class RayTokenizerGroupPool(BaseTokenizerGroup): - ret = ray.get( - actor.encode.remote(request_id=request_id, - prompt=prompt, -- lora_request=lora_request)) -+ lora_request=lora_request, -+ add_special_tokens=add_special_tokens)) - except ActorDiedError as e: - logger.error( - "%s died for second time in a row, marking " -@@ -160,7 +163,8 @@ class RayTokenizerGroupPool(BaseTokenizerGroup): - self, - prompt: str, - request_id: Optional[str] = None, -- lora_request: Optional[LoRARequest] = None) -> List[int]: -+ lora_request: Optional[LoRARequest] = None, -+ add_special_tokens: Optional[bool] = None) -> List[int]: - """Encode a prompt using the tokenizer group. - - We pick an idle actor and use it to encode the prompt. -@@ -177,9 +181,11 @@ class RayTokenizerGroupPool(BaseTokenizerGroup): - actor_is_alive = True - original_actor = actor - try: -- ret = await actor.encode.remote(request_id=request_id, -- prompt=prompt, -- lora_request=lora_request) -+ ret = await actor.encode.remote( -+ request_id=request_id, -+ prompt=prompt, -+ lora_request=lora_request, -+ add_special_tokens=add_special_tokens) - except ActorDiedError as e: - # If the actor is dead, we first try to reinitialize it. 
- logger.warning("%s died with ActorDiedError, reinitializing.", -@@ -187,9 +193,11 @@ class RayTokenizerGroupPool(BaseTokenizerGroup): - exc_info=e) - actor = self._init_actor() - try: -- ret = await actor.encode.remote(request_id=request_id, -- prompt=prompt, -- lora_request=lora_request) -+ ret = await actor.encode.remote( -+ request_id=request_id, -+ prompt=prompt, -+ lora_request=lora_request, -+ add_special_tokens=add_special_tokens) - except ActorDiedError as e: - logger.error( - "%s died for second time in a row, marking " -diff --git a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py -index 95a8f7098..6dc2f9056 100644 ---- a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py -+++ b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py -@@ -2,7 +2,7 @@ from typing import List, Optional - - from vllm.config import TokenizerPoolConfig - from vllm.lora.request import LoRARequest --from vllm.transformers_utils.tokenizer import (AnyTokenizer, -+from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens, - get_lora_tokenizer, - get_lora_tokenizer_async, - get_tokenizer) -@@ -55,9 +55,12 @@ class TokenizerGroup(BaseTokenizerGroup): - def encode(self, - prompt: str, - request_id: Optional[str] = None, -- lora_request: Optional[LoRARequest] = None) -> List[int]: -+ lora_request: Optional[LoRARequest] = None, -+ add_special_tokens: Optional[bool] = None) -> List[int]: - tokenizer = self.get_lora_tokenizer(lora_request) -- ret = tokenizer.encode(prompt) -+ ret = encode_tokens(tokenizer, -+ prompt, -+ add_special_tokens=add_special_tokens) - self._raise_if_input_too_long(ret, lora_request) - return ret - -@@ -65,9 +68,12 @@ class TokenizerGroup(BaseTokenizerGroup): - self, - prompt: str, - request_id: Optional[str] = None, -- lora_request: Optional[LoRARequest] = None) -> List[int]: -+ lora_request: Optional[LoRARequest] = None, -+ add_special_tokens: Optional[bool] = None) -> List[int]: - tokenizer = await self.get_lora_tokenizer_async(lora_request) -- ret = tokenizer.encode(prompt) -+ ret = encode_tokens(tokenizer, -+ prompt, -+ add_special_tokens=add_special_tokens) - self._raise_if_input_too_long(ret, lora_request) - return ret - -diff --git a/vllm/triton_utils/importing.py b/vllm/triton_utils/importing.py -index 36315abcd..0c96e0632 100644 ---- a/vllm/triton_utils/importing.py -+++ b/vllm/triton_utils/importing.py -@@ -8,7 +8,6 @@ logger = init_logger(__name__) - HAS_TRITON = ( - find_spec("triton") is not None - and not current_platform.is_xpu() # Not compatible -- and not current_platform.is_neuron() # neuron has too old torch - ) - - if not HAS_TRITON: -diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py -index 9ae46ff43..a9deee881 100644 ---- a/vllm/usage/usage_lib.py -+++ b/vllm/usage/usage_lib.py -@@ -17,7 +17,6 @@ import torch - - import vllm.envs as envs - from vllm.connections import global_http_connection --from vllm.platforms import current_platform - from vllm.version import __version__ as VLLM_VERSION - - _config_home = envs.VLLM_CONFIG_ROOT -@@ -152,6 +151,7 @@ class UsageMessage: - usage_context: UsageContext, - extra_kvs: Dict[str, Any]) -> None: - # Platform information -+ from vllm.platforms import current_platform - if current_platform.is_cuda_alike(): - device_property = torch.cuda.get_device_properties(0) - self.gpu_count = torch.cuda.device_count() -diff --git a/vllm/utils.py b/vllm/utils.py -index 3d1988870..6b13e4746 100644 ---- a/vllm/utils.py -+++ 
b/vllm/utils.py -@@ -10,6 +10,7 @@ import importlib.metadata - import importlib.util - import inspect - import ipaddress -+import multiprocessing - import os - import re - import resource -@@ -20,17 +21,19 @@ import sys - import tempfile - import threading - import time -+import traceback - import uuid - import warnings - import weakref - from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task - from collections import OrderedDict, UserDict, defaultdict --from collections.abc import Iterable, Mapping -+from collections.abc import Hashable, Iterable, Mapping - from dataclasses import dataclass, field - from functools import lru_cache, partial, wraps - from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable, -- Dict, Generator, Generic, Hashable, List, Literal, -- Optional, Tuple, Type, TypeVar, Union, overload) -+ Dict, Generator, Generic, Iterator, List, Literal, -+ NamedTuple, Optional, Tuple, Type, TypeVar, Union, -+ overload) - from uuid import uuid4 - - import numpy as np -@@ -39,13 +42,14 @@ import psutil - import torch - import torch.types - import yaml -+import zmq -+import zmq.asyncio - from packaging.version import Version - from torch.library import Library - from typing_extensions import ParamSpec, TypeIs, assert_never - - import vllm.envs as envs - from vllm.logger import enable_trace_function_call, init_logger --from vllm.platforms import current_platform - - if TYPE_CHECKING: - from vllm.config import VllmConfig -@@ -194,13 +198,29 @@ class Counter: - self.counter = 0 - - -+class CacheInfo(NamedTuple): -+ hits: int -+ total: int ++ def compute_logits( ++ self, ++ hidden_states: torch.Tensor, ++ sampling_metadata: SamplingMetadata, ++ ) -> Optional[torch.Tensor]: ++ logits = self.logits_processor(self.lm_head, hidden_states, ++ sampling_metadata) ++ return logits + -+ @property -+ def hit_ratio(self) -> float: -+ if self.total == 0: -+ return 0 ++ def sample( ++ self, ++ logits: Optional[torch.Tensor], ++ sampling_metadata: SamplingMetadata, ++ ) -> Optional[SamplerOutput]: ++ next_tokens = self.sampler(logits, sampling_metadata) ++ return next_tokens + -+ return self.hits / self.total ++ def load_weights(self, weights: Iterable[Tuple[str, ++ torch.Tensor]]) -> Set[str]: ++ stacked_params_mapping = [ ++ # (param_name, shard_name, shard_id) ++ ("qkv_proj", "q_proj", "q"), ++ ("qkv_proj", "k_proj", "k"), ++ ("qkv_proj", "v_proj", "v"), ++ ("gate_up_proj", "gate_proj", 0), ++ ("gate_up_proj", "up_proj", 1), ++ ] + ++ # Params for weights, fp8 weight scales, fp8 activation scales ++ # (param_name, weight_name, expert_id, shard_id) ++ expert_params_mapping = FusedMoE.make_expert_params_mapping( ++ ckpt_gate_proj_name="gate_proj", ++ ckpt_down_proj_name="down_proj", ++ ckpt_up_proj_name="up_proj", ++ num_experts=self.config.num_experts) + - class LRUCache(Generic[_K, _V]): -+ """Note: This class is not thread safe!""" - - def __init__(self, capacity: int) -> None: - self.cache = OrderedDict[_K, _V]() - self.pinned_items = set[_K]() - self.capacity = capacity - -+ self._hits = 0 -+ self._total = 0 ++ params_dict = dict(self.named_parameters()) ++ loaded_params: Set[str] = set() ++ for name, loaded_weight in weights: ++ if "rotary_emb.inv_freq" in name: ++ continue ++ for (param_name, weight_name, shard_id) in stacked_params_mapping: ++ # Skip non-stacked layers and experts (experts handled below). ++ if weight_name not in name: ++ continue ++ # We have mlp.experts[0].gate_proj in the checkpoint. 
++ # Since we handle the experts below in expert_params_mapping, ++ # we need to skip here BEFORE we update the name, otherwise ++ # name will be updated to mlp.experts[0].gate_up_proj, which ++ # will then be updated below in expert_params_mapping ++ # for mlp.experts[0].gate_gate_up_proj, which breaks load. ++ if "mlp.experts" in name: ++ continue ++ name = name.replace(weight_name, param_name) ++ # Skip loading extra bias for GPTQ models. ++ if ((name.endswith(".bias") or name.endswith("_bias")) ++ and name not in params_dict): ++ continue ++ # Skip layers on other devices. ++ if is_pp_missing_parameter(name, self): ++ continue ++ if name not in params_dict: ++ continue + - def __contains__(self, key: _K) -> bool: - return key in self.cache - -@@ -218,6 +238,9 @@ class LRUCache(Generic[_K, _V]): - def __delitem__(self, key: _K) -> None: - self.pop(key) ++ param = params_dict[name] ++ weight_loader = param.weight_loader ++ weight_loader(param, loaded_weight, shard_id) ++ break ++ else: ++ for mapping in expert_params_mapping: ++ param_name, weight_name, expert_id, shard_id = mapping ++ if weight_name not in name: ++ continue ++ name = name.replace(weight_name, param_name) ++ # Skip layers on other devices. ++ if is_pp_missing_parameter(name, self): ++ continue ++ # Skip loading extra bias for GPTQ models. ++ if ((name.endswith(".bias") or name.endswith("_bias")) ++ and name not in params_dict): ++ continue ++ param = params_dict[name] ++ weight_loader = param.weight_loader ++ weight_loader(param, ++ loaded_weight, ++ name, ++ shard_id=shard_id, ++ expert_id=expert_id) ++ break ++ else: ++ # Skip loading extra bias for GPTQ models. ++ if ((name.endswith(".bias") or name.endswith("_bias")) ++ and name not in params_dict): ++ continue ++ # Skip layers on other devices. ++ if is_pp_missing_parameter(name, self): ++ continue ++ # Remapping the name of FP8 kv-scale. ++ if name.endswith("kv_scale"): ++ remapped_kv_scale_name = name.replace( ++ ".kv_scale", ".attn.kv_scale") ++ if remapped_kv_scale_name not in params_dict: ++ logger.warning_once( ++ "Found kv scale in the checkpoint " ++ f"(e.g. {name}), but not found the expected " ++ f"name in the model " ++ f"(e.g. {remapped_kv_scale_name}). 
" ++ "kv-scale is not loaded.") ++ continue ++ else: ++ name = remapped_kv_scale_name ++ param = params_dict[name] ++ weight_loader = getattr(param, "weight_loader", ++ default_weight_loader) ++ weight_loader(param, loaded_weight) ++ loaded_params.add(name) ++ return loaded_params +diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py +index c0a3c59ba..8614c2273 100644 +--- a/vllm/model_executor/models/registry.py ++++ b/vllm/model_executor/models/registry.py +@@ -57,6 +57,7 @@ _TEXT_GENERATION_MODELS = { + "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"), + "Gemma3ForCausalLM": ("gemma3", "Gemma3ForCausalLM"), + "GlmForCausalLM": ("glm", "GlmForCausalLM"), ++ "Glm4ForCausalLM": ("glm4", "Glm4ForCausalLM"), + "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"), + "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"), + "GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"), +@@ -101,6 +102,8 @@ _TEXT_GENERATION_MODELS = { + "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), + "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), + "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"), ++ "Qwen3ForCausalLM": ("qwen3", "Qwen3ForCausalLM"), ++ "Qwen3MoeForCausalLM": ("qwen3_moe", "Qwen3MoeForCausalLM"), + "RWForCausalLM": ("falcon", "FalconForCausalLM"), + "StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"), + "StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"), +@@ -108,6 +111,7 @@ _TEXT_GENERATION_MODELS = { + "SolarForCausalLM": ("solar", "SolarForCausalLM"), + "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), + "TeleFLMForCausalLM": ("teleflm", "TeleFLMForCausalLM"), ++ "TeleChatForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), + "XverseForCausalLM": ("llama", "LlamaForCausalLM"), + "Zamba2ForCausalLM": ("zamba2", "Zamba2ForCausalLM"), + # [Encoder-decoder] +diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py +index cecad9e89..df4cf4776 100644 +--- a/vllm/model_executor/models/siglip.py ++++ b/vllm/model_executor/models/siglip.py +@@ -140,6 +140,74 @@ class SiglipVisionEmbeddings(nn.Module): + return embeddings -+ def stat(self) -> CacheInfo: -+ return CacheInfo(hits=self._hits, total=self._total) -+ - def touch(self, key: _K) -> None: - self.cache.move_to_end(key) -@@ -226,8 +249,12 @@ class LRUCache(Generic[_K, _V]): - if key in self.cache: - value = self.cache[key] - self.cache.move_to_end(key) ++class SelfAttention(nn.Module): ++ """Multi-headed attention without any cache, used for ViT.""" + -+ self._hits += 1 - else: - value = default ++ def __init__( ++ self, ++ num_heads: int, ++ head_size: int, ++ scale: float, ++ num_kv_heads: Optional[int] = None, ++ ): ++ super().__init__() ++ self.num_heads = num_heads ++ self.head_size = head_size ++ self.scale = scale ++ self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + -+ self._total += 1 - return value - - def put(self, key: _K, value: _V) -> None: -@@ -324,6 +351,60 @@ class PyObjectCache: - self._index = 0 - - -+def is_hip() -> bool: -+ return torch.version.hip is not None ++ assert self.num_heads % self.num_kv_heads == 0 ++ self.num_queries_per_kv = self.num_heads // self.num_kv_heads + ++ def forward( ++ self, ++ query: torch.Tensor, ++ key: torch.Tensor, ++ value: torch.Tensor, ++ ) -> torch.Tensor: ++ """Input shape: batch_size x seq_len x hidden_size""" ++ # TODO(Isotr0py): Use existing backend implementations and support FA2 ++ bsz, q_len, _ = query.size() ++ kv_len = key.size(1) + 
-+@lru_cache(maxsize=None) -+def is_cpu() -> bool: -+ from importlib.metadata import PackageNotFoundError, version -+ try: -+ return "cpu" in version("vllm") -+ except PackageNotFoundError: -+ return False ++ query = query.view(bsz, q_len, self.num_heads, self.head_size) ++ key = key.view(bsz, kv_len, self.num_kv_heads, self.head_size) ++ value = value.view(bsz, kv_len, self.num_kv_heads, self.head_size) + ++ if (num_repeat := self.num_queries_per_kv) > 1: ++ # Handle MQA and GQA ++ key = torch.repeat_interleave(key, num_repeat, dim=2) ++ value = torch.repeat_interleave(value, num_repeat, dim=2) + -+@lru_cache(maxsize=None) -+def is_openvino() -> bool: -+ from importlib.metadata import PackageNotFoundError, version -+ try: -+ return "openvino" in version("vllm") -+ except PackageNotFoundError: -+ return False ++ query, key, value = (x.transpose(1, 2) ++ for x in (query, key, value)) ++ from ipex_llm.transformers.models.utils import use_sdp_causal ++ from vllm.attention.backends.ipex_attn import use_sdp_causal ++ import xe_addons, math ++ mask = None ++ scale = 1 / math.sqrt(self.head_size) if self.scale is None else self.scale ++ from ipex_llm.transformers.models.common import padding_qkv_hd + ++ num = 80 ++ if self.head_size > 80: ++ num = 128 ++ query, key, value, = padding_qkv_hd( ++ query, key, value, ++ self.head_size, num ++ ) ++ if use_sdp_causal(query.shape[-1], query, 0): ++ out = xe_addons.sdp_non_causal(query.contiguous(), key.contiguous(), value.contiguous(), mask, scale)[:, :, :, :self.head_size].transpose(1, 2) ++ # import torch.nn.functional as F ++ # out = F.scaled_dot_product_attention(query, ++ # key, ++ # value, ++ # scale=self.scale) ++ # out = out.transpose(1, 2) ++ #return out.view(bsz, q_len, -1) ++ return out.reshape(bsz, q_len, -1) + -+@lru_cache(maxsize=None) -+def is_neuron() -> bool: -+ try: -+ import transformers_neuronx -+ except ImportError: -+ transformers_neuronx = None -+ return transformers_neuronx is not None + + -+@lru_cache(maxsize=None) -+def is_xpu() -> bool: -+ from importlib.metadata import PackageNotFoundError, version -+ try: -+ is_xpu_flag = "xpu" in version("vllm") -+ except PackageNotFoundError: -+ return False -+ # vllm is not build with xpu -+ if not is_xpu_flag: -+ return False -+ # try: -+ # import intel_extension_for_pytorch as ipex # noqa: F401 -+ # _import_ipex = True -+ # except ImportError as e: -+ # logger.warning("Import Error for IPEX: %s", e.msg) -+ # _import_ipex = False -+ # # ipex dependency is not ready -+ # if not _import_ipex: -+ # logger.warning("not found ipex lib") -+ # return False -+ return hasattr(torch, "xpu") and torch.xpu.is_available() -+ -+ - @lru_cache(maxsize=None) - def get_max_shared_memory_bytes(gpu: int = 0) -> int: - """Returns the maximum shared memory per thread block in bytes.""" -@@ -581,6 +662,7 @@ def create_kv_caches_with_random_flash( - seed: int = 0, - device: Optional[str] = "cuda", - ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: -+ from vllm.platforms import current_platform - current_platform.seed_everything(seed) + class SiglipAttention(nn.Module): - torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype) -@@ -622,7 +704,7 @@ def create_kv_caches_with_random( - raise ValueError( - f"Does not support key cache of type fp8 with head_size {head_size}" - ) -- -+ from vllm.platforms import current_platform - current_platform.seed_everything(seed) + def __init__( +@@ -179,8 +247,10 @@ class SiglipAttention(nn.Module): + self.tp_size = get_tensor_model_parallel_world_size() + 
self.num_heads_per_partition = divide(self.num_heads, self.tp_size) + +- self.attn = MultiHeadAttention(self.num_heads_per_partition, +- self.head_dim, self.scale) ++ # self.attn = MultiHeadAttention(self.num_heads_per_partition, ++ # self.head_dim, self.scale) ++ self.attn = SelfAttention(self.num_heads_per_partition, ++ self.head_dim, self.scale) + + def forward( + self, +diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py +index a38035e37..9631fbd83 100644 +--- a/vllm/model_executor/models/telechat2.py ++++ b/vllm/model_executor/models/telechat2.py +@@ -44,9 +44,9 @@ class TeleChat2Model(LlamaModel): + for layer in self.layers: + if not isinstance(layer, PPMissingLayer): + layer.self_attn.qkv_proj.bias = None +- layer.self_attn.qkv_proj.skip_bias_add = True ++ #layer.self_attn.qkv_proj.skip_bias_add = True + layer.mlp.gate_up_proj.bias = None +- layer.mlp.gate_up_proj.skip_bias_add = True ++ #layer.mlp.gate_up_proj.skip_bias_add = True + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: +diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py +index fc0fb8929..6454e7006 100644 +--- a/vllm/multimodal/utils.py ++++ b/vllm/multimodal/utils.py +@@ -118,20 +118,29 @@ class MediaConnector: + fetch_timeout: Optional[int] = None, + ) -> _M: + url_spec = urlparse(url) +- + if url_spec.scheme.startswith("http"): +- connection = self.connection +- data = await connection.async_get_bytes(url, timeout=fetch_timeout) ++ try: ++ import requests ++ image = Image.open(requests.get(url, stream=True).raw) ++ return image ++ except: ++ connection = self.connection ++ data = await connection.async_get_bytes(url, timeout=fetch_timeout) - torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype) -@@ -675,6 +757,7 @@ def print_warning_once(msg: str) -> None: +- return media_io.load_bytes(data) ++ return media_io.load_bytes(data) - @lru_cache(maxsize=None) - def is_pin_memory_available() -> bool: -+ from vllm.platforms import current_platform - return current_platform.is_pin_memory_available() + if url_spec.scheme == "data": + return self._load_data_url(url_spec, media_io) + if url_spec.scheme == "file": + return self._load_file_url(url_spec, media_io) ++ ++ import os ++ if url_spec.scheme == "" and os.path.exists(url): ++ image = Image.open(url).convert('RGB') ++ return image -@@ -685,6 +768,7 @@ class DeviceMemoryProfiler: +- msg = "The URL must be either a HTTP, data or file URL." ++ msg = "The URL must be either a HTTP, data, file URL or a exist path." + raise ValueError(msg) - def current_memory_usage(self) -> float: - # Return the memory usage in bytes. 
-+ from vllm.platforms import current_platform - if current_platform.is_cuda_alike(): - torch.cuda.reset_peak_memory_stats(self.device) - mem = torch.cuda.max_memory_allocated(self.device) -@@ -1038,6 +1122,7 @@ def _cuda_device_count_stateless( - import torch.cuda - import torch.version + def fetch_audio( +diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py +index b6f6029de..b90fea9fd 100644 +--- a/vllm/platforms/interface.py ++++ b/vllm/platforms/interface.py +@@ -40,8 +40,9 @@ class _Backend(enum.Enum): + HPU_ATTN = enum.auto() + PALLAS = enum.auto() + PALLAS_VLLM_V1 = enum.auto() +- IPEX = enum.auto() + BLOCK_SPARSE_FLASH_ATTN = enum.auto() ++ IPEX = enum.auto() ++ IPEX_V1 = enum.auto() + NO_ATTENTION = enum.auto() -+ from vllm.platforms import current_platform - if not torch.cuda._is_compiled(): - return 0 - if current_platform.is_rocm(): -@@ -1645,6 +1730,7 @@ def direct_register_custom_op( - return - if not supports_custom_op(): -+ from vllm.platforms import current_platform - assert not current_platform.is_cuda_alike(), ( - "cuda platform needs torch>=2.4 to support custom op, " - "chances are you are using an old version of pytorch " -@@ -1821,7 +1907,7 @@ def memory_profiling( - result.non_kv_cache_memory_in_bytes = result.non_torch_increase_in_bytes + result.torch_peak_increase_in_bytes + result.weights_memory_in_bytes # noqa +@@ -131,6 +132,9 @@ class Platform: + def is_cpu(self) -> bool: + return self._enum == PlatformEnum.CPU ++ ++ def is_xpu(self) -> bool: ++ return self._enum == PlatformEnum.XPU --# Adapted from: https://github.com/sgl-project/sglang/blob/f46f394f4d4dbe4aae85403dec006199b34d2840/python/sglang/srt/utils.py#L630 # noqa: E501Curre -+# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630 # noqa: E501 - def set_ulimit(target_soft_limit=65535): - resource_type = resource.RLIMIT_NOFILE - current_soft, current_hard = resource.getrlimit(resource_type) -@@ -1836,3 +1922,82 @@ def set_ulimit(target_soft_limit=65535): - "with error %s. This can cause fd limit errors like" - "`OSError: [Errno 24] Too many open files`. 
Consider " - "increasing with ulimit -n", current_soft, e) -+ -+ -+# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/utils.py#L28 # noqa: E501 -+def get_exception_traceback(): -+ etype, value, tb = sys.exc_info() -+ err_str = "".join(traceback.format_exception(etype, value, tb)) -+ return err_str -+ -+ -+# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L783 # noqa: E501 -+def make_zmq_socket( -+ ctx: Union[zmq.asyncio.Context, zmq.Context], # type: ignore[name-defined] -+ path: str, -+ type: Any, -+) -> Union[zmq.Socket, zmq.asyncio.Socket]: # type: ignore[name-defined] -+ """Make a ZMQ socket with the proper bind/connect semantics.""" -+ -+ mem = psutil.virtual_memory() -+ socket = ctx.socket(type) -+ -+ # Calculate buffer size based on system memory -+ total_mem = mem.total / 1024**3 -+ available_mem = mem.available / 1024**3 -+ # For systems with substantial memory (>32GB total, >16GB available): -+ # - Set a large 0.5GB buffer to improve throughput -+ # For systems with less memory: -+ # - Use system default (-1) to avoid excessive memory consumption -+ if total_mem > 32 and available_mem > 16: -+ buf_size = int(0.5 * 1024**3) # 0.5GB in bytes -+ else: -+ buf_size = -1 # Use system default buffer size -+ -+ if type == zmq.constants.PULL: -+ socket.setsockopt(zmq.constants.RCVHWM, 0) -+ socket.setsockopt(zmq.constants.RCVBUF, buf_size) -+ socket.connect(path) -+ elif type == zmq.constants.PUSH: -+ socket.setsockopt(zmq.constants.SNDHWM, 0) -+ socket.setsockopt(zmq.constants.SNDBUF, buf_size) -+ socket.bind(path) -+ else: -+ raise ValueError(f"Unknown Socket Type: {type}") -+ -+ return socket -+ -+ -+@contextlib.contextmanager -+def zmq_socket_ctx( -+ path: str, -+ type: Any) -> Iterator[zmq.Socket]: # type: ignore[name-defined] -+ """Context manager for a ZMQ socket""" -+ -+ ctx = zmq.Context(io_threads=2) # type: ignore[attr-defined] -+ try: -+ yield make_zmq_socket(ctx, path, type) -+ -+ except KeyboardInterrupt: -+ logger.debug("Got Keyboard Interrupt.") -+ -+ finally: -+ ctx.destroy(linger=0) -+ -+ -+def _check_multiproc_method(): -+ if (cuda_is_initialized() -+ and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"): -+ logger.warning("CUDA was previously initialized. We must use " -+ "the `spawn` multiprocessing start method. Setting " -+ "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. 
" -+ "See https://docs.vllm.ai/en/latest/getting_started/" -+ "troubleshooting.html#python-multiprocessing " -+ "for more information.") -+ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -+ -+ -+def get_mp_context(): -+ _check_multiproc_method() -+ mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD -+ return multiprocessing.get_context(mp_method) -diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py -index 026a0292c..660889a17 100644 ---- a/vllm/v1/attention/backends/flash_attn.py -+++ b/vllm/v1/attention/backends/flash_attn.py -@@ -2,11 +2,18 @@ - from dataclasses import dataclass - from typing import Any, Dict, List, Optional, Tuple, Type + def is_neuron(self) -> bool: + return self._enum == PlatformEnum.NEURON +diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py +index 225e756cd..4fd7fe220 100644 +--- a/vllm/platforms/xpu.py ++++ b/vllm/platforms/xpu.py +@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, Optional -+import numpy as np import torch -+import triton -+import triton.language as tl - from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionMetadata, AttentionType) --from vllm.vllm_flash_attn import flash_attn_varlen_func -+from vllm.platforms import current_platform -+from vllm.utils import cdiv -+ -+if current_platform.is_cuda(): -+ from vllm.vllm_flash_attn import flash_attn_varlen_func ++import vllm.envs as envs + from vllm.logger import init_logger + from .interface import DeviceCapability, Platform, PlatformEnum, _Backend +@@ -33,8 +34,13 @@ class XPUPlatform(Platform): + use_mla: bool) -> str: + if selected_backend != _Backend.IPEX: + logger.info("Cannot use %s backend on XPU.", selected_backend) +- logger.info("Using IPEX attention backend.") +- return "vllm.attention.backends.ipex_attn.IpexAttnBackend" ++ use_v1 = envs.VLLM_USE_V1 ++ if use_v1: ++ logger.info("Using IPEX_V1 attention backend.") ++ return "vllm.v1.attention.backends.ipex_attn.IPEXAttentionBackend" ++ else: ++ logger.info("Using IPEX attention backend.") ++ return "vllm.attention.backends.ipex_attn.IpexAttnBackend" - class FlashAttentionBackend(AttentionBackend): -@@ -38,6 +45,10 @@ class FlashAttentionBackend(AttentionBackend): - raise ValueError("Block size must be a multiple of 16.") - return (2, num_blocks, block_size, num_kv_heads, head_size) + @staticmethod + def get_device_capability( +@@ -63,6 +69,8 @@ class XPUPlatform(Platform): + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + cache_config = vllm_config.cache_config ++ if cache_config and envs.VLLM_USE_V1: ++ cache_config.block_size = 64 + if cache_config and cache_config.block_size is None: + cache_config.block_size = 16 -+ @staticmethod -+ def use_cascade_attention(*args, **kwargs) -> bool: -+ return use_cascade_attention(*args, **kwargs) -+ +@@ -87,31 +95,46 @@ class XPUPlatform(Platform): + raise NotImplementedError( + "XPU does not support speculative decoding") - @dataclass - class FlashAttentionMetadata: -@@ -56,6 +67,18 @@ class FlashAttentionMetadata: - seq_start_loc: torch.Tensor - block_table: torch.Tensor - slot_mapping: torch.Tensor -+ -+ # For cascade attention. 
-+ use_cascade: bool -+ common_prefix_len: int -+ cu_prefix_query_lens: Optional[torch.Tensor] -+ cu_prefix_kv_lens: Optional[torch.Tensor] -+ cu_suffix_kv_lens: Optional[torch.Tensor] +- if vllm_config.device_config is not None: +- assert vllm_config.device_config.device_type == "xpu" ++ # if vllm_config.device_config is not None: ++ # assert vllm_config.device_config.device_type == "xpu" + + # check and update parallel config + parallel_config = vllm_config.parallel_config +- if parallel_config.worker_cls == "auto": + -+ context_lens: Optional[torch.Tensor] -+ seq_lens: Optional[torch.Tensor] ++ # TODO(xiangyu): check logic here + -+ # For logging. - num_input_tokens: int = 0 # Number of tokens including padding. ++ if envs.VLLM_USE_V1: ++ parallel_config.worker_cls = \ ++ "vllm.v1.worker.xpu_worker.XPUWorker" ++ else: + parallel_config.worker_cls = "vllm.worker.xpu_worker.XPUWorker" + + if parallel_config.distributed_executor_backend is None: +- parallel_config.distributed_executor_backend = "ray" ++ if parallel_config.world_size > 1: ++ parallel_config.distributed_executor_backend = "ray" ++ else: ++ parallel_config.distributed_executor_backend = "uni" + elif parallel_config.distributed_executor_backend == "mp": + # FIXME(kunshang): + # spawn needs calling `if __name__ == '__main__':`` + # fork is not supported for xpu start new process. +- logger.error( +- "Both start methods (spawn and fork) have issue " +- "on XPU if you use mp backend, setting it to ray instead.") +- parallel_config.distributed_executor_backend = "ray" +- +- elif parallel_config.distributed_executor_backend != "ray": ++ logger.warning( ++ "Please use spawn as start method if you want to use mp.") ++ elif parallel_config.distributed_executor_backend != "ray" and \ ++ parallel_config.distributed_executor_backend != "uni": + logger.warning( + "%s is not supported on XPU, fallback to ray distributed" + " executor backend.", + parallel_config.distributed_executor_backend) + parallel_config.distributed_executor_backend = "ray" ++ # if (parallel_config.distributed_executor_backend is not None ++ # and parallel_config.distributed_executor_backend != "ray"): ++ # logger.warning( ++ # "%s is not supported on XPU, fallback to ray distributed" ++ # " executor backend.", ++ # parallel_config.distributed_executor_backend) ++ # parallel_config.distributed_executor_backend = "ray" ++ + @classmethod + def is_pin_memory_available(cls): +diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py +index 53699341b..6bc039068 100644 +--- a/vllm/transformers_utils/configs/__init__.py ++++ b/vllm/transformers_utils/configs/__init__.py +@@ -1,6 +1,7 @@ + # SPDX-License-Identifier: Apache-2.0 -@@ -169,21 +192,246 @@ class FlashAttentionImpl(AttentionImpl): - ) + from vllm.transformers_utils.configs.chatglm import ChatGLMConfig ++from vllm.transformers_utils.configs.telechat2 import Telechat2Config + from vllm.transformers_utils.configs.cohere2 import Cohere2Config + from vllm.transformers_utils.configs.dbrx import DbrxConfig + from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config +@@ -26,6 +27,7 @@ from vllm.transformers_utils.configs.telechat2 import Telechat2Config + from vllm.transformers_utils.configs.ultravox import UltravoxConfig - # Compute attention and update output up to `num_actual_tokens`. 
-- flash_attn_varlen_func( -- q=query[:num_actual_tokens], -- k=key_cache, -- v=value_cache, -- out=output[:num_actual_tokens], -- cu_seqlens_q=attn_metadata.query_start_loc, -- max_seqlen_q=attn_metadata.max_query_len, -- cu_seqlens_k=attn_metadata.seq_start_loc, -- max_seqlen_k=attn_metadata.max_seq_len, -+ if not attn_metadata.use_cascade: -+ # Regular attention (common case). -+ flash_attn_varlen_func( -+ q=query[:num_actual_tokens], -+ k=key_cache, -+ v=value_cache, -+ out=output[:num_actual_tokens], -+ cu_seqlens_q=attn_metadata.query_start_loc, -+ max_seqlen_q=attn_metadata.max_query_len, -+ cu_seqlens_k=attn_metadata.seq_start_loc, -+ max_seqlen_k=attn_metadata.max_seq_len, -+ softmax_scale=self.scale, -+ causal=True, -+ alibi_slopes=self.alibi_slopes, -+ window_size=self.sliding_window, -+ block_table=attn_metadata.block_table, -+ softcap=self.logits_soft_cap, -+ ) -+ return output -+ -+ # Cascade attention (rare case). -+ cascade_attention( -+ output[:num_actual_tokens], -+ query[:num_actual_tokens], -+ key_cache, -+ value_cache, -+ cu_query_lens=attn_metadata.query_start_loc, -+ max_query_len=attn_metadata.max_query_len, -+ cu_prefix_query_lens=attn_metadata.cu_prefix_query_lens, -+ cu_prefix_kv_lens=attn_metadata.cu_prefix_kv_lens, -+ cu_suffix_kv_lens=attn_metadata.cu_suffix_kv_lens, -+ max_kv_len=attn_metadata.max_seq_len, - softmax_scale=self.scale, -- causal=True, - alibi_slopes=self.alibi_slopes, -- window_size=self.sliding_window, -+ sliding_window=self.sliding_window, -+ logits_soft_cap=self.logits_soft_cap, - block_table=attn_metadata.block_table, -- softcap=self.logits_soft_cap, -+ common_prefix_len=attn_metadata.common_prefix_len, - ) -- - return output -+ -+ -+def use_cascade_attention( -+ common_prefix_len: int, -+ query_lens: np.ndarray, -+ num_query_heads: int, -+ num_kv_heads: int, -+ use_alibi: bool, -+ use_sliding_window: bool, -+ num_sms: int, -+) -> bool: -+ """Decide whether to use cascade attention. -+ -+ This function 1) checks whether cascade attention is supported with the -+ given configuration, and 2) heuristically decides whether using cascade -+ attention can improve performance. -+ """ -+ # Too short common prefix. Probably not worth using cascade attention. -+ # We use an arbitrary threshold of 256 tokens. TODO: Tune this threshold. -+ # NOTE(woosuk): This is the common case. We should return False as soon as -+ # possible to avoid any unnecessary computation. -+ if common_prefix_len < 256: -+ return False -+ # Cascade attention is currently not supported with these variants. -+ if use_alibi or use_sliding_window: -+ return False -+ # Too few queries. Probably not worth using cascade attention. -+ # We use an arbitrary threshold of 8 queries. TODO: Tune this threshold. -+ num_reqs = len(query_lens) -+ if num_reqs < 8: -+ return False -+ -+ # Heuristics to decide whether using cascade attention is beneficial. -+ # 1. When FlashDecoding is not used for normal attention, cascade attention -+ # is likely to be faster since it saves memory bandwidth. -+ num_queries_per_kv = num_query_heads // num_kv_heads -+ # The criteria for using FlashDecoding can be found in the following link: -+ # https://github.com/vllm-project/flash-attention/blob/96266b1111111f3d11aabefaf3bacbab6a89d03c/csrc/flash_attn/flash_api.cpp#L535 -+ use_flash_decoding = (num_queries_per_kv > 1 and not use_sliding_window -+ and not use_alibi and np.all(query_lens == 1)) -+ if not use_flash_decoding: -+ # Use cascade attention. -+ return True -+ -+ # 2. 
When FlashDecoding is used for normal attention, it is not clear -+ # whether cascade attention is beneficial, because FlashDecoding can -+ # launch more CTAs than cascade attention. -+ # We use a simple performance model to compare the two methods. -+ # NOTE(woosuk): The performance model is very rough and may not be -+ # accurate. -+ num_tokens = num_reqs -+ # NOTE(woosuk): These are default tile sizes. flash-attn might use -+ # different tile sizes (e.g., 64 or 256) depending on the configuration. -+ q_tile_size = 128 -+ kv_tile_size = 128 -+ num_prefix_tiles = cdiv(common_prefix_len, kv_tile_size) -+ -+ cascade_ctas = num_query_heads * cdiv(num_tokens, q_tile_size) -+ cascade_waves = cdiv(cascade_ctas, num_sms) -+ cascade_time = cascade_waves * num_prefix_tiles -+ -+ flash_decoding_ctas = (num_reqs * num_kv_heads * -+ cdiv(num_queries_per_kv, q_tile_size)) -+ flash_decoding_ctas *= num_prefix_tiles -+ flash_decoding_time = cdiv(flash_decoding_ctas, num_sms) -+ -+ # Use cascade attention if it is faster than FlashDecoding. -+ return cascade_time < flash_decoding_time -+ -+ -+def cascade_attention( -+ output: torch.Tensor, -+ query: torch.Tensor, -+ key_cache: torch.Tensor, -+ value_cache: torch.Tensor, -+ cu_query_lens: torch.Tensor, -+ max_query_len: int, -+ cu_prefix_query_lens: torch.Tensor, -+ cu_prefix_kv_lens: torch.Tensor, -+ cu_suffix_kv_lens: torch.Tensor, -+ max_kv_len: int, -+ softmax_scale: float, -+ alibi_slopes: Optional[torch.Tensor], -+ sliding_window: Tuple[int, int], -+ logits_soft_cap: float, -+ block_table: torch.Tensor, -+ common_prefix_len: int, -+) -> torch.Tensor: -+ -+ assert alibi_slopes is None, ("Cascade attention does not support ALiBi.") -+ # TODO: Support sliding window. -+ assert sliding_window == (-1, -1), ( -+ "Cascade attention does not support sliding window.") -+ -+ num_tokens = query.shape[0] -+ block_size = key_cache.shape[-3] -+ assert common_prefix_len % block_size == 0 -+ num_common_kv_blocks = common_prefix_len // block_size -+ assert num_common_kv_blocks > 0 -+ -+ # Process shared prefix. -+ prefix_output, prefix_lse = flash_attn_varlen_func( -+ q=query, -+ k=key_cache, -+ v=value_cache, -+ cu_seqlens_q=cu_prefix_query_lens, -+ cu_seqlens_k=cu_prefix_kv_lens, -+ max_seqlen_q=num_tokens, -+ max_seqlen_k=common_prefix_len, -+ softmax_scale=softmax_scale, -+ causal=False, -+ window_size=sliding_window, -+ block_table=block_table[:1], -+ softcap=logits_soft_cap, -+ return_softmax_lse=True, -+ ) -+ -+ # Process suffix per query. -+ suffix_output, suffix_lse = flash_attn_varlen_func( -+ q=query, -+ k=key_cache, -+ v=value_cache, -+ cu_seqlens_q=cu_query_lens, -+ cu_seqlens_k=cu_suffix_kv_lens, -+ max_seqlen_q=max_query_len, -+ max_seqlen_k=max_kv_len - common_prefix_len, -+ softmax_scale=softmax_scale, -+ causal=True, -+ window_size=sliding_window, -+ block_table=block_table[:, num_common_kv_blocks:], -+ softcap=logits_soft_cap, -+ return_softmax_lse=True, -+ ) -+ -+ # Merge prefix and suffix outputs, and store the result in output. -+ merge_attn_states(output, prefix_output, prefix_lse, suffix_output, -+ suffix_lse) -+ -+ -+def merge_attn_states( -+ output: torch.Tensor, -+ prefix_output: torch.Tensor, -+ prefix_lse: torch.Tensor, -+ suffix_output: torch.Tensor, -+ suffix_lse: torch.Tensor, -+) -> None: -+ num_tokens = output.shape[0] -+ num_query_heads = output.shape[1] -+ head_size = output.shape[2] -+ padded_head_size = triton.next_power_of_2(head_size) -+ -+ # TODO(woosuk): Use CUDA kernel instead of Triton to minimize CPU overhead. 
-+ merge_attn_states_kernel[(num_tokens, num_query_heads)]( -+ output, -+ prefix_output, -+ prefix_lse, -+ suffix_output, -+ suffix_lse, -+ head_size, -+ padded_head_size, -+ ) -+ + __all__ = [ ++ "Telechat2Config", + "ChatGLMConfig", + "Cohere2Config", + "DbrxConfig", +diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py +index c271f438e..cf7180606 100755 +--- a/vllm/v1/attention/backends/flash_attn.py ++++ b/vllm/v1/attention/backends/flash_attn.py +@@ -14,8 +14,8 @@ from vllm.attention.ops.triton_merge_attn_states import merge_attn_states + from vllm.logger import init_logger + from vllm.platforms import current_platform + from vllm.utils import cdiv +-from vllm.vllm_flash_attn.fa_utils import (flash_attn_supports_fp8, +- get_flash_attn_version) ++# from vllm.vllm_flash_attn.fa_utils import (flash_attn_supports_fp8, ++# get_flash_attn_version) + + if TYPE_CHECKING: + from vllm.v1.core.sched.output import SchedulerOutput +@@ -640,6 +640,7 @@ def cascade_attention( + k_descale: Optional[torch.Tensor] = None, + v_descale: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + -+@triton.jit -+def merge_attn_states_kernel( -+ output, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] -+ prefix_output, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] -+ prefix_lse, # [NUM_HEADS, NUM_TOKENS] -+ suffix_output, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] -+ suffix_lse, # [NUM_HEADS, NUM_TOKENS] -+ HEAD_SIZE: tl.constexpr, -+ PADDED_HEAD_SIZE: tl.constexpr, -+): -+ token_idx = tl.program_id(0) -+ num_tokens = tl.num_programs(0) -+ head_idx = tl.program_id(1) -+ num_heads = tl.num_programs(1) -+ -+ p_lse = tl.load(prefix_lse + head_idx * num_tokens + token_idx) -+ s_lse = tl.load(suffix_lse + head_idx * num_tokens + token_idx) -+ max_lse = tl.maximum(p_lse, s_lse) -+ p_lse = p_lse - max_lse -+ s_lse = s_lse - max_lse -+ -+ head_arange = tl.arange(0, PADDED_HEAD_SIZE) -+ head_mask = head_arange < HEAD_SIZE -+ p_out = tl.load(prefix_output + token_idx * num_heads * HEAD_SIZE + -+ head_idx * HEAD_SIZE + head_arange, -+ mask=head_mask) -+ s_out = tl.load(suffix_output + token_idx * num_heads * HEAD_SIZE + -+ head_idx * HEAD_SIZE + head_arange, -+ mask=head_mask) -+ -+ # NOTE(woosuk): Be careful with the numerical stability. -+ # We should compute the scale first, and then multiply it with the output. -+ # Do not multiply the output with tl.exp(p_lse) or tl.exp(s_lse) directly. -+ p_scale = tl.exp(p_lse) / (tl.exp(p_lse) + tl.exp(s_lse)) -+ s_scale = tl.exp(s_lse) / (tl.exp(p_lse) + tl.exp(s_lse)) -+ out = p_out * p_scale + s_out * s_scale -+ tl.store(output + token_idx * num_heads * HEAD_SIZE + -+ head_idx * HEAD_SIZE + head_arange, -+ out, -+ mask=head_mask) + assert alibi_slopes is None, ("Cascade attention does not support ALiBi.") + # TODO: Support sliding window. 
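The numerical-stability note in the merge kernel above is the standard log-sum-exp trick; a minimal PyTorch reference of the same merge (the layouts follow the kernel's comments, but this sketch is not part of the patch):

    import torch

    def merge_attn_states_ref(prefix_output, prefix_lse, suffix_output, suffix_lse):
        # prefix_output/suffix_output: [num_tokens, num_heads, head_size]
        # prefix_lse/suffix_lse:       [num_heads, num_tokens]
        p_lse = prefix_lse.transpose(0, 1).unsqueeze(-1)   # [num_tokens, num_heads, 1]
        s_lse = suffix_lse.transpose(0, 1).unsqueeze(-1)
        max_lse = torch.maximum(p_lse, s_lse)
        p_w = torch.exp(p_lse - max_lse)     # compute the scales first,
        s_w = torch.exp(s_lse - max_lse)     # then apply them to the outputs
        return (prefix_output * p_w + suffix_output * s_w) / (p_w + s_w)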
+ assert sliding_window == (-1, -1), ( diff --git a/vllm/v1/attention/backends/ipex_attn.py b/vllm/v1/attention/backends/ipex_attn.py new file mode 100644 -index 000000000..5567a4656 +index 000000000..29cde02f3 --- /dev/null +++ b/vllm/v1/attention/backends/ipex_attn.py -@@ -0,0 +1,349 @@ +@@ -0,0 +1,358 @@ ++from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Type + +import torch + +from vllm._ipex_ops import ipex_ops +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, -+ AttentionMetadata, AttentionType) ++ AttentionMetadata, AttentionType, AttentionLayer) +from vllm.forward_context import get_forward_context +from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata +from vllm.attention.ops.paged_attn import (PagedAttention, @@ -37073,6 +14476,10 @@ index 000000000..5567a4656 +from vllm.attention.backends.ipex_attn import use_gqa_kernel +import os + ++@dataclass ++class IPEXAttentionMetadata(FlashAttentionMetadata): ++ seq_start_loc: torch.Tensor = torch.tensor([0], dtype=torch.int64) ++ + +class IPEXAttentionBackend(AttentionBackend): + @@ -37085,12 +14492,12 @@ index 000000000..5567a4656 + return "IPEX_V1" + + @staticmethod -+ def get_impl_cls() -> Type["IPEXAttentionImpl"]: -+ return IPEXAttentionImpl ++ def get_impl_cls() -> Type["IPEXAttentionBackendImpl"]: ++ return IPEXAttentionBackendImpl + + @staticmethod + def get_metadata_cls() -> Type["AttentionMetadata"]: -+ return FlashAttentionMetadata ++ return IPEXAttentionMetadata + + @staticmethod + def get_kv_cache_shape( @@ -37108,7 +14515,7 @@ index 000000000..5567a4656 + + + -+class IPEXAttentionImpl(AttentionImpl): ++class IPEXAttentionBackendImpl(AttentionImpl): + + def __init__( + self, @@ -37116,11 +14523,13 @@ index 000000000..5567a4656 + head_size: int, + scale: float, + num_kv_heads: int, -+ alibi_slopes: Optional[List[float]], ++ alibi_slopes: Optional[list[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, -+ blocksparse_params: Optional[Dict[str, Any]] = None, ++ blocksparse_params: Optional[dict[str, Any]] = None, + logits_soft_cap: Optional[float] = None, ++ attn_type: str = AttentionType.DECODER, ++ use_irope: bool = False, + ) -> None: + if blocksparse_params is not None: + raise ValueError( @@ -37137,6 +14546,8 @@ index 000000000..5567a4656 + else: + self.sliding_window = (sliding_window - 1, 0) + self.kv_cache_dtype = kv_cache_dtype ++ self.use_irope = use_irope ++ + if logits_soft_cap is None: + # In flash-attn, setting logits_soft_cap as 0 means no soft cap. + logits_soft_cap = 0 @@ -37150,18 +14561,22 @@ index 000000000..5567a4656 + raise ValueError( + f"Head size {head_size} is not supported by FlashAttention. " + f"Supported head sizes are: {support_head_sizes}.") ++ if attn_type != AttentionType.DECODER: ++ raise NotImplementedError("Encoder self-attention and " ++ "encoder/decoder cross-attention " ++ "are not implemented for " ++ "IpexAttnBackendImpl") + + # TODO(gc): Refine this logic..., because of bad performance... + def forward( + self, ++ layer: AttentionLayer, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, -+ attn_metadata: FlashAttentionMetadata, -+ k_scale: float = 1.0, -+ v_scale: float = 1.0, -+ attn_type: AttentionType = AttentionType.DECODER, ++ attn_metadata: IPEXAttentionBackend, ++ output: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """Forward pass with IPEXAttention. 
+ @@ -37174,16 +14589,12 @@ index 000000000..5567a4656 + Returns: + shape = [num_tokens, num_heads * head_size] + """ -+ if attn_type != AttentionType.DECODER: -+ raise NotImplementedError("Encoder self-attention and " -+ "encoder/decoder cross-attention " -+ "are not implemented for " -+ "IPEXAttentionImpl") -+ -+ # NOTE(woosuk): IPEXAttention does not support FP8 KV cache. -+ assert k_scale == 1.0 and v_scale == 1.0, ( -+ "key/v_scale is not supported in IPEXAttention.") ++ # # NOTE(woosuk): IPEXAttention does not support FP8 KV cache. ++ # assert k_scale == 1.0 and v_scale == 1.0, ( ++ # "key/v_scale is not supported in IPEXAttention.") + ++ k_scale = layer._k_scale ++ v_scale = layer._v_scale + output = torch.empty_like(query) + # torch.ops.vllm.ipex_attn_chunked_prefill( + ipex_llm_chunked_prefill( @@ -37216,133 +14627,49 @@ index 000000000..5567a4656 + key_cache = kv_cache[0] + key_cache = key_cache.view(num_blocks, num_kv_heads, -1, head_size) + value_cache = kv_cache[1] -+ value_cache = value_cache.view(num_blocks, num_kv_heads, -1, head_size) -+ return key_cache, value_cache -+ -+def split_kv_cache( -+ kv_cache: torch.Tensor, -+ num_kv_heads: int, -+ head_size: int, -+) -> Tuple[torch.Tensor, torch.Tensor]: -+ x = 16 // kv_cache.element_size() -+ num_blocks = kv_cache.shape[1] -+ -+ key_cache = kv_cache[0] -+ key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x, -+ -1, x) -+ -+ value_cache = kv_cache[1] -+ value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1) -+ return key_cache, value_cache -+ -+ -+ -+@torch.library.custom_op("vllm::ipex_attn_fake", -+ mutates_args=["output", "kv_cache"]) -+def ipex_attn_fake( -+ output: torch.Tensor, -+ query: torch.Tensor, -+ key: torch.Tensor, -+ value: torch.Tensor, -+ num_heads: int, -+ head_size: int, -+ num_kv_heads: int, -+ kv_cache: torch.Tensor, -+ kv_cache_dtype: str, -+ k_scale: float, -+ v_scale: float, -+ scale: float, -+ sliding_window: Optional[List[int]] = None, -+ alibi_slopes: Optional[torch.Tensor] = None, -+ logits_soft_cap: Optional[float] = None, -+) -> None: -+ pass -+ -+def ipex_llm_chunked_prefill( -+ output: torch.Tensor, -+ query: torch.Tensor, -+ key: torch.Tensor, -+ value: torch.Tensor, -+ num_heads: int, -+ head_size: int, -+ num_kv_heads: int, -+ kv_cache: torch.Tensor, -+ kv_cache_dtype: str, -+ k_scale: float, -+ v_scale: float, -+ scale: float, -+ sliding_window: Optional[List[int]] = None, -+ alibi_slopes: Optional[torch.Tensor] = None, -+ logits_soft_cap: Optional[float] = None, -+) -> None: -+ context = get_forward_context() -+ current_metadata = context.dynamic_forward_context -+ if current_metadata is None: -+ # Profiling run. 
-+ return -+ assert current_metadata is not None -+ assert isinstance(current_metadata, FlashAttentionMetadata) -+ attn_metadata: FlashAttentionMetadata = current_metadata -+ num_actual_tokens = attn_metadata.num_actual_tokens -+ -+ query = query.view(-1, num_heads, head_size) -+ key = key.view(-1, num_kv_heads, head_size) -+ value = value.view(-1, num_kv_heads, head_size) -+ -+ using_gqa_kernel = use_gqa_kernel(num_heads, num_kv_heads, head_size, logits_soft_cap) -+ -+ -+ if using_gqa_kernel: -+ key_cache, value_cache = split_kv_cache_ipexllm( -+ kv_cache, num_kv_heads, head_size) -+ ipex_ops.reshape_and_cache_ipexllm( -+ key[:num_actual_tokens], -+ value[:num_actual_tokens], -+ key_cache, -+ value_cache, -+ attn_metadata.slot_mapping.flatten(), -+ kv_cache_dtype, -+ k_scale, -+ v_scale, -+ ) -+ else: -+ key_cache, value_cache = split_kv_cache( -+ kv_cache, num_kv_heads, head_size) -+ ipex_ops.reshape_and_cache( -+ key[:num_actual_tokens], -+ value[:num_actual_tokens], -+ key_cache, -+ value_cache, -+ attn_metadata.slot_mapping.flatten(), -+ kv_cache_dtype, -+ k_scale, -+ v_scale, -+ ) -+ # Invoke chunked prefill method... -+ import vllm._C.ops -+ assert head_size == 128 or head_size == 64 -+ value = os.environ.get('USE_CONTEXT_V1') -+ query_len = attn_metadata.query_start_loc[1:] - attn_metadata.query_start_loc[:-1] -+ seq_len = attn_metadata.seq_start_loc[1:] - attn_metadata.seq_start_loc[:-1] -+ context_len = seq_len - query_len -+ if using_gqa_kernel: -+ # if using_gqa_kernel, then only the v1 kernel can be used -+ out = vllm._C.ops.context_attention_forward_v1(query[:num_actual_tokens], key_cache, value_cache, attn_metadata.block_table, attn_metadata.query_start_loc, seq_len, context_len, attn_metadata.max_seq_len, torch.amax(context_len).item()) -+ elif value is None: -+ # Otherwise, by default use v2 attention forward kernel... 
-+ out = vllm._C.ops.context_attention_forward_v2(query[:num_actual_tokens], key_cache, value_cache, attn_metadata.block_table, attn_metadata.query_start_loc, seq_len, context_len, attn_metadata.max_seq_len, torch.amax(context_len).item(), torch.amax(query_len).item()) -+ else: -+ out = vllm._C.ops.context_attention_forward_v1(query[:num_actual_tokens], key_cache, value_cache, attn_metadata.block_table, attn_metadata.query_start_loc, seq_len, context_len, attn_metadata.max_seq_len, torch.amax(context_len).item()) -+ -+ # output[:num_actual_tokens] = out -+ output[:num_actual_tokens] = out.view(out.shape[0], -1) ++ value_cache = value_cache.view(num_blocks, num_kv_heads, -1, head_size) ++ return key_cache, value_cache + ++def split_kv_cache( ++ kv_cache: torch.Tensor, ++ num_kv_heads: int, ++ head_size: int, ++) -> Tuple[torch.Tensor, torch.Tensor]: ++ x = 16 // kv_cache.element_size() ++ num_blocks = kv_cache.shape[1] + ++ key_cache = kv_cache[0] ++ key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x, ++ -1, x) + ++ value_cache = kv_cache[1] ++ value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1) ++ return key_cache, value_cache + -+@torch.library.custom_op("vllm::ipex_attn_chunked_prefill", ++ ++ ++@torch.library.custom_op("vllm::ipex_attn_fake", + mutates_args=["output", "kv_cache"]) -+def ipex_attn_chunked_prefill( ++def ipex_attn_fake( ++ output: torch.Tensor, ++ query: torch.Tensor, ++ key: torch.Tensor, ++ value: torch.Tensor, ++ num_heads: int, ++ head_size: int, ++ num_kv_heads: int, ++ kv_cache: torch.Tensor, ++ kv_cache_dtype: str, ++ k_scale: float, ++ v_scale: float, ++ scale: float, ++ sliding_window: Optional[List[int]] = None, ++ alibi_slopes: Optional[torch.Tensor] = None, ++ logits_soft_cap: Optional[float] = None, ++) -> None: ++ pass ++ ++def ipex_llm_chunked_prefill( + output: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, @@ -37360,11 +14687,10 @@ index 000000000..5567a4656 + logits_soft_cap: Optional[float] = None, +) -> None: + context = get_forward_context() -+ current_metadata = context.dynamic_forward_context ++ current_metadata = context.attn_metadata + if current_metadata is None: + # Profiling run. + return -+ + assert current_metadata is not None + assert isinstance(current_metadata, FlashAttentionMetadata) + attn_metadata: FlashAttentionMetadata = current_metadata @@ -37374,878 +14700,145 @@ index 000000000..5567a4656 + key = key.view(-1, num_kv_heads, head_size) + value = value.view(-1, num_kv_heads, head_size) + -+ # Reshape the input keys and values and store them in the cache. 
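The kernel dispatch inside ipex_llm_chunked_prefill above reduces to a small decision table; a condensed sketch of that selection (the kernel names are the ones called above, the helper itself is illustrative):

    import os

    def pick_context_attention_kernel(using_gqa_kernel: bool) -> str:
        if using_gqa_kernel:
            # The GQA cache layout only has a v1 kernel.
            return "context_attention_forward_v1"
        if os.environ.get("USE_CONTEXT_V1") is None:
            # Default path: the v2 chunked-prefill kernel.
            return "context_attention_forward_v2"
        return "context_attention_forward_v1"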
-+ key_cache = kv_cache[0] -+ value_cache = kv_cache[1] -+ -+ ipex_ops.reshape_and_cache_flash( -+ key[:num_actual_tokens], -+ value[:num_actual_tokens], -+ key_cache, -+ value_cache, -+ attn_metadata.slot_mapping, -+ kv_cache_dtype, -+ k_scale, -+ v_scale, -+ ) -+ -+ ipex_ops.chunked_prefill( -+ query[:num_actual_tokens], -+ key_cache, -+ value_cache, -+ output[:num_actual_tokens], -+ attn_metadata.query_start_loc, -+ attn_metadata.seq_start_loc, -+ None, -+ attn_metadata.block_table, -+ alibi_slopes, -+ attn_metadata.max_query_len, -+ attn_metadata.max_seq_len, -+ 0.0, -+ scale, -+ False, -+ True, -+ False, -+ None, -+ ) -diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py -index 78efacccf..1cbff1e2d 100644 ---- a/vllm/v1/core/kv_cache_manager.py -+++ b/vllm/v1/core/kv_cache_manager.py -@@ -8,7 +8,7 @@ from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue, - generate_block_hash_extra_keys, - hash_block_tokens, - hash_request_tokens) --from vllm.v1.request import Request -+from vllm.v1.request import Request, RequestStatus - - logger = init_logger(__name__) - -@@ -191,7 +191,7 @@ class KVCacheManager: - request: The request to allocate slots. - num_tokens: The number of tokens to allocate. Note that this does - not include the tokens that have already been computed. -- computed_blocks: The blocks that have already been computed. -+ computed_blocks: A list of computed blocks. - - Returns: - A list of new allocated blocks. -@@ -200,6 +200,18 @@ class KVCacheManager: - raise ValueError( - f"num_tokens must be greater than 0, got {num_tokens}") - -+ # If a computed block of a request is an eviction candidate (in the -+ # free queue and ref_cnt == 0), it cannot be counted as a free block -+ # when allocating this request. -+ num_evictable_computed_blocks = sum(1 for blk in computed_blocks -+ if blk.ref_cnt == 0) -+ -+ num_required_blocks = cdiv(num_tokens, self.block_size) -+ if (num_required_blocks > self.free_block_queue.num_free_blocks - -+ num_evictable_computed_blocks): -+ # Cannot allocate new blocks. -+ return None -+ - # Touch the computed blocks to make sure they won't be evicted. - if self.enable_caching: - self._touch(computed_blocks) -@@ -208,11 +220,6 @@ class KVCacheManager: - "Computed blocks should be empty when " - "prefix caching is disabled") - -- num_required_blocks = cdiv(num_tokens, self.block_size) -- if (num_required_blocks > self.free_block_queue.num_free_blocks): -- # Cannot allocate new blocks. -- return None -- - # Determine the number of new blocks to allocate considering - # preallocated blocks. - num_new_blocks = min( -@@ -271,6 +278,56 @@ class KVCacheManager: - if block.ref_cnt == 0: - self.free_block_queue.append(block) - -+ def get_num_common_prefix_blocks( -+ self, -+ request: Request, -+ num_running_requests: int, -+ ) -> int: -+ """Calculate the number of common prefix blocks shared by all requests -+ in the RUNNING state. -+ -+ The function determines this by selecting any request and iterating -+ through its blocks. A block is considered a common prefix block if its -+ `ref_cnt` equals the total number of requests in the RUNNING state. -+ -+ NOTE(woosuk): The number of requests in the RUNNING state is **greater -+ than or equal to** the number of requests scheduled in the current step. -+ This is because the RUNNING state only indicates that: -+ 1. The request has not yet finished, and -+ 2. The request holds its blocks unfreed. 
-+ -+ While all scheduled requests must be in the RUNNING state, the inverse -+ is not necessarily true. There may be RUNNING requests that are not -+ scheduled in the current step. As of 1/1/2025, the scheduler does not -+ allow this case, but it is possible in the future, as we allow more -+ flexible scheduling. -+ -+ This can result in an edge case where the number of common prefix blocks -+ is 0, even though all scheduled requests share a common prefix. This -+ occurs because there may be unscheduled RUNNING requests that do not -+ share the common prefix. Currently, this case cannot be easily detected, -+ so the function returns 0 in such cases. -+ -+ Args: -+ request: Any request in the RUNNING state, used to identify the -+ common prefix blocks. -+ num_running_requests: The total number of requests in the RUNNING -+ state. This can be different from the number of scheduled -+ requests in the current step. -+ -+ Returns: -+ int: The number of common prefix blocks. -+ """ -+ assert request.status == RequestStatus.RUNNING -+ blocks = self.req_to_blocks[request.request_id] -+ num_common_blocks = 0 -+ for block in blocks: -+ if block.ref_cnt == num_running_requests: -+ num_common_blocks += 1 -+ else: -+ break -+ return num_common_blocks -+ - def _get_new_blocks(self, num_blocks: int) -> List[KVCacheBlock]: - """Get new blocks from the free block pool. - -diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py -index 9ddbff7c9..84ff48bf4 100644 ---- a/vllm/v1/core/kv_cache_utils.py -+++ b/vllm/v1/core/kv_cache_utils.py -@@ -218,8 +218,8 @@ def generate_block_hash_extra_keys( - continue - - # The block contains the current mm input. -- mm_start = max(0, start_token_idx - offset) -- extra_keys.append((mm_hashes[curr_mm_idx], mm_start)) -+ extra_keys.append(mm_hashes[curr_mm_idx]) -+ - if end_token_idx >= offset + length: - # If this block contains the end of the current mm input, - # move to the next mm input as this block may also contain -diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py -index 08e7c0fd4..baaf3329d 100644 ---- a/vllm/v1/core/scheduler.py -+++ b/vllm/v1/core/scheduler.py -@@ -262,6 +262,14 @@ class Scheduler: - assert (len(scheduled_new_reqs) + len(scheduled_resumed_reqs) + - len(scheduled_running_reqs) == len(self.running)) - -+ # Get the longest common prefix among all requests in the running queue. -+ # This can be potentially used for cascade attention. -+ if self.running: -+ any_request = self.running[0] -+ num_common_prefix_blocks = ( -+ self.kv_cache_manager.get_num_common_prefix_blocks( -+ any_request, len(self.running))) -+ - # Construct the scheduler output. - new_reqs_data = [ - NewRequestData.from_request(req, -@@ -287,6 +295,7 @@ class Scheduler: - num_scheduled_tokens=num_scheduled_tokens, - total_num_scheduled_tokens=total_num_scheduled_tokens, - scheduled_encoder_inputs=scheduled_encoder_inputs, -+ num_common_prefix_blocks=num_common_prefix_blocks, - preempted_req_ids=preempted_req_ids, - # finished_req_ids is an existing state in the scheduler, - # instead of being newly scheduled in this step. 
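The evictable-computed-blocks rule in allocate_slots above is easiest to see with numbers; the values below are hypothetical and only illustrate the check:

    # cdiv(num_tokens, block_size) new blocks are needed; computed blocks that are
    # eviction candidates (ref_cnt == 0) cannot be counted as free.
    block_size = 16
    num_tokens = 1000
    num_required_blocks = -(-num_tokens // block_size)   # 63
    num_free_blocks = 70
    num_evictable_computed_blocks = 10
    can_allocate = num_required_blocks <= num_free_blocks - num_evictable_computed_blocks
    # 63 <= 60 is False, so allocate_slots returns None for this request.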
-@@ -594,6 +603,7 @@ class SchedulerOutput: - num_scheduled_tokens: Dict[str, int] - total_num_scheduled_tokens: int - scheduled_encoder_inputs: Dict[str, List[int]] -+ num_common_prefix_blocks: int - - preempted_req_ids: Set[str] - finished_req_ids: Set[str] -diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py -index cc0c7ea23..f70464fc8 100644 ---- a/vllm/v1/engine/__init__.py -+++ b/vllm/v1/engine/__init__.py -@@ -6,21 +6,7 @@ import msgspec - - from vllm.lora.request import LoRARequest - from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict --from vllm.sampling_params import RequestOutputKind, SamplingParams -- -- --@dataclass --class DetokenizerRequest: -- -- request_id: str -- prompt: Optional[str] -- prompt_token_ids: List[int] -- skip_special_tokens: bool -- spaces_between_special_tokens: bool -- output_kind: RequestOutputKind -- -- stop: List[str] -- include_stop_str_in_output: bool -+from vllm.sampling_params import SamplingParams - - - @dataclass -diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py -index ba2b83777..b963ba74f 100644 ---- a/vllm/v1/engine/async_llm.py -+++ b/vllm/v1/engine/async_llm.py -@@ -1,4 +1,5 @@ - import asyncio -+import os - from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union - - from vllm.config import ModelConfig, VllmConfig -@@ -16,6 +17,7 @@ from vllm.sampling_params import SamplingParams - from vllm.transformers_utils.tokenizer import AnyTokenizer - from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs - from vllm.usage.usage_lib import UsageContext -+from vllm.utils import kill_process_tree - from vllm.v1.engine.core_client import EngineCoreClient - from vllm.v1.engine.detokenizer import Detokenizer - from vllm.v1.engine.processor import Processor -@@ -38,6 +40,7 @@ class AsyncLLM(EngineClient): - log_requests: bool = True, - start_engine_loop: bool = True, - ) -> None: ++ using_gqa_kernel = use_gqa_kernel(num_heads, num_kv_heads, head_size, logits_soft_cap) + - assert start_engine_loop - - self.log_requests = log_requests -@@ -75,18 +78,15 @@ class AsyncLLM(EngineClient): - - # EngineCore (starts the engine in background process). - self.engine_core = EngineCoreClient.make_client( -- vllm_config=vllm_config, -- executor_class=executor_class, -- usage_context=usage_context, - multiprocess_mode=True, - asyncio_mode=True, -+ vllm_config=vllm_config, -+ executor_class=executor_class, -+ log_stats=self.log_stats, - ) - - self.output_handler: Optional[asyncio.Task] = None - -- def __del__(self): -- self.shutdown() -- - @classmethod - def from_engine_args( - cls, -@@ -104,7 +104,7 @@ class AsyncLLM(EngineClient): - else: - vllm_config = engine_config - -- executor_class = cls._get_executor_cls(vllm_config) -+ executor_class = Executor.get_class(vllm_config) - - # Create the AsyncLLM. 
- return cls( -@@ -126,20 +126,6 @@ class AsyncLLM(EngineClient): - if handler := getattr(self, "output_handler", None): - handler.cancel() - -- @classmethod -- def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]: -- executor_class: Type[Executor] -- distributed_executor_backend = ( -- vllm_config.parallel_config.distributed_executor_backend) -- if distributed_executor_backend == "mp": -- from vllm.v1.executor.multiproc_executor import MultiprocExecutor -- executor_class = MultiprocExecutor -- else: -- assert (distributed_executor_backend is None) -- from vllm.v1.executor.uniproc_executor import UniprocExecutor -- executor_class = UniprocExecutor -- return executor_class -- - async def add_request( - self, - request_id: str, -@@ -158,16 +144,18 @@ class AsyncLLM(EngineClient): - raise ValueError(f"Request id {request_id} already running.") - self.rid_to_queue[request_id] = asyncio.Queue() - -- # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. -- detokenizer_req, engine_core_req = self.processor.process_inputs( -- request_id, prompt, params, arrival_time, lora_request, -- trace_headers, prompt_adapter_request, priority) -+ # 2) Convert Input --> Request. -+ request = self.processor.process_inputs(request_id, prompt, params, -+ arrival_time, lora_request, -+ trace_headers, -+ prompt_adapter_request, -+ priority) - - # 3) Add the request to Detokenizer (this process). -- self.detokenizer.add_request(detokenizer_req) -+ self.detokenizer.add_request(request) - - # 4) Add the EngineCoreRequest to EngineCore (separate process). -- await self.engine_core.add_request_async(engine_core_req) -+ await self.engine_core.add_request_async(request) - - if self.log_requests: - logger.info("Added request %s.", request_id) -@@ -274,9 +262,9 @@ class AsyncLLM(EngineClient): - # 4) Abort any requests that finished due to stop strings. 
- await self.engine_core.abort_requests_async(reqs_to_abort) - -- except BaseException as e: -- logger.error(e) -- raise e -+ except Exception as e: -+ logger.exception("EngineCore output handler hit an error: %s", e) -+ kill_process_tree(os.getpid()) - - async def abort(self, request_id: str) -> None: - """Abort RequestId in self, detokenizer, and engine core.""" -diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py -index 0aef61fc7..975ce11fe 100644 ---- a/vllm/v1/engine/core.py -+++ b/vllm/v1/engine/core.py -@@ -3,20 +3,19 @@ import queue - import signal - import threading - import time --from dataclasses import dataclass --from multiprocessing.process import BaseProcess -+from multiprocessing.connection import Connection - from typing import List, Tuple, Type - -+import psutil - import zmq - import zmq.asyncio - from msgspec import msgpack - - from vllm.config import CacheConfig, VllmConfig --from vllm.executor.multiproc_worker_utils import get_mp_context - from vllm.logger import init_logger - from vllm.transformers_utils.config import ( - maybe_register_config_serialize_by_value) --from vllm.usage.usage_lib import UsageContext -+from vllm.utils import get_exception_traceback, zmq_socket_ctx - from vllm.v1.core.scheduler import Scheduler - from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, - EngineCoreProfile, EngineCoreRequest, -@@ -25,14 +24,13 @@ from vllm.v1.engine.mm_input_mapper import MMInputMapperServer - from vllm.v1.executor.abstract import Executor - from vllm.v1.request import Request, RequestStatus - from vllm.v1.serial_utils import PickleEncoder --from vllm.v1.utils import make_zmq_socket - from vllm.version import __version__ as VLLM_VERSION - - logger = init_logger(__name__) - - POLLING_TIMEOUT_MS = 5000 - POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 --LOGGING_TIME_S = POLLING_TIMEOUT_S -+LOGGING_TIME_S = 5 - - - class EngineCore: -@@ -42,9 +40,10 @@ class EngineCore: - self, - vllm_config: VllmConfig, - executor_class: Type[Executor], -- usage_context: UsageContext, -+ log_stats: bool = False, - ): - assert vllm_config.model_config.runner_type != "pooling" -+ self.log_stats = log_stats - - logger.info("Initializing an LLM engine (v%s) with config: %s", - VLLM_VERSION, vllm_config) -@@ -134,29 +133,19 @@ class EngineCore: - self.model_executor.profile(is_start) - - --@dataclass --class EngineCoreProcHandle: -- proc: BaseProcess -- ready_path: str -- input_path: str -- output_path: str -- -- - class EngineCoreProc(EngineCore): - """ZMQ-wrapper for running EngineCore in background process.""" - -- READY_STR = "READY" -- - def __init__( - self, -- vllm_config: VllmConfig, -- executor_class: Type[Executor], -- usage_context: UsageContext, - input_path: str, - output_path: str, -- ready_path: str, -+ ready_pipe: Connection, -+ vllm_config: VllmConfig, -+ executor_class: Type[Executor], -+ log_stats: bool = False, - ): -- super().__init__(vllm_config, executor_class, usage_context) -+ super().__init__(vllm_config, executor_class, log_stats) - - # Background Threads and Queues for IO. These enable us to - # overlap ZMQ socket IO with GPU since they release the GIL, -@@ -173,68 +162,7 @@ class EngineCoreProc(EngineCore): - daemon=True).start() - - # Send Readiness signal to EngineClient. 
-- with make_zmq_socket(ready_path, zmq.constants.PUSH) as ready_socket: -- ready_socket.send_string(EngineCoreProc.READY_STR) -- -- @staticmethod -- def wait_for_startup( -- proc: BaseProcess, -- ready_path: str, -- ) -> None: -- """Wait until the EngineCore is ready.""" -- -- try: -- sync_ctx = zmq.Context() # type: ignore[attr-defined] -- socket = sync_ctx.socket(zmq.constants.PULL) -- socket.connect(ready_path) -- -- # Wait for EngineCore to send EngineCoreProc.READY_STR. -- while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: -- logger.debug("Waiting for EngineCoreProc to startup.") -- -- if not proc.is_alive(): -- raise RuntimeError("EngineCoreProc failed to start.") -- -- message = socket.recv_string() -- assert message == EngineCoreProc.READY_STR -- -- except BaseException as e: -- logger.exception(e) -- raise e -- -- finally: -- sync_ctx.destroy(linger=0) -- -- @staticmethod -- def make_engine_core_process( -- vllm_config: VllmConfig, -- executor_class: Type[Executor], -- usage_context: UsageContext, -- input_path: str, -- output_path: str, -- ready_path: str, -- ) -> EngineCoreProcHandle: -- context = get_mp_context() -- -- process_kwargs = { -- "input_path": input_path, -- "output_path": output_path, -- "ready_path": ready_path, -- "vllm_config": vllm_config, -- "executor_class": executor_class, -- "usage_context": usage_context, -- } -- # Run EngineCore busy loop in background process. -- proc = context.Process(target=EngineCoreProc.run_engine_core, -- kwargs=process_kwargs) -- proc.start() -- -- # Wait for startup -- EngineCoreProc.wait_for_startup(proc, ready_path) -- return EngineCoreProcHandle(proc=proc, -- ready_path=ready_path, -- input_path=input_path, -- output_path=output_path) -+ ready_pipe.send({"status": "READY"}) - - @staticmethod - def run_engine_core(*args, **kwargs): -@@ -258,6 +186,7 @@ class EngineCoreProc(EngineCore): - signal.signal(signal.SIGTERM, signal_handler) - signal.signal(signal.SIGINT, signal_handler) - -+ parent_process = psutil.Process().parent() - engine_core = None - try: - engine_core = EngineCoreProc(*args, **kwargs) -@@ -266,14 +195,14 @@ class EngineCoreProc(EngineCore): - except SystemExit: - logger.debug("EngineCore interrupted.") - -- except BaseException as e: -- logger.exception(e) -- raise e -+ except Exception: -+ traceback = get_exception_traceback() -+ logger.error("EngineCore hit an exception: %s", traceback) -+ parent_process.send_signal(signal.SIGUSR1) - - finally: - if engine_core is not None: - engine_core.shutdown() -- engine_core = None - - def run_busy_loop(self): - """Core busy loop of the EngineCore.""" -@@ -309,6 +238,9 @@ class EngineCoreProc(EngineCore): - def _log_stats(self): - """Log basic stats every LOGGING_TIME_S""" - -+ if not self.log_stats: -+ return + - now = time.time() - - if now - self._last_logging_time > LOGGING_TIME_S: -@@ -339,7 +271,7 @@ class EngineCoreProc(EngineCore): - decoder_add_req = PickleEncoder() - decoder_abort_req = PickleEncoder() - -- with make_zmq_socket(input_path, zmq.constants.PULL) as socket: -+ with zmq_socket_ctx(input_path, zmq.constants.PULL) as socket: - while True: - # (RequestType, RequestData) - type_frame, data_frame = socket.recv_multipart(copy=False) -@@ -367,7 +299,7 @@ class EngineCoreProc(EngineCore): - # Reuse send buffer. 
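The input/output threads above are plain ZeroMQ PUSH/PULL pipes over IPC paths; a self-contained toy version of that wiring (the socket address and payload are made up for illustration):

    import zmq

    ctx = zmq.Context()
    pull = ctx.socket(zmq.PULL)                 # engine-core side: receives requests
    pull.bind("ipc:///tmp/engine_core_demo")
    push = ctx.socket(zmq.PUSH)                 # client side: sends requests
    push.connect("ipc:///tmp/engine_core_demo")
    push.send(b"engine-core-request")
    print(pull.recv())                          # b'engine-core-request'
    ctx.destroy(linger=0)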
- buffer = bytearray() - -- with make_zmq_socket(output_path, zmq.constants.PUSH) as socket: -+ with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket: - while True: - engine_core_outputs = self.output_queue.get() - outputs = EngineCoreOutputs(outputs=engine_core_outputs) -diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py -index d56fcbdb1..a4a45ae05 100644 ---- a/vllm/v1/engine/core_client.py -+++ b/vllm/v1/engine/core_client.py -@@ -1,24 +1,29 @@ - import os -+import signal - import weakref --from typing import List, Optional -+from abc import ABC, abstractmethod -+from typing import List, Type - - import msgspec - import zmq - import zmq.asyncio - -+from vllm.config import VllmConfig - from vllm.logger import init_logger --from vllm.utils import get_open_zmq_ipc_path, kill_process_tree -+from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree, -+ make_zmq_socket) - from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, - EngineCoreProfile, EngineCoreRequest, - EngineCoreRequestType, EngineCoreRequestUnion) --from vllm.v1.engine.core import (EngineCore, EngineCoreProc, -- EngineCoreProcHandle) -+from vllm.v1.engine.core import EngineCore, EngineCoreProc -+from vllm.v1.executor.abstract import Executor - from vllm.v1.serial_utils import PickleEncoder -+from vllm.v1.utils import BackgroundProcHandle - - logger = init_logger(__name__) - - --class EngineCoreClient: -+class EngineCoreClient(ABC): - """ - EngineCoreClient: subclasses handle different methods for pushing - and pulling from the EngineCore for asyncio / multiprocessing. -@@ -31,10 +36,11 @@ class EngineCoreClient: - - @staticmethod - def make_client( -- *args, - multiprocess_mode: bool, - asyncio_mode: bool, -- **kwargs, -+ vllm_config: VllmConfig, -+ executor_class: Type[Executor], -+ log_stats: bool = False, - ) -> "EngineCoreClient": - - # TODO: support this for debugging purposes. -@@ -44,15 +50,16 @@ class EngineCoreClient: - "is not currently supported.") - - if multiprocess_mode and asyncio_mode: -- return AsyncMPClient(*args, **kwargs) -+ return AsyncMPClient(vllm_config, executor_class, log_stats) - - if multiprocess_mode and not asyncio_mode: -- return SyncMPClient(*args, **kwargs) -+ return SyncMPClient(vllm_config, executor_class, log_stats) - -- return InprocClient(*args, **kwargs) -+ return InprocClient(vllm_config, executor_class, log_stats) - -+ @abstractmethod - def shutdown(self): -- pass -+ ... - - def get_output(self) -> List[EngineCoreOutput]: - raise NotImplementedError -@@ -87,8 +94,6 @@ class InprocClient(EngineCoreClient): - - * pushes EngineCoreRequest directly into the EngineCore - * pulls EngineCoreOutputs by stepping the EngineCore -- -- TODO: support asyncio-mode for debugging. - """ - - def __init__(self, *args, **kwargs): -@@ -106,9 +111,6 @@ class InprocClient(EngineCoreClient): - def shutdown(self): - self.engine_core.shutdown() - -- def __del__(self): -- self.shutdown() -- - def profile(self, is_start: bool = True) -> None: - self.engine_core.profile(is_start) - -@@ -128,75 +130,80 @@ class MPClient(EngineCoreClient): - - def __init__( - self, -- *args, - asyncio_mode: bool, -- **kwargs, -+ vllm_config: VllmConfig, -+ executor_class: Type[Executor], -+ log_stats: bool = False, - ): -+ # The child processes will send SIGUSR1 when unrecoverable -+ # errors happen. We kill the process tree here so that the -+ # stack trace is very evident. 
-+ # TODO(rob): rather than killing the main process, we should -+ # figure out how to raise an AsyncEngineDeadError and -+ # handle at the API server level so we can return a better -+ # error code to the clients calling VLLM. -+ def sigusr1_handler(signum, frame): -+ logger.fatal("Got fatal signal from worker processes, shutting " -+ "down. See stack trace above for root cause issue.") -+ kill_process_tree(os.getpid()) -+ -+ signal.signal(signal.SIGUSR1, sigusr1_handler) -+ - # Serialization setup. - self.encoder = PickleEncoder() - self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs) - - # ZMQ setup. -- if asyncio_mode: -- self.ctx = zmq.asyncio.Context() -- else: -- self.ctx = zmq.Context() # type: ignore[attr-defined] -+ self.ctx = ( -+ zmq.asyncio.Context() # type: ignore[attr-defined] -+ if asyncio_mode else zmq.Context()) # type: ignore[attr-defined] - -- # Path for IPC. -- ready_path = get_open_zmq_ipc_path() -+ # Note(rob): shutdown function cannot be a bound method, -+ # else the gc cannot collect the object. -+ self._finalizer = weakref.finalize(self, lambda x: x.destroy(linger=0), -+ self.ctx) -+ -+ # Paths and sockets for IPC. - output_path = get_open_zmq_ipc_path() - input_path = get_open_zmq_ipc_path() -- -- # Get output (EngineCoreOutput) from EngineCore. -- self.output_socket = self.ctx.socket(zmq.constants.PULL) -- self.output_socket.connect(output_path) -- -- # Send input (EngineCoreRequest) to EngineCore. -- self.input_socket = self.ctx.socket(zmq.constants.PUSH) -- self.input_socket.bind(input_path) -+ self.output_socket = make_zmq_socket(self.ctx, output_path, -+ zmq.constants.PULL) -+ self.input_socket = make_zmq_socket(self.ctx, input_path, -+ zmq.constants.PUSH) - - # Start EngineCore in background process. -- self.proc_handle: Optional[EngineCoreProcHandle] -- self.proc_handle = EngineCoreProc.make_engine_core_process( -- *args, -- input_path= -- input_path, # type: ignore[misc] # MyPy incorrectly flags duplicate keywords -- output_path=output_path, # type: ignore[misc] -- ready_path=ready_path, # type: ignore[misc] -- **kwargs, -- ) -- self._finalizer = weakref.finalize(self, self.shutdown) -+ self.proc_handle = BackgroundProcHandle( -+ input_path=input_path, -+ output_path=output_path, -+ process_name="EngineCore", -+ target_fn=EngineCoreProc.run_engine_core, -+ process_kwargs={ -+ "vllm_config": vllm_config, -+ "executor_class": executor_class, -+ "log_stats": log_stats, -+ }) - - def shutdown(self): -- # Shut down the zmq context. -- self.ctx.destroy(linger=0) -- -- if hasattr(self, "proc_handle") and self.proc_handle: -- # Shutdown the process if needed. 
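The failure path described in the comments above (a crashed child signals its parent, which then tears the whole tree down) is a small piece of signal plumbing; a self-contained illustration of the pattern rather than the patch's exact code:

    import os
    import signal

    def install_fatal_handler(kill_process_tree):
        # Parent side: any SIGUSR1 from a child is treated as unrecoverable.
        def handler(signum, frame):
            kill_process_tree(os.getpid())
        signal.signal(signal.SIGUSR1, handler)

    # Child side, on an unrecoverable error:
    #     psutil.Process().parent().send_signal(signal.SIGUSR1)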
-- if self.proc_handle.proc.is_alive(): -- self.proc_handle.proc.terminate() -- self.proc_handle.proc.join(5) -- -- if self.proc_handle.proc.is_alive(): -- kill_process_tree(self.proc_handle.proc.pid) -- -- # Remove zmq ipc socket files -- ipc_sockets = [ -- self.proc_handle.ready_path, self.proc_handle.output_path, -- self.proc_handle.input_path -- ] -- for ipc_socket in ipc_sockets: -- socket_file = ipc_socket.replace("ipc://", "") -- if os and os.path.exists(socket_file): -- os.remove(socket_file) -- self.proc_handle = None -+ """Clean up background resources.""" -+ if hasattr(self, "proc_handle"): -+ self.proc_handle.shutdown() -+ -+ self._finalizer() - - - class SyncMPClient(MPClient): - """Synchronous client for multi-proc EngineCore.""" - -- def __init__(self, *args, **kwargs): -- super().__init__(*args, asyncio_mode=False, **kwargs) -+ def __init__(self, -+ vllm_config: VllmConfig, -+ executor_class: Type[Executor], -+ log_stats: bool = False): -+ super().__init__( -+ asyncio_mode=False, -+ vllm_config=vllm_config, -+ executor_class=executor_class, -+ log_stats=log_stats, ++ if using_gqa_kernel: ++ key_cache, value_cache = split_kv_cache_ipexllm( ++ kv_cache, num_kv_heads, head_size) ++ ipex_ops.reshape_and_cache_ipexllm( ++ key[:num_actual_tokens], ++ value[:num_actual_tokens], ++ key_cache, ++ value_cache, ++ attn_metadata.slot_mapping.flatten(), ++ kv_cache_dtype, ++ k_scale, ++ v_scale, + ) - - def get_output(self) -> List[EngineCoreOutput]: - -@@ -225,8 +232,16 @@ class SyncMPClient(MPClient): - class AsyncMPClient(MPClient): - """Asyncio-compatible client for multi-proc EngineCore.""" - -- def __init__(self, *args, **kwargs): -- super().__init__(*args, asyncio_mode=True, **kwargs) -+ def __init__(self, -+ vllm_config: VllmConfig, -+ executor_class: Type[Executor], -+ log_stats: bool = False): -+ super().__init__( -+ asyncio_mode=True, -+ vllm_config=vllm_config, -+ executor_class=executor_class, -+ log_stats=log_stats, ++ else: ++ key_cache, value_cache = split_kv_cache( ++ kv_cache, num_kv_heads, head_size) ++ ipex_ops.reshape_and_cache( ++ key[:num_actual_tokens], ++ value[:num_actual_tokens], ++ key_cache, ++ value_cache, ++ attn_metadata.slot_mapping.flatten(), ++ kv_cache_dtype, ++ k_scale, ++ v_scale, + ) - - async def get_output_async(self) -> List[EngineCoreOutput]: - -diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py -index 02f34e2b5..65be9e58e 100644 ---- a/vllm/v1/engine/detokenizer.py -+++ b/vllm/v1/engine/detokenizer.py -@@ -8,7 +8,7 @@ from vllm.sampling_params import RequestOutputKind - from vllm.transformers_utils.detokenizer_utils import ( - AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) - from vllm.transformers_utils.tokenizer import get_tokenizer --from vllm.v1.engine import DetokenizerRequest, EngineCoreOutput -+from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest - - logger = init_logger(__name__) - -@@ -55,19 +55,19 @@ class IncrementalDetokenizer: - def from_new_request( - cls, - tokenizer: AnyTokenizer, -- request: DetokenizerRequest, -+ request: EngineCoreRequest, - ) -> "IncrementalDetokenizer": - - tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( - tokenizer=tokenizer, - prompt_ids=request.prompt_token_ids, -- skip_special_tokens=request.skip_special_tokens, -+ skip_special_tokens=request.sampling_params.skip_special_tokens, - ) - -- stops = request.stop -+ stops = request.sampling_params.stop - # Number of chars to hold back when stop strings are to be excluded - # from 
streamed output. -- if stops and not request.include_stop_str_in_output: -+ if stops and not request.sampling_params.include_stop_str_in_output: - stop_buffer_length = max(len(s) for s in stops) - 1 - else: - stop_buffer_length = 0 -@@ -79,13 +79,14 @@ class IncrementalDetokenizer: - # NOTE(Nick): could we take ownership of it though? - token_ids=request.prompt_token_ids.copy(), - stop=stops, -- include_stop_str_in_output=request.include_stop_str_in_output, -+ include_stop_str_in_output=request.sampling_params. -+ include_stop_str_in_output, - prefix_offset=prefix_offset, - read_offset=read_offset, -- skip_special_tokens=request.skip_special_tokens, -- spaces_between_special_tokens=request. -+ skip_special_tokens=request.sampling_params.skip_special_tokens, -+ spaces_between_special_tokens=request.sampling_params. - spaces_between_special_tokens, -- output_kind=request.output_kind, -+ output_kind=request.sampling_params.output_kind, - request_id=request.request_id, - prompt=request.prompt, - prompt_token_ids=request.prompt_token_ids, -@@ -227,7 +228,7 @@ class Detokenizer: - - def add_request( - self, -- request: DetokenizerRequest, -+ request: EngineCoreRequest, - ): - """Add new request to the Detokenizer.""" - ++ # Invoke chunked prefill method... ++ import vllm._C.ops ++ assert head_size == 128 or head_size == 64 ++ value = os.environ.get('USE_CONTEXT_V1') ++ query_len = attn_metadata.query_start_loc[1:] - attn_metadata.query_start_loc[:-1] ++ seq_len = attn_metadata.seq_start_loc[1:] - attn_metadata.seq_start_loc[:-1] ++ context_len = seq_len - query_len ++ if using_gqa_kernel: ++ # if using_gqa_kernel, then only the v1 kernel can be used ++ out = vllm._C.ops.context_attention_forward_v1(query[:num_actual_tokens], key_cache, value_cache, attn_metadata.block_table, attn_metadata.query_start_loc, seq_len, context_len, attn_metadata.max_seq_len, torch.amax(context_len).item()) ++ elif value is None: ++ # Otherwise, by default use v2 attention forward kernel... ++ out = vllm._C.ops.context_attention_forward_v2(query[:num_actual_tokens], key_cache, value_cache, attn_metadata.block_table, attn_metadata.query_start_loc, seq_len, context_len, attn_metadata.max_seq_len, torch.amax(context_len).item(), torch.amax(query_len).item()) ++ else: ++ out = vllm._C.ops.context_attention_forward_v1(query[:num_actual_tokens], key_cache, value_cache, attn_metadata.block_table, attn_metadata.query_start_loc, seq_len, context_len, attn_metadata.max_seq_len, torch.amax(context_len).item()) ++ ++ # output[:num_actual_tokens] = out ++ output[:num_actual_tokens] = out.view(out.shape[0], -1) ++ ++ ++ ++ ++@torch.library.custom_op("vllm::ipex_attn_chunked_prefill", ++ mutates_args=["output", "kv_cache"]) ++def ipex_attn_chunked_prefill( ++ output: torch.Tensor, ++ query: torch.Tensor, ++ key: torch.Tensor, ++ value: torch.Tensor, ++ num_heads: int, ++ head_size: int, ++ num_kv_heads: int, ++ kv_cache: torch.Tensor, ++ kv_cache_dtype: str, ++ k_scale: float, ++ v_scale: float, ++ scale: float, ++ sliding_window: Optional[List[int]] = None, ++ alibi_slopes: Optional[torch.Tensor] = None, ++ logits_soft_cap: Optional[float] = None, ++) -> None: ++ context = get_forward_context() ++ current_metadata = context.attn_metadata ++ if current_metadata is None: ++ # Profiling run. 
++ return ++ ++ assert current_metadata is not None ++ assert isinstance(current_metadata, FlashAttentionMetadata) ++ attn_metadata: FlashAttentionMetadata = current_metadata ++ num_actual_tokens = attn_metadata.num_actual_tokens ++ ++ query = query.view(-1, num_heads, head_size) ++ key = key.view(-1, num_kv_heads, head_size) ++ value = value.view(-1, num_kv_heads, head_size) ++ ++ # Reshape the input keys and values and store them in the cache. ++ key_cache = kv_cache[0] ++ value_cache = kv_cache[1] ++ ++ ipex_ops.reshape_and_cache_flash( ++ key[:num_actual_tokens], ++ value[:num_actual_tokens], ++ key_cache, ++ value_cache, ++ attn_metadata.slot_mapping, ++ kv_cache_dtype, ++ k_scale, ++ v_scale, ++ ) ++ ++ ipex_ops.chunked_prefill( ++ query[:num_actual_tokens], ++ key_cache, ++ value_cache, ++ output[:num_actual_tokens], ++ attn_metadata.query_start_loc, ++ attn_metadata.seq_start_loc, ++ None, ++ attn_metadata.block_table, ++ alibi_slopes, ++ attn_metadata.max_query_len, ++ attn_metadata.max_seq_len, ++ 0.0, ++ scale, ++ False, ++ True, ++ False, ++ None, ++ ) +diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py +index 39caca0c2..666e59eb3 100644 +--- a/vllm/v1/engine/core.py ++++ b/vllm/v1/engine/core.py +@@ -345,6 +345,9 @@ class EngineCoreProc(EngineCore): + ready_pipe, + **kwargs): + """Launch EngineCore busy loop in background process.""" ++ from ipex_llm.vllm.xpu.model_convert import _ipex_llm_convert ++ lowbit = os.getenv("IPEX_LLM_LOWBIT", "sym_int4") ++ _ipex_llm_convert(lowbit) + + # Signal handler used for graceful termination. + # SystemExit exception is only raised once to allow this and worker diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py -index b58f62778..093cb68e8 100644 +index 4c67186f7..daf8f539a 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py -@@ -42,11 +42,13 @@ class LLMEngine: - use_cached_outputs: bool = False, - multiprocess_mode: bool = False, - ) -> None: -- -- # TODO: Can we avoid this? - self.model_config = vllm_config.model_config +@@ -70,6 +70,10 @@ class LLMEngine: + self.should_execute_dummy_batch = False # Tokenizer (+ ensure liveness if running in another process). + # Create tokenizer, which is needed... @@ -38255,356 +14848,342 @@ index b58f62778..093cb68e8 100644 self.tokenizer = init_tokenizer_from_configs( model_config=vllm_config.model_config, scheduler_config=vllm_config.scheduler_config, -@@ -72,11 +74,11 @@ class LLMEngine: - - # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs) - self.engine_core = EngineCoreClient.make_client( -- vllm_config, -- executor_class, -- usage_context, - multiprocess_mode=multiprocess_mode, - asyncio_mode=False, -+ vllm_config=vllm_config, -+ executor_class=executor_class, -+ log_stats=False, - ) - - @classmethod -@@ -91,7 +93,7 @@ class LLMEngine: - - # Create the engine configs. 
- vllm_config = engine_args.create_engine_config(usage_context) -- executor_class = cls._get_executor_cls(vllm_config) -+ executor_class = Executor.get_class(vllm_config) - - if VLLM_ENABLE_V1_MULTIPROCESSING: - logger.debug("Enabling multiprocessing for LLMEngine.") -@@ -105,24 +107,6 @@ class LLMEngine: - stat_loggers=stat_loggers, - multiprocess_mode=enable_multiprocessing) - -- @classmethod -- def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]: -- executor_class: Type[Executor] -- distributed_executor_backend = ( -- vllm_config.parallel_config.distributed_executor_backend) -- if distributed_executor_backend == "ray": -- from vllm.v1.executor.ray_executor import RayExecutor -- executor_class = RayExecutor -- elif distributed_executor_backend == "mp": -- from vllm.v1.executor.multiproc_executor import MultiprocExecutor -- executor_class = MultiprocExecutor -- else: -- assert (distributed_executor_backend is None) -- from vllm.v1.executor.uniproc_executor import UniprocExecutor -- executor_class = UniprocExecutor -- -- return executor_class -- - def get_num_unfinished_requests(self) -> int: - return self.detokenizer.get_num_unfinished_requests() - -@@ -152,15 +136,17 @@ class LLMEngine: - ) -> None: - - # 1) Process raw inputs into the request. -- detokenizer_req, engine_core_req = self.processor.process_inputs( -- request_id, prompt, params, arrival_time, lora_request, -- trace_headers, prompt_adapter_request, priority) -+ request = self.processor.process_inputs(request_id, prompt, params, -+ arrival_time, lora_request, -+ trace_headers, -+ prompt_adapter_request, -+ priority) - - # 2) Add the request to Detokenizer. -- self.detokenizer.add_request(detokenizer_req) -+ self.detokenizer.add_request(request) - - # 3) Add the request to EngineCore. -- self.engine_core.add_request(engine_core_req) -+ self.engine_core.add_request(request) - - def step(self) -> List[RequestOutput]: - -@@ -177,8 +163,6 @@ class LLMEngine: - - return request_outputs - -- # TODO(rob): Can we get rid of these? -- - def get_model_config(self): - return self.model_config - -@@ -203,10 +187,3 @@ class LLMEngine: - f"found type: {type(tokenizer_group)}") - - return tokenizer_group -- -- def __del__(self): -- self.shutdown() -- -- def shutdown(self): -- if engine_core := getattr(self, "engine_core", None): -- engine_core.shutdown() -diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py -index 6ee8732bc..c0f6cfab4 100644 ---- a/vllm/v1/engine/processor.py -+++ b/vllm/v1/engine/processor.py -@@ -1,5 +1,5 @@ - import time --from typing import Mapping, Optional, Tuple, Union -+from typing import Mapping, Optional, Union - - from vllm.config import CacheConfig, LoRAConfig, ModelConfig - from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, -@@ -13,7 +13,7 @@ from vllm.pooling_params import PoolingParams - from vllm.prompt_adapter.request import PromptAdapterRequest - from vllm.sampling_params import SamplingParams - from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup --from vllm.v1.engine import DetokenizerRequest, EngineCoreRequest -+from vllm.v1.engine import EngineCoreRequest - from vllm.v1.engine.mm_input_mapper import MMHasher, MMInputMapperClient - - -@@ -49,9 +49,6 @@ class Processor: - cache_config.enable_prefix_caching - self.mm_hasher = MMHasher() - -- # TODO: run in an ThreadpoolExecutor or BackgroundProcess. -- # This ideally should releases the GIL, so we should not block the -- # asyncio loop while this is running. 
- def process_inputs( - self, - request_id: str, -@@ -62,7 +59,7 @@ class Processor: - trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, - priority: int = 0, -- ) -> Tuple[DetokenizerRequest, EngineCoreRequest]: -+ ) -> EngineCoreRequest: - - # TODO(woosuk): Support pooling models. - # TODO(woosuk): Check max_logprobs -@@ -113,30 +110,29 @@ class Processor: - - # For merged preprocessor, mm_data is already mm_inputs - precomputed_mm_inputs = None -- if isinstance(decoder_inputs.multi_modal_data, MultiModalKwargs): -- precomputed_mm_inputs = [decoder_inputs.multi_modal_data] -+ decoder_mm_data = decoder_inputs.multi_modal_data -+ if isinstance(decoder_mm_data, MultiModalKwargs): -+ # The output of merged multi-modal processor (`decoder_mm_data`) -+ # contains the kwargs for all items from all modalities. -+ # This code separates them so that there is one set of kwargs -+ # per item per modality. -+ precomputed_mm_inputs = [ -+ MultiModalKwargs.from_items([item]) -+ for modality in decoder_mm_data.modalities -+ for item in decoder_mm_data.get_items(modality) -+ ] - - # Apply MM mapper - mm_inputs = None -- if len(decoder_inputs.multi_modal_data) > 0: -+ if len(decoder_mm_data) > 0: - mm_inputs = self.mm_input_mapper_client.process_inputs( -- decoder_inputs.multi_modal_data, mm_hashes, -- decoder_inputs.mm_processor_kwargs, precomputed_mm_inputs) -+ decoder_mm_data, -+ mm_hashes, -+ decoder_inputs.mm_processor_kwargs, -+ precomputed_mm_inputs, -+ ) - -- # Make Request for Detokenizer. -- detokenizer_request = DetokenizerRequest( -- request_id, -- decoder_inputs.prompt, -- decoder_inputs.prompt_token_ids, -- sampling_params.skip_special_tokens, -- sampling_params.spaces_between_special_tokens, -- sampling_params.output_kind, -- sampling_params.stop, -- sampling_params.include_stop_str_in_output, -- ) -- -- # Make Request for EngineCore. 
-- engine_core_request = EngineCoreRequest( -+ return EngineCoreRequest( - request_id, - decoder_inputs.prompt, - decoder_inputs.prompt_token_ids, -@@ -149,8 +145,6 @@ class Processor: - lora_request, - ) - -- return detokenizer_request, engine_core_request -- - def _validate_model_inputs(self, inputs: ProcessorInputs): - if is_encoder_decoder_inputs(inputs): - # For encoder-decoder multimodal models, the max_prompt_len diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py -index 564d0447f..d424420d1 100644 +index e3a4cd98c..79fd179b6 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py -@@ -1,13 +1,40 @@ - from abc import ABC, abstractmethod --from typing import Tuple -+from typing import Tuple, Type - - from vllm.config import VllmConfig -+from vllm.platforms import current_platform - from vllm.v1.outputs import ModelRunnerOutput - - - class Executor(ABC): - """Abstract class for executors.""" - -+ @staticmethod -+ def get_class(vllm_config: VllmConfig) -> Type["Executor"]: -+ executor_class: Type[Executor] -+ distributed_executor_backend = ( -+ vllm_config.parallel_config.distributed_executor_backend) -+ if distributed_executor_backend == "ray": -+ if current_platform.is_cuda(): -+ from vllm.v1.executor.ray_executor import RayExecutor -+ executor_class = RayExecutor -+ elif current_platform.is_xpu(): -+ from vllm.v1.executor.xpu_ray_executor import RayXPUExecutor -+ executor_class = RayXPUExecutor -+ elif distributed_executor_backend == "mp": -+ from vllm.v1.executor.multiproc_executor import MultiprocExecutor -+ executor_class = MultiprocExecutor -+ else: -+ assert (distributed_executor_backend is None) -+ if current_platform.is_cuda(): -+ from vllm.v1.executor.uniproc_executor import UniprocExecutor -+ executor_class = UniprocExecutor -+ elif current_platform.is_xpu(): -+ from vllm.v1.executor.xpu_uniproc_executor import ( # noqa: E501 -+ XPUUniprocExecutor) -+ executor_class = XPUUniprocExecutor -+ return executor_class -+ - @abstractmethod - def __init__(self, vllm_config: VllmConfig) -> None: - raise NotImplementedError -diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py -index 128101aa6..41e6abbd6 100644 ---- a/vllm/v1/executor/multiproc_executor.py -+++ b/vllm/v1/executor/multiproc_executor.py -@@ -9,6 +9,7 @@ from enum import Enum, auto - from multiprocessing.process import BaseProcess - from typing import Any, Dict, List, Optional, Tuple - -+import psutil - import zmq - - from vllm.config import VllmConfig -@@ -17,13 +18,12 @@ from vllm.distributed import (destroy_distributed_environment, - from vllm.distributed.device_communicators.shm_broadcast import (Handle, - MessageQueue) - from vllm.executor.multiproc_worker_utils import ( -- _add_prefix, get_mp_context, set_multiprocessing_worker_envs) -+ _add_prefix, set_multiprocessing_worker_envs) - from vllm.logger import init_logger --from vllm.utils import (get_distributed_init_method, get_open_port, -- get_open_zmq_ipc_path) -+from vllm.utils import (get_distributed_init_method, get_mp_context, -+ get_open_port, get_open_zmq_ipc_path, zmq_socket_ctx) - from vllm.v1.executor.abstract import Executor - from vllm.v1.outputs import ModelRunnerOutput --from vllm.v1.utils import make_zmq_socket - from vllm.worker.worker_base import WorkerWrapperBase - - logger = init_logger(__name__) -@@ -39,6 +39,19 @@ class MultiprocExecutor(Executor): - # and ensure workers will be terminated. 
- self._finalizer = weakref.finalize(self, self.shutdown) - -+ # The child processes will send SIGUSR1 when unrecoverable -+ # errors happen. -+ def sigusr1_handler(signum, frame): -+ logger.fatal( -+ "MulitprocExecutor got fatal signal from worker processes, " -+ "shutting down. See stack trace above for root cause issue.") -+ # Propagate error up to parent process. -+ parent_process = psutil.Process().parent() -+ parent_process.send_signal(signal.SIGUSR1) -+ self.shutdown() -+ -+ signal.signal(signal.SIGUSR1, sigusr1_handler) -+ - self.vllm_config = vllm_config - self.parallel_config = vllm_config.parallel_config - -@@ -82,6 +95,7 @@ class MultiprocExecutor(Executor): - Initialize the KV caches and begin the model execution loop of the - underlying workers. - """ -+ logger.info("# GPU blocks: %d", num_gpu_blocks) - self.collective_rpc("initialize_cache", args=(num_gpu_blocks, )) - self.collective_rpc("compile_or_warm_up_model") - -@@ -250,7 +264,7 @@ class WorkerProc: - worker_response_mq_handle = self.worker_response_mq.export_handle() - - # Send Readiness signal to EngineCore process. -- with make_zmq_socket(ready_path, zmq.constants.PUSH) as ready_socket: -+ with zmq_socket_ctx(ready_path, zmq.constants.PUSH) as ready_socket: - payload = pickle.dumps(worker_response_mq_handle, - protocol=pickle.HIGHEST_PROTOCOL) - ready_socket.send_string(WorkerProc.READY_STR) -@@ -336,8 +350,11 @@ class WorkerProc: - except SystemExit: - logger.debug("Worker interrupted.") - -- except BaseException as e: -- logger.exception(e) -+ except Exception: -+ # worker_busy_loop sends exceptions exceptons to Executor -+ # for shutdown, but if there is an error in startup or an -+ # error with IPC itself, we need to alert the parent. -+ psutil.Process().parent().send_signal(signal.SIGUSR1) - raise - - finally: -@@ -352,7 +369,7 @@ class WorkerProc: - ready_path: str, - ) -> Optional[Handle]: - """Wait until the Worker is ready.""" -- with make_zmq_socket(ready_path, zmq.constants.PULL) as socket: -+ with zmq_socket_ctx(ready_path, zmq.constants.PULL) as socket: - - # Wait for Worker to send READY. - while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: -@@ -378,9 +395,10 @@ class WorkerProc: - - try: - output = getattr(self.worker, method)(*args, **kwargs) -- except BaseException as e: -+ except Exception as e: - self.worker_response_mq.enqueue( - (WorkerProc.ResponseStatus.FAILURE, e)) -+ logger.exception("WorkerProc hit an exception: %s", exc_info=e) - continue - - self.worker_response_mq.enqueue( +@@ -35,9 +35,15 @@ class Executor(ExecutorBase): + f"ExecutorBase. 
Got {distributed_executor_backend}.") + executor_class = distributed_executor_backend + elif distributed_executor_backend == "ray": +- from vllm.v1.executor.ray_distributed_executor import ( # noqa +- RayDistributedExecutor) +- executor_class = RayDistributedExecutor ++ from vllm.platforms import current_platform ++ if current_platform.is_xpu(): ++ from vllm.v1.executor.ray_distributed_executor import ( # noqa ++ XPURayDistributedExecutor) ++ executor_class = XPURayDistributedExecutor ++ else: ++ from vllm.v1.executor.ray_distributed_executor import ( # noqa ++ RayDistributedExecutor) ++ executor_class = RayDistributedExecutor + elif distributed_executor_backend == "mp": + from vllm.v1.executor.multiproc_executor import MultiprocExecutor + executor_class = MultiprocExecutor +diff --git a/vllm/v1/executor/ray_distributed_executor.py b/vllm/v1/executor/ray_distributed_executor.py +index 320ebfd37..6ca3179af 100644 +--- a/vllm/v1/executor/ray_distributed_executor.py ++++ b/vllm/v1/executor/ray_distributed_executor.py +@@ -59,3 +59,30 @@ class RayDistributedExecutor(RayDistributedExecutorV0, Executor): + # When PP is used, we return a FutureWrapper immediately so that + # the scheduler can yield to the next batch. + return FutureWrapper(refs[0]) ++ ++class XPURayDistributedExecutor(RayDistributedExecutorV0, Executor): ++ """XPU Ray distributed executor without Compiled Graphs.""" ++ ++ def __init__(self, *args, **kwargs): ++ import os ++ lowbit = os.getenv("IPEX_LLM_LOWBIT", None) ++ if lowbit is not None: ++ from ipex_llm.vllm.xpu.model_convert import _ipex_llm_convert ++ _ipex_llm_convert(lowbit) ++ super().__init__(*args, **kwargs) ++ ++ ++ def execute_model( ++ self, ++ scheduler_output, ++ ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]: ++ output = self.collective_rpc("execute_model", ++ args=(scheduler_output, )) ++ return output[0] ++ ++ @property ++ def max_concurrent_batches(self) -> int: ++ """Ray distributed executor supports pipeline parallelism, ++ meaning that it allows PP size batches to be executed concurrently. ++ """ ++ return self.parallel_config.pipeline_parallel_size diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py -index 7733610e5..cce9315e8 100644 ---- a/vllm/v1/executor/ray_utils.py +new file mode 100644 +index 000000000..cce9315e8 +--- /dev/null +++ b/vllm/v1/executor/ray_utils.py -@@ -51,7 +51,8 @@ try: - # We can remove this API after it is fixed in compiled graph. 
- import torch - assert self.worker is not None, "Worker is not initialized" -- if not self.compiled_dag_cuda_device_set: +@@ -0,0 +1,272 @@ ++import time ++from collections import defaultdict ++from typing import TYPE_CHECKING, Dict, List, Optional, Tuple ++ ++from vllm.config import ParallelConfig ++from vllm.logger import init_logger ++from vllm.platforms import current_platform ++from vllm.utils import get_ip ++from vllm.v1.outputs import ModelRunnerOutput ++from vllm.worker.worker_base import WorkerWrapperBase ++ ++if TYPE_CHECKING: ++ from vllm.v1.core.scheduler import SchedulerOutput ++ ++logger = init_logger(__name__) ++PG_WAIT_TIMEOUT = 60 ++ ++try: ++ import ray ++ from ray.util import placement_group_table ++ from ray.util.placement_group import PlacementGroup ++ try: ++ from ray._private.state import available_resources_per_node ++ except ImportError: ++ # Ray 2.9.x doesn't expose `available_resources_per_node` ++ from ray._private.state import state as _state ++ available_resources_per_node = _state._available_resources_per_node ++ ++ class RayWorkerWrapper(WorkerWrapperBase): ++ ++ def __init__(self, *args, **kwargs) -> None: ++ super().__init__(*args, **kwargs) ++ # Since the compiled DAG runs a main execution ++ # in a different thread that calls cuda.set_device. ++ # The flag indicates is set_device is called on ++ # that thread. It will be removed soon. ++ self.compiled_dag_cuda_device_set = False ++ ++ def get_node_ip(self) -> str: ++ return get_ip() ++ ++ def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]: ++ node_id = ray.get_runtime_context().get_node_id() ++ gpu_ids = ray.get_gpu_ids() ++ return node_id, gpu_ids ++ ++ def setup_device_if_necessary(self): ++ # TODO(swang): This is needed right now because Ray CG executes ++ # on a background thread, so we need to reset torch's current ++ # device. ++ # We can remove this API after it is fixed in compiled graph. ++ import torch ++ assert self.worker is not None, "Worker is not initialized" + if not self.compiled_dag_cuda_device_set \ + and current_platform.is_cuda(): - torch.cuda.set_device(self.worker.device) - self.compiled_dag_cuda_device_set = True - ++ torch.cuda.set_device(self.worker.device) ++ self.compiled_dag_cuda_device_set = True ++ ++ def execute_model( ++ self, ++ scheduler_output: "SchedulerOutput", ++ ) -> ModelRunnerOutput: ++ self.setup_device_if_necessary() ++ assert self.worker is not None, "Worker is not initialized" ++ output = self.worker.model_runner.execute_model(scheduler_output) ++ return output ++ ++ ray_import_err = None ++ ++except ImportError as e: ++ ray = None # type: ignore ++ ray_import_err = e ++ RayWorkerWrapper = None # type: ignore ++ ++ ++def ray_is_available() -> bool: ++ """Returns True if Ray is available.""" ++ return ray is not None ++ ++ ++def assert_ray_available(): ++ """ ++ Raise an exception if Ray is not available. ++ """ ++ if ray is None: ++ raise ValueError("Failed to import Ray, please install Ray with " ++ "`pip install ray`.") from ray_import_err ++ ++ ++def _verify_bundles(placement_group: "PlacementGroup", ++ parallel_config: ParallelConfig, device_str: str): ++ """ ++ Verify a given placement group has bundles located in the right place. ++ ++ There are 2 rules. ++ - Warn if all tensor parallel workers cannot fit in a single node. ++ - Fail if driver node is not included in a placement group. ++ ++ Args: ++ placement_group: The placement group to verify. ++ parallel_config: The parallel configuration. ++ device_str: The required device. 
++ """ ++ assert ray.is_initialized(), ( ++ "Ray is not initialized although distributed-executor-backend is ray.") ++ pg_data = placement_group_table(placement_group) ++ # bundle_idx -> node_id ++ bundle_to_node_ids = pg_data["bundles_to_node_id"] ++ # bundle_idx -> bundle (e.g., {"GPU": 1}) ++ bundles = pg_data["bundles"] ++ # node_id -> List of bundle (e.g., {"GPU": 1}) ++ node_id_to_bundle: Dict[str, List[Dict[str, float]]] = defaultdict(list) ++ ++ for bundle_idx, node_id in bundle_to_node_ids.items(): ++ node_id_to_bundle[node_id].append(bundles[bundle_idx]) ++ driver_node_id = ray.get_runtime_context().get_node_id() ++ ++ if driver_node_id not in node_id_to_bundle: ++ raise RuntimeError( ++ f"driver node id {driver_node_id} is not included in a placement " ++ f"group {placement_group.id}. Node id -> bundles " ++ f"{node_id_to_bundle}. " ++ "You don't have enough GPUs available in a current node. Check " ++ "`ray status` to see if you have available GPUs in a node " ++ f"{driver_node_id} before starting an vLLM engine.") ++ ++ for node_id, bundles in node_id_to_bundle.items(): ++ if len(bundles) < parallel_config.tensor_parallel_size: ++ logger.warning( ++ "tensor_parallel_size=%d " ++ "is bigger than a reserved number of %ss (%d " ++ "%ss) in a node %s. Tensor parallel workers can be " ++ "spread out to 2+ nodes which can degrade the performance " ++ "unless you have fast interconnect across nodes, like " ++ "Infiniband. To resolve this issue, make sure you have more " ++ "than %d GPUs available at each node.", ++ parallel_config.tensor_parallel_size, device_str, len(bundles), ++ device_str, node_id, parallel_config.tensor_parallel_size) ++ ++ ++def _wait_until_pg_ready(current_placement_group: "PlacementGroup"): ++ """Wait until a placement group is ready. ++ ++ It prints the informative log messages if the placement group is ++ not created within time. ++ ++ """ ++ # Wait until PG is ready - this will block until all ++ # requested resources are available, and will timeout ++ # if they cannot be provisioned. ++ placement_group_specs = current_placement_group.bundle_specs ++ ++ s = time.time() ++ pg_ready_ref = current_placement_group.ready() ++ wait_interval = 10 ++ while time.time() - s < PG_WAIT_TIMEOUT: ++ ready, _ = ray.wait([pg_ready_ref], timeout=wait_interval) ++ if len(ready) > 0: ++ break ++ ++ # Exponential backoff for warning print. ++ wait_interval *= 2 ++ logger.info( ++ "Waiting for creating a placement group of specs for " ++ "%d seconds. specs=%s. Check " ++ "`ray status` to see if you have enough resources.", ++ int(time.time() - s), placement_group_specs) ++ ++ try: ++ ray.get(pg_ready_ref, timeout=0) ++ except ray.exceptions.GetTimeoutError: ++ raise ValueError( ++ "Cannot provide a placement group of " ++ f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See " ++ "`ray status` to make sure the cluster has enough resources." ++ ) from None ++ ++ ++def initialize_ray_cluster( ++ parallel_config: ParallelConfig, ++ ray_address: Optional[str] = None, ++): ++ """Initialize the distributed cluster with Ray. ++ ++ it will connect to the Ray cluster and create a placement group ++ for the workers, which includes the specification of the resources ++ for each distributed worker. ++ ++ Args: ++ parallel_config: The configurations for parallel execution. ++ ray_address: The address of the Ray cluster. If None, uses ++ the default Ray cluster address. ++ """ ++ assert_ray_available() ++ ++ # Connect to a ray cluster. 
++ if current_platform.is_rocm() or current_platform.is_xpu(): ++ # Try to connect existing ray instance and create a new one if not found ++ try: ++ ray.init("auto") ++ except ConnectionError: ++ logger.warning( ++ "No existing RAY instance detected. " ++ "A new instance will be launched with current node resources.") ++ ray.init(address=ray_address, ++ ignore_reinit_error=True, ++ num_gpus=parallel_config.world_size) ++ else: ++ ray.init(address=ray_address, ignore_reinit_error=True) ++ ++ if parallel_config.placement_group: ++ # Placement group is already set. ++ return ++ ++ device_str = "GPU" if not current_platform.is_tpu() else "TPU" ++ # Create placement group for worker processes ++ current_placement_group = ray.util.get_current_placement_group() ++ if current_placement_group: ++ # We are in a placement group ++ bundles = current_placement_group.bundle_specs ++ # Verify that we can use the placement group. ++ device_bundles = 0 ++ for bundle in bundles: ++ bundle_devices = bundle.get(device_str, 0) ++ if bundle_devices > 1: ++ raise ValueError( ++ "Placement group bundle cannot have more than 1 " ++ f"{device_str}.") ++ if bundle_devices: ++ device_bundles += 1 ++ if parallel_config.world_size > device_bundles: ++ raise ValueError( ++ f"The number of required {device_str}s exceeds the total " ++ f"number of available {device_str}s in the placement group." ++ f"Required number of devices: {parallel_config.world_size}. " ++ f"Total number of devices: {device_bundles}.") ++ else: ++ num_devices_in_cluster = ray.cluster_resources().get(device_str, 0) ++ if parallel_config.world_size > num_devices_in_cluster: ++ raise ValueError( ++ f"The number of required {device_str}s exceeds the total " ++ f"number of available {device_str}s in the placement group.") ++ # Create a new placement group ++ placement_group_specs: List[Dict[str, float]] = ([{ ++ device_str: 1.0 ++ } for _ in range(parallel_config.world_size)]) ++ ++ # vLLM engine is also a worker to execute model with an accelerator, ++ # so it requires to have the device in a current node. Check if ++ # the current node has at least one device. ++ current_ip = get_ip() ++ current_node_id = ray.get_runtime_context().get_node_id() ++ current_node_resource = available_resources_per_node()[current_node_id] ++ if current_node_resource.get(device_str, 0) < 1: ++ raise ValueError( ++ f"Current node has no {device_str} available. " ++ f"{current_node_resource=}. vLLM engine cannot start without " ++ f"{device_str}. Make sure you have at least 1 {device_str} " ++ f"available in a node {current_node_id=} {current_ip=}.") ++ # This way, at least bundle is required to be created in a current ++ # node. ++ placement_group_specs[0][f"node:{current_ip}"] = 0.001 ++ ++ # By default, Ray packs resources as much as possible. 
++ current_placement_group = ray.util.placement_group( ++ placement_group_specs, strategy="PACK") ++ _wait_until_pg_ready(current_placement_group) ++ ++ assert current_placement_group is not None ++ _verify_bundles(current_placement_group, parallel_config, device_str) ++ # Set the placement group in the parallel config ++ parallel_config.placement_group = current_placement_group diff --git a/vllm/v1/executor/xpu_ray_executor.py b/vllm/v1/executor/xpu_ray_executor.py new file mode 100644 index 000000000..ed3948331 @@ -38679,557 +15258,52 @@ index 000000000..a55d6e96d + rank=rank, + distributed_init_method=distributed_init_method, + ) -diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py -index c088c3c12..f2007d85c 100644 ---- a/vllm/v1/sample/ops/topk_topp_sampler.py -+++ b/vllm/v1/sample/ops/topk_topp_sampler.py -@@ -44,7 +44,7 @@ class TopKTopPSampler(nn.Module): - logger.warning( - "FlashInfer is not available. Falling back to the PyTorch-" - "native implementation of top-p & top-k sampling. For the " -- "best performance, please install FalshInfer.") -+ "best performance, please install FlashInfer.") - self.forward = self.forward_native - else: - self.forward = self.forward_native -diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py -index e802c6439..b0a7affbe 100644 ---- a/vllm/v1/utils.py -+++ b/vllm/v1/utils.py -@@ -1,11 +1,12 @@ -+import multiprocessing -+import os -+import weakref - from collections.abc import Sequence --from contextlib import contextmanager --from typing import (Any, Generic, Iterator, List, Optional, TypeVar, Union, -- overload) -- --import zmq -+from typing import (Any, Callable, Dict, Generic, List, Optional, TypeVar, -+ Union, overload) - - from vllm.logger import init_logger -+from vllm.utils import get_mp_context, kill_process_tree - - logger = init_logger(__name__) - -@@ -77,27 +78,59 @@ class ConstantList(Generic[T], Sequence): - return len(self._x) - - --@contextmanager --def make_zmq_socket( -- path: str, -- type: Any) -> Iterator[zmq.Socket]: # type: ignore[name-defined] -- """Context manager for a ZMQ socket""" -- -- ctx = zmq.Context() # type: ignore[attr-defined] -- try: -- socket = ctx.socket(type) -- -- if type == zmq.constants.PULL: -- socket.connect(path) -- elif type == zmq.constants.PUSH: -- socket.bind(path) -- else: -- raise ValueError(f"Unknown Socket Type: {type}") -- -- yield socket -- -- except KeyboardInterrupt: -- logger.debug("Worker had Keyboard Interrupt.") -- -- finally: -- ctx.destroy(linger=0) -+class BackgroundProcHandle: -+ """ -+ Utility class to handle creation, readiness, and shutdown -+ of background processes used by the AsyncLLM and LLMEngine. -+ """ -+ -+ def __init__( -+ self, -+ input_path: str, -+ output_path: str, -+ process_name: str, -+ target_fn: Callable, -+ process_kwargs: Dict[Any, Any], -+ ): -+ context = get_mp_context() -+ reader, writer = context.Pipe(duplex=False) -+ -+ assert ("ready_pipe" not in process_kwargs -+ and "input_path" not in process_kwargs -+ and "output_path" not in process_kwargs) -+ process_kwargs["ready_pipe"] = writer -+ process_kwargs["input_path"] = input_path -+ process_kwargs["output_path"] = output_path -+ -+ # Run busy loop in background process. -+ self.proc = context.Process(target=target_fn, kwargs=process_kwargs) -+ self._finalizer = weakref.finalize(self, shutdown, self.proc, -+ input_path, output_path) -+ self.proc.start() -+ -+ # Wait for startup. 
-+ if reader.recv()["status"] != "READY": -+ raise RuntimeError(f"{process_name} initialization failed. " -+ "See root cause above.") -+ -+ def shutdown(self): -+ self._finalizer() -+ -+ -+# Note(rob): shutdown function cannot be a bound method, -+# else the gc cannot collect the object. -+def shutdown(proc: multiprocessing.Process, input_path: str, output_path: str): -+ # Shutdown the process. -+ if proc.is_alive(): -+ proc.terminate() -+ proc.join(5) -+ -+ if proc.is_alive(): -+ kill_process_tree(proc.pid) -+ -+ # Remove zmq ipc socket files. -+ ipc_sockets = [output_path, input_path] -+ for ipc_socket in ipc_sockets: -+ socket_file = ipc_socket.replace("ipc://", "") -+ if os and os.path.exists(socket_file): -+ os.remove(socket_file) -diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py -new file mode 100644 -index 000000000..26a2084b1 ---- /dev/null -+++ b/vllm/v1/worker/block_table.py -@@ -0,0 +1,78 @@ -+from typing import List -+ -+import numpy as np -+import torch -+ -+from vllm.logger import init_logger -+ -+logger = init_logger(__name__) -+ -+ -+class BlockTable: -+ -+ def __init__( -+ self, -+ max_num_reqs: int, -+ max_model_len: int, -+ max_num_blocks_per_req: int, -+ pin_memory: bool, -+ device: torch.device, -+ ): -+ self.max_num_reqs = max_num_reqs -+ self.max_model_len = max_model_len -+ self.max_num_blocks_per_req = max_num_blocks_per_req -+ self.pin_memory = pin_memory -+ self.device = device -+ -+ self.block_table = torch.zeros( -+ (max_num_reqs, max_num_blocks_per_req), -+ device=self.device, -+ dtype=torch.int32, -+ ) -+ self.block_table_cpu = torch.zeros( -+ (max_num_reqs, max_num_blocks_per_req), -+ device="cpu", -+ dtype=torch.int32, -+ pin_memory=pin_memory, -+ ) -+ self.block_table_np = self.block_table_cpu.numpy() -+ self.num_blocks_per_row = np.zeros(max_num_reqs, dtype=np.int32) -+ -+ def append_row( -+ self, -+ row_idx: int, -+ start: int, -+ block_ids: List[int], -+ ) -> None: -+ num_blocks = len(block_ids) -+ self.block_table_np[row_idx, start:start + num_blocks] = block_ids -+ self.num_blocks_per_row[row_idx] = start + num_blocks -+ -+ def add_row(self, row_idx: int, block_ids: List[int]) -> None: -+ self.append_row(row_idx, 0, block_ids) -+ -+ def move_row(self, src: int, tgt: int) -> None: -+ num_blocks = self.num_blocks_per_row[src] -+ self.block_table_np[tgt, :num_blocks] = self.block_table_np[ -+ src, :num_blocks] -+ self.num_blocks_per_row[tgt] = num_blocks -+ -+ def commit(self, num_reqs: int) -> None: -+ self.block_table[:num_reqs].copy_(self.block_table_cpu[:num_reqs], -+ non_blocking=True) -+ -+ def clear(self) -> None: -+ self.block_table.fill_(0) -+ self.block_table_cpu.fill_(0) -+ -+ def get_device_tensor(self) -> torch.Tensor: -+ """Ruturns the device tensor of the block table.""" -+ return self.block_table -+ -+ def get_cpu_tensor(self) -> torch.Tensor: -+ """Returns the CPU tensor of the block table.""" -+ return self.block_table_cpu -+ -+ def get_numpy_array(self) -> np.ndarray: -+ """Returns the numpy array of the block table.""" -+ return self.block_table_np -diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py -index 6c4d300ec..40494e64b 100644 ---- a/vllm/v1/worker/gpu_input_batch.py -+++ b/vllm/v1/worker/gpu_input_batch.py -@@ -9,6 +9,7 @@ import torch - from vllm.multimodal import MultiModalKwargs - from vllm.sampling_params import SamplingParams, SamplingType - from vllm.v1.sample.metadata import SamplingMetadata -+from vllm.v1.worker.block_table import BlockTable - - if 
TYPE_CHECKING: - from vllm.multimodal.inputs import PlaceholderRange -@@ -57,29 +58,27 @@ class InputBatch: - - # TODO(woosuk): This buffer could be too large if max_model_len is big. - # Find a way to reduce the CPU memory usage. -+ # This buffer is not directly transferred to the GPU, so it does not -+ # need to be pinned. - self.token_ids_cpu_tensor = torch.zeros( - (max_num_reqs, max_model_len), - device="cpu", - dtype=torch.int32, -- pin_memory=pin_memory, -+ pin_memory=False, - ) - self.token_ids_cpu = self.token_ids_cpu_tensor.numpy() -- self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32) -+ self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32) - self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32) -+ self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32) - -- # Attention-related. -- self.block_table = torch.zeros( -- (max_num_reqs, max_num_blocks_per_req), -- device=self.device, -- dtype=torch.int32, -- ) -- self.block_table_cpu_tensor = torch.zeros( -- (max_num_reqs, max_num_blocks_per_req), -- device="cpu", -- dtype=torch.int32, -+ # Block table. -+ self.block_table = BlockTable( -+ max_num_reqs=max_num_reqs, -+ max_model_len=max_model_len, -+ max_num_blocks_per_req=max_num_blocks_per_req, - pin_memory=pin_memory, -+ device=device, - ) -- self.block_table_cpu = self.block_table_cpu_tensor.numpy() - - # Sampling-related. - self.temperature = torch.empty((max_num_reqs, ), -@@ -187,10 +186,10 @@ class InputBatch: - end_idx = start_idx + len(request.output_token_ids) - self.token_ids_cpu[req_index, - start_idx:end_idx] = request.output_token_ids -+ self.num_tokens[req_index] = request.num_tokens - - self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens -- num_blocks = len(request.block_ids) -- self.block_table_cpu[req_index, :num_blocks] = request.block_ids -+ self.block_table.add_row(req_index, request.block_ids) - - sampling_params = request.sampling_params - self.temperature_cpu[req_index] = sampling_params.temperature -@@ -288,16 +287,15 @@ class InputBatch: - self.req_ids[last_req_index] = None - self.req_id_to_index[req_id] = empty_index - -- # TODO(woosuk): Optimize the copy of token_ids_cpu and -- # block_table_cpu. -- self.token_ids_cpu[empty_index] = self.token_ids_cpu[ -- last_req_index] -+ num_tokens = self.num_tokens[last_req_index] -+ self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[ -+ last_req_index, :num_tokens] -+ self.num_tokens[empty_index] = num_tokens - self.num_prompt_tokens[empty_index] = \ - self.num_prompt_tokens[last_req_index] - self.num_computed_tokens_cpu[ - empty_index] = self.num_computed_tokens_cpu[last_req_index] -- self.block_table_cpu[empty_index] = self.block_table_cpu[ -- last_req_index] -+ self.block_table.move_row(last_req_index, empty_index) - self.temperature_cpu[empty_index] = self.temperature_cpu[ - last_req_index] - self.top_p_cpu[empty_index] = self.top_p_cpu[last_req_index] diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py -index 509771b7e..d7b1102f2 100644 +index 5133c637f..1c6ed43cf 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py -@@ -72,6 +72,8 @@ class GPUModelRunner: - # Model-related. 
- self.num_attn_layers = model_config.get_num_layers_by_block_type( - parallel_config, LayerBlockType.attention) -+ self.num_query_heads = model_config.get_num_attention_heads( -+ parallel_config) - self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) - self.head_size = model_config.get_head_size() - self.hidden_size = model_config.get_hidden_size() -@@ -118,6 +120,10 @@ class GPUModelRunner: - self.cudagraph_batch_sizes = list( - reversed(self.vllm_config.compilation_config.capture_sizes)) - -+ # Cache the device properties. -+ self.device_properties = torch.cuda.get_device_properties(self.device) -+ self.num_sms = self.device_properties.multi_processor_count -+ - # Persistent buffers for CUDA graphs. - self.input_ids = torch.zeros(self.max_num_tokens, - dtype=torch.int32, -@@ -131,7 +137,8 @@ class GPUModelRunner: - device=self.device) - - # OPTIMIZATION: Cache the tensors rather than creating them every step. -- self.arange_np = np.arange(max(self.max_num_reqs, self.max_model_len), -+ self.arange_np = np.arange(max(self.max_num_reqs + 1, -+ self.max_model_len), - dtype=np.int32) - # NOTE(woosuk): These tensors are "stateless", i.e., they are literally - # a faster version of creating a new tensor every time. Thus, we should -@@ -204,10 +211,9 @@ class GPUModelRunner: - if num_new_blocks == 0: - continue - start_index = len(req_state.block_ids) -- end_index = start_index + num_new_blocks - req_state.block_ids.extend(req_data.new_block_ids) -- self.input_batch.block_table_cpu[ -- req_index, start_index:end_index] = req_data.new_block_ids -+ self.input_batch.block_table.append_row(req_index, start_index, -+ req_data.new_block_ids) - - req_ids_to_add: List[str] = [] - # Add new requests to the cached states. -@@ -268,9 +274,7 @@ class GPUModelRunner: - - # OPTIMIZATION: Start copying the block table first. - # This way, we can overlap the copy with the following CPU operations. -- self.input_batch.block_table[:num_reqs].copy_( -- self.input_batch.block_table_cpu_tensor[:num_reqs], -- non_blocking=True) -+ self.input_batch.block_table.commit(num_reqs) - - # Get the number of scheduled tokens for each request. - # TODO: The Python loop can be slow. Optimize. -@@ -326,8 +330,8 @@ class GPUModelRunner: - # NOTE(woosuk): We use torch.index_select instead of np.take here - # because torch.index_select is much faster than np.take for large - # tensors. -- block_numbers = (self.input_batch.block_table_cpu_tensor.flatten() -- [block_table_indices].numpy()) -+ block_table_cpu = self.input_batch.block_table.get_cpu_tensor() -+ block_numbers = block_table_cpu.flatten()[block_table_indices].numpy() - block_offsets = positions_np % self.block_size - np.add(block_numbers * self.block_size, - block_offsets, -@@ -355,14 +359,102 @@ class GPUModelRunner: - self.device, non_blocking=True) - slot_mapping = self.slot_mapping_cpu[:total_num_scheduled_tokens].to( - self.device, non_blocking=True).long() -+ -+ # Prepare for cascade attention if needed. -+ common_prefix_len = (scheduler_output.num_common_prefix_blocks * -+ self.block_size) -+ if common_prefix_len == 0: -+ # Common case. -+ use_cascade = False -+ else: -+ # NOTE(woosuk): Cascade attention uses two attention kernels: one -+ # for the common prefix and the other for the rest. For the first -+ # kernel, we concatenate all the query tokens (possibly from -+ # different requests) and treat them as if they are from the same -+ # request. Then, we use bi-directional attention to process the -+ # common prefix in the KV cache. 
Importantly, this means that the -+ # first kernel does not do any masking. -+ -+ # Consider the following example: -+ # Request 1's input query: [D, E, X] -+ # Request 1's kv cache: [A, B, C, D, E, X] -+ # Request 1's num_computed_tokens: 3 (i.e., [A, B, C]) -+ # Request 2's input query: [E, Y] -+ # Request 2's kv cache: [A, B, C, D, E, Y] -+ # Request 2's num_computed_tokens: 4 (i.e., [A, B, C, D]) -+ -+ # If we use [A, B, C, D, E] as the common prefix, then the -+ # first kernel will compute the bi-directional attention between -+ # input query [D, E, X, E, Y] and common prefix [A, B, C, D, E]. -+ # However, this is wrong because D in Request 1 should not attend to -+ # E in the common prefix (i.e., we need masking). -+ # To avoid this, [A, B, C, D] should be the common prefix. -+ # That is, the common prefix should be capped by the minimum -+ # num_computed_tokens among the requests, and plus one to include -+ # the first token of the query. -+ -+ # In practice, we use [A, B, C] as the common prefix, instead of -+ # [A, B, C, D] (i.e., the common prefix is capped by the minimum -+ # num_computed_tokens, without plus one). -+ # This is because of an implementation detail: We want to always -+ # use two kernels for cascade attention. Let's imagine: -+ # Request 3's input query: [D] -+ # Request 3's kv cache: [A, B, C, D] -+ # Request 3's num_computed_tokens: 4 (i.e., [A, B, C, D]) -+ # If we use [A, B, C, D] as the common prefix for Request 1-3, -+ # then Request 3 will be processed only by the first kernel, -+ # and the second kernel will get an empty input. While this is not -+ # a fundamental problem, our current implementation does not support -+ # this case. -+ common_prefix_len = min( -+ common_prefix_len, -+ self.input_batch.num_computed_tokens_cpu[:num_reqs].min()) -+ # common_prefix_len should be a multiple of the block size. -+ common_prefix_len = (common_prefix_len // self.block_size * -+ self.block_size) -+ use_cascade = FlashAttentionBackend.use_cascade_attention( -+ common_prefix_len=common_prefix_len, -+ query_lens=num_scheduled_tokens, -+ num_query_heads=self.num_query_heads, -+ num_kv_heads=self.num_kv_heads, -+ use_alibi=False, # FIXME -+ use_sliding_window=self.sliding_window is not None, -+ num_sms=self.num_sms, -+ ) -+ -+ if use_cascade: -+ # TODO: Optimize. -+ cu_prefix_query_lens = torch.tensor( -+ [0, total_num_scheduled_tokens], -+ dtype=torch.int32, -+ device=self.device) -+ cu_prefix_kv_lens = torch.tensor([0, common_prefix_len], -+ dtype=torch.int32, -+ device=self.device) -+ cu_suffix_kv_lens = ( -+ self.seq_start_loc_np[:num_reqs + 1] - -+ self.arange_np[:num_reqs + 1] * common_prefix_len) -+ cu_suffix_kv_lens = torch.from_numpy(cu_suffix_kv_lens).to( -+ self.device) -+ else: -+ cu_prefix_query_lens = None -+ cu_prefix_kv_lens = None -+ cu_suffix_kv_lens = None -+ - attn_metadata = FlashAttentionMetadata( - num_actual_tokens=total_num_scheduled_tokens, - max_query_len=max_num_scheduled_tokens, - query_start_loc=query_start_loc, - max_seq_len=max_seq_len, - seq_start_loc=seq_start_loc, -- block_table=self.input_batch.block_table[:num_reqs], -+ block_table=( -+ self.input_batch.block_table.get_device_tensor()[:num_reqs]), - slot_mapping=slot_mapping, -+ use_cascade=use_cascade, -+ common_prefix_len=common_prefix_len, -+ cu_prefix_query_lens=cu_prefix_query_lens, -+ cu_prefix_kv_lens=cu_prefix_kv_lens, -+ cu_suffix_kv_lens=cu_suffix_kv_lens, - ) - # NOTE(woosuk): Due to chunked prefills, there can be at most 1 partial - # request in the batch. 
While we should not sample any token from this -@@ -550,6 +642,7 @@ class GPUModelRunner: - # Append the sampled token to the output token ids. - token_id = sampled_token_ids[i] - self.input_batch.token_ids_cpu[i, seq_len] = token_id -+ self.input_batch.num_tokens[i] += 1 - req_state.output_token_ids.append(token_id) - else: - # Ignore the sampled token from the partial request. -@@ -584,6 +677,7 @@ class GPUModelRunner: - return model_runner_output +@@ -1255,6 +1255,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): + return draft_token_ids def load_model(self) -> None: + # We will need to replace the model loading position here... logger.info("Starting to load model %s...", self.model_config.model) with DeviceMemoryProfiler() as m: # noqa: SIM117 - self.model = get_model(vllm_config=self.vllm_config) -@@ -647,10 +741,23 @@ class GPUModelRunner: - self.mm_registry.get_max_tokens_per_item_by_modality( - self.model_config).values()) - -- max_num_mm_items = min( -+ max_num_mm_items_encoder_budget = min( - self.max_num_encoder_input_tokens, - self.encoder_cache_size) // max_tokens_per_mm_item - -+ max_mm_items_per_req = max( -+ self.mm_registry.get_mm_limits_per_prompt( -+ self.model_config).values()) -+ -+ # NOTE: We do not consider max_num_batched_tokens on purpose -+ # because the multimodal embeddings can be generated in advance -+ # and chunked prefilled. -+ max_num_mm_items_decoder_budget = self.max_num_reqs * \ -+ max_mm_items_per_req -+ -+ max_num_mm_items = min(max_num_mm_items_encoder_budget, -+ max_num_mm_items_decoder_budget) -+ - # Dummy data definition in V0 may contain multiple multimodal items - # (e.g, multiple images) for a single request, therefore here we - # always replicate first item by max_num_mm_items times since in V1 -@@ -728,7 +835,7 @@ class GPUModelRunner: - # Trigger CUDA graph capture for specific shapes. - # Capture the large shapes first so that the smaller shapes - # can reuse the memory pool allocated for the large shapes. -- with graph_capture(): -+ with graph_capture(device=self.device): - for num_tokens in reversed(self.cudagraph_batch_sizes): - for _ in range(self.vllm_config.compilation_config. 
- cudagraph_num_of_warmups): -diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py -index 0000b09bf..af438f7d5 100644 ---- a/vllm/v1/worker/gpu_worker.py -+++ b/vllm/v1/worker/gpu_worker.py -@@ -48,6 +48,7 @@ class Worker: - self.prompt_adapter_config = vllm_config.prompt_adapter_config - self.observability_config = vllm_config.observability_config - -+ self.parallel_config.rank = rank - self.local_rank = local_rank - self.rank = rank - self.distributed_init_method = distributed_init_method + time_before_load = time.perf_counter() diff --git a/vllm/v1/worker/xpu_model_runner.py b/vllm/v1/worker/xpu_model_runner.py new file mode 100644 -index 000000000..182d673df +index 000000000..8612d3d77 --- /dev/null +++ b/vllm/v1/worker/xpu_model_runner.py -@@ -0,0 +1,321 @@ -+from typing import TYPE_CHECKING, Dict, List +@@ -0,0 +1,370 @@ ++# SPDX-License-Identifier: Apache-2.0 ++import gc ++from typing import TYPE_CHECKING + +import numpy as np +import torch + +from vllm.config import CompilationLevel, VllmConfig +from vllm.inputs import INPUT_REGISTRY ++from vllm.logger import init_logger +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, cdiv, + is_pin_memory_available) -+from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata -+from vllm.v1.attention.backends.ipex_attn import IPEXAttentionBackend -+from vllm.v1.engine.mm_input_mapper import MMHasher, MMInputMapperClient ++from vllm.v1.attention.backends.ipex_attn import (IPEXAttentionBackend, ++ IPEXAttentionMetadata) ++from vllm.v1.core.encoder_cache_manager import compute_encoder_budget ++from vllm.v1.kv_cache_interface import (AttentionSpec, FullAttentionSpec, ++ KVCacheConfig, KVCacheSpec, ++ SlidingWindowSpec) ++from vllm.v1.utils import bind_kv_cache +from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch +from vllm.v1.worker.gpu_model_runner import GPUModelRunner + +if TYPE_CHECKING: + from vllm.v1.core.scheduler import SchedulerOutput + ++logger = init_logger(__name__) ++ + +class XPUModelRunner(GPUModelRunner): + """A model runner for XPU devices.""" @@ -39283,27 +15357,26 @@ index 000000000..182d673df + # Multi-modal data support + self.input_registry = INPUT_REGISTRY + self.mm_registry = MULTIMODAL_REGISTRY ++ # FIXME: support mrope ++ self.uses_mrope = False + -+ # NOTE: mm_input_mapper_client and mm_hasher are only used for memory -+ # profiling. -+ self.mm_input_mapper_client = MMInputMapperClient(self.model_config) -+ self.mm_hasher = MMHasher() -+ self.use_hash = (not model_config.disable_mm_preprocessor_cache) or \ -+ cache_config.enable_prefix_caching -+ -+ self.max_num_encoder_input_tokens = self.scheduler_config.max_num_encoder_input_tokens # noqa: E501 -+ self.encoder_cache_size = self.scheduler_config.encoder_cache_size ++ encoder_compute_budget, encoder_cache_size = compute_encoder_budget( ++ model_config=model_config, ++ scheduler_config=scheduler_config, ++ mm_registry=self.mm_registry, ++ ) ++ self.max_num_encoder_input_tokens = encoder_compute_budget ++ self.encoder_cache_size = encoder_cache_size + + # Lazy initialization + # self.model: nn.Module # Set after load_model -+ self.kv_caches: List[torch.Tensor] = [] ++ self.kv_caches: list[torch.Tensor] = [] + # req_id -> (input_id -> encoder_output) -+ self.encoder_cache: Dict[str, Dict[int, torch.Tensor]] = {} -+ ++ self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {} ++ self.use_spec_decode = False + # Request states. 
-+ self.requests: Dict[str, CachedRequestState] = {} ++ self.requests: dict[str, CachedRequestState] = {} + # Persistent batch. -+ # It seems that each ModelRunner have a InputBatch + self.input_batch = InputBatch( + max_num_reqs=self.max_num_reqs, + max_model_len=self.max_model_len, @@ -39321,7 +15394,8 @@ index 000000000..182d673df + # self.cudagraph_batch_sizes sorts in ascending order. + # The batch sizes in the config are in descending order. + self.cudagraph_batch_sizes = list( -+ reversed(self.vllm_config.compilation_config.capture_sizes)) ++ reversed( ++ self.vllm_config.compilation_config.cudagraph_capture_sizes)) + + # Persistent buffers for CUDA graphs. + self.input_ids = torch.zeros(self.max_num_tokens, @@ -39361,30 +15435,30 @@ index 000000000..182d673df + dtype=torch.int32, + device="cpu", + pin_memory=self.pin_memory) -+ self.context_len_cpu = torch.zeros(self.max_num_reqs, dtype=torch.int32, device="cpu", pin_memory=self.pin_memory) + self.query_start_loc_np = self.query_start_loc_cpu.numpy() + self.seq_start_loc_cpu = torch.zeros(self.max_num_reqs + 1, + dtype=torch.int32, + device="cpu", + pin_memory=self.pin_memory) + self.seq_start_loc_np = self.seq_start_loc_cpu.numpy() ++ self.seq_lens_cpu = torch.zeros(self.max_num_reqs, ++ dtype=torch.int32, ++ device="cpu", ++ pin_memory=self.pin_memory) ++ self.seq_lens_np = self.seq_lens_cpu.numpy() + + def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): -+ # Total scheduled tokens + total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + assert total_num_scheduled_tokens > 0 -+ # Num reqs + num_reqs = self.input_batch.num_reqs + assert num_reqs > 0 + -+ # It seems that the block table is stored in CPU at first... + # OPTIMIZATION: Start copying the block table first. + # This way, we can overlap the copy with the following CPU operations. + self.input_batch.block_table.commit(num_reqs) + + # Get the number of scheduled tokens for each request. + # TODO: The Python loop can be slow. Optimize. -+ # Scheduled tokens are stored in this, max is stored in max + num_scheduled_tokens = [] + max_num_scheduled_tokens = 0 + for req_id in self.input_batch.req_ids[:num_reqs]: @@ -39449,14 +15523,17 @@ index 000000000..182d673df + np.cumsum(num_scheduled_tokens, + out=self.query_start_loc_np[1:num_reqs + 1]) + -+ # Here we can get seq_lens + seq_lens = (self.input_batch.num_computed_tokens_cpu[:num_reqs] + + num_scheduled_tokens) -+ context_lens = self.input_batch.num_computed_tokens_cpu[:num_reqs] + max_seq_len = seq_lens.max() + self.seq_start_loc_np[0] = 0 + np.cumsum(seq_lens, out=self.seq_start_loc_np[1:num_reqs + 1]) + ++ self.seq_lens_np[:num_reqs] = ( ++ self.input_batch.num_computed_tokens_cpu[:num_reqs] + ++ num_scheduled_tokens) ++ # max_seq_len = self.seq_lens_np[:num_reqs].max() ++ + # Copy the tensors to the GPU. 
+ self.input_ids[:total_num_scheduled_tokens].copy_( + self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True) @@ -39466,6 +15543,8 @@ index 000000000..182d673df + self.device, non_blocking=True) + seq_start_loc = self.seq_start_loc_cpu[:num_reqs + 1].to( + self.device, non_blocking=True) ++ seq_lens = self.seq_lens_cpu[:num_reqs].to(self.device, ++ non_blocking=True) + slot_mapping = self.slot_mapping_cpu[:total_num_scheduled_tokens].to( + self.device, non_blocking=True).long() + @@ -39479,36 +15558,31 @@ index 000000000..182d673df + [0, total_num_scheduled_tokens], + dtype=torch.int32, + device=self.device) -+ cu_prefix_kv_lens = torch.tensor([0, common_prefix_len], -+ dtype=torch.int32, -+ device=self.device) -+ cu_suffix_kv_lens = ( -+ self.seq_start_loc_np[:num_reqs + 1] - -+ self.arange_np[:num_reqs + 1] * common_prefix_len) -+ cu_suffix_kv_lens = torch.from_numpy(cu_suffix_kv_lens).to( -+ self.device) ++ prefix_kv_lens = torch.tensor([common_prefix_len], ++ dtype=torch.int32, ++ device=self.device) ++ suffix_kv_lens = (self.seq_lens_np[:num_reqs] - common_prefix_len) ++ suffix_kv_lens = torch.from_numpy(suffix_kv_lens).to(self.device) + else: + cu_prefix_query_lens = None -+ cu_prefix_kv_lens = None -+ cu_suffix_kv_lens = None ++ prefix_kv_lens = None ++ suffix_kv_lens = None + -+ # TODO(gc): remove this context_lens and seq_lens -+ attn_metadata = FlashAttentionMetadata( ++ attn_metadata = IPEXAttentionMetadata( + num_actual_tokens=total_num_scheduled_tokens, + max_query_len=max_num_scheduled_tokens, + query_start_loc=query_start_loc, + max_seq_len=max_seq_len, + seq_start_loc=seq_start_loc, ++ seq_lens=torch.empty(0, dtype=torch.int32, device=self.device), + block_table=( + self.input_batch.block_table.get_device_tensor()[:num_reqs]), + slot_mapping=slot_mapping, + use_cascade=use_cascade, + common_prefix_len=common_prefix_len, + cu_prefix_query_lens=cu_prefix_query_lens, -+ cu_prefix_kv_lens=cu_prefix_kv_lens, -+ cu_suffix_kv_lens=cu_suffix_kv_lens, -+ context_lens=None, -+ seq_lens=None, ++ prefix_kv_lens=prefix_kv_lens, ++ suffix_kv_lens=suffix_kv_lens, + ) + # NOTE(woosuk): Due to chunked prefills, there can be at most 1 partial + # request in the batch. While we should not sample any token from this @@ -39516,42 +15590,81 @@ index 000000000..182d673df + # token from the partial request. + # TODO: Support prompt logprobs. + logits_indices = query_start_loc[1:] - 1 -+ return attn_metadata, logits_indices ++ spec_decode_metadata = None ++ return attn_metadata, logits_indices, spec_decode_metadata + -+ @torch.inference_mode() + def profile_run(self) -> None: -+ # self._dummy_run(self.model, self.max_num_tokens) ++ # Trigger compilation for general shape. ++ hidden_states = self._dummy_run(self.max_num_tokens) ++ logits = self.model.compute_logits(hidden_states, None) ++ logits = logits[:self.max_num_tokens] + torch.xpu.synchronize() ++ gc.collect() + -+ def initialize_kv_cache(self, num_blocks: int) -> None: -+ assert len(self.kv_caches) == 0 -+ kv_cache_shape = IPEXAttentionBackend.get_kv_cache_shape( -+ num_blocks, self.block_size, self.num_kv_heads, self.head_size) -+ for _ in range(self.num_attn_layers): -+ self.kv_caches.append( -+ torch.zeros(kv_cache_shape, -+ dtype=self.kv_cache_dtype, -+ device=self.device)) ++ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: ++ """ ++ Initialize KV cache based on `kv_cache_config`. 
++ Args: ++ kv_cache_config: Configuration for the KV cache, including the KV ++ cache size of each layer ++ """ ++ if len(kv_cache_config.kv_cache_groups) > 1: ++ raise NotImplementedError( ++ "Hybrid models with more than one KV cache type are not " ++ "supported yet.") ++ ++ kv_caches: dict[str, torch.Tensor] = {} ++ ++ for kv_cache_group in kv_cache_config.kv_cache_groups: ++ kv_cache_spec = kv_cache_group.kv_cache_spec ++ for layer_name in kv_cache_group.layer_names: ++ tensor_config = kv_cache_config.tensors[layer_name] ++ assert tensor_config.size % kv_cache_spec.page_size_bytes == 0 ++ num_blocks = tensor_config.size // kv_cache_spec.page_size_bytes ++ # `num_blocks` is the number of blocks the model runner can use. ++ # `kv_cache_config.num_blocks` is the number of blocks that ++ # KVCacheManager may allocate. ++ # Since different GPUs may have different number of layers and ++ # different memory capacities, `num_blocks` can be different on ++ # different GPUs, and `kv_cache_config.num_blocks` is set to ++ # the min of all `num_blocks`. Verify it here. ++ assert num_blocks >= kv_cache_config.num_blocks ++ if isinstance(kv_cache_spec, AttentionSpec): ++ kv_cache_shape = IPEXAttentionBackend.get_kv_cache_shape( ++ num_blocks, kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, ++ kv_cache_spec.head_size) ++ dtype = kv_cache_spec.dtype ++ kv_caches[layer_name] = torch.zeros(kv_cache_shape, ++ dtype=dtype, ++ device=self.device) ++ else: ++ # TODO: add new branches when introducing more types of ++ # KV cache specs. ++ raise ValueError("Unknown KV cache spec type.") ++ ++ bind_kv_cache( ++ kv_caches, ++ self.vllm_config.compilation_config.static_forward_context, ++ self.kv_caches) diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py new file mode 100644 -index 000000000..8f6fdb11e +index 000000000..1bc531e28 --- /dev/null +++ b/vllm/v1/worker/xpu_worker.py -@@ -0,0 +1,149 @@ -+import gc +@@ -0,0 +1,168 @@ ++# SPDX-License-Identifier: Apache-2.0 +import os -+from typing import Optional, Tuple ++from typing import Optional + -+import oneccl_bindings_for_pytorch # noqa: F401 +import torch +import torch.distributed + -+from vllm.config import ParallelConfig ++from vllm.config import ParallelConfig, VllmConfig +from vllm.distributed import (ensure_model_parallel_initialized, + init_distributed_environment) +from vllm.model_executor import set_random_seed +from vllm.platforms import current_platform -+from vllm.v1.worker.gpu_worker import Worker, _get_cache_block_size ++from vllm.v1.worker.gpu_worker import Worker +from vllm.v1.worker.xpu_model_runner import XPUModelRunner + + @@ -39560,30 +15673,44 @@ index 000000000..8f6fdb11e + + def __init__( + self, -+ vllm_config, -+ local_rank, -+ rank, -+ distributed_init_method, ++ vllm_config: VllmConfig, ++ local_rank: int, ++ rank: int, ++ distributed_init_method: str, ++ is_driver_worker: bool = False, + ): -+ super().__init__( -+ vllm_config, -+ local_rank, -+ rank, -+ distributed_init_method, -+ ) ++ super().__init__(vllm_config, local_rank, rank, ++ distributed_init_method, is_driver_worker) + device_config = self.device_config + assert device_config.device_type == "xpu" + assert current_platform.is_xpu() + ++ def load_model(self) -> None: ++ self.model_runner.load_model() ++ ++ # we provide this function due to `torch.xpu.mem_get_info()` doesn't ++ # return correct free_gpu_memory on intel client GPU. We need to ++ # calculate/estiamte it. 
++ def xpu_get_mem_info(self): ++ if current_platform.is_data_center_gpu(): ++ return torch.xpu.mem_get_info() ++ else: ++ _, total_gpu_memory = torch.xpu.mem_get_info() ++ # FIXME: memory_allocated() doesn't count non-torch allocations, ++ # and we don't have any API to get it. so we mark it as 128MB. ++ used_memory = torch.xpu.memory_allocated() ++ non_torch_allocations = 128 * 1024 * 1024 ++ free_gpu_memory = total_gpu_memory - (used_memory + ++ non_torch_allocations) ++ return free_gpu_memory, total_gpu_memory ++ + @torch.inference_mode() -+ def determine_num_available_blocks(self) -> Tuple[int, int]: ++ def determine_available_memory(self) -> int: + """Profiles the peak memory usage of the model to determine how many + KV blocks may be allocated without OOMs. -+ + The engine will first conduct a profiling of the existing memory usage. + Then, it calculate the maximum possible number of GPU and CPU blocks + that can be allocated with the remaining free memory. -+ + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. @@ -39591,43 +15718,49 @@ index 000000000..8f6fdb11e + # Profile the memory usage of the model and get the maximum number of + # cache blocks that can be allocated with the remaining free memory. + torch.xpu.empty_cache() ++ torch.xpu.reset_peak_memory_stats() + ++ # Note(Xiangyu): torch.xpu.mem_get_info() is missing with torch 2.6 ++ # _, total_gpu_memory = torch.xpu.mem_get_info() + # Execute a forward pass with dummy inputs to profile the memory usage + # of the model. + self.model_runner.profile_run() + -+ # Calculate the number of blocks that can be allocated with the -+ # profiled peak memory. + torch.xpu.synchronize() + used_memory = torch.xpu.memory_allocated() + total_gpu_memory = torch.xpu.get_device_properties( + self.local_rank).total_memory + free_gpu_memory = total_gpu_memory - used_memory + ++ # free_gpu_memory, _ = self.xpu_get_mem_info() + # NOTE(woosuk): Here we assume that the other processes using the same + # GPU did not change their memory usage during the profiling. + peak_memory = self.init_gpu_memory - free_gpu_memory -+ assert peak_memory > 0, ( ++ assert self.init_gpu_memory > free_gpu_memory, ( + "Error in memory profiling. " + f"Initial free memory {self.init_gpu_memory}, current free memory" + f" {free_gpu_memory}. 
This happens when the GPU memory was " + "not properly cleaned up before initializing the vLLM instance.") + -+ cache_block_size = _get_cache_block_size(self.cache_config, -+ self.model_config, -+ self.parallel_config) -+ num_gpu_blocks = int( -+ (total_gpu_memory * self.cache_config.gpu_memory_utilization - -+ peak_memory) // cache_block_size) -+ num_cpu_blocks = int(self.cache_config.swap_space_bytes // -+ cache_block_size) -+ num_gpu_blocks = max(num_gpu_blocks, 0) -+ num_cpu_blocks = max(num_cpu_blocks, 0) -+ gc.collect() ++ # # Get the peak memory allocation recorded by torch ++ # peak_memory = torch.xpu.memory_stats()["allocated_bytes.all.peak"] ++ + torch.xpu.empty_cache() -+ return num_gpu_blocks, num_cpu_blocks ++ # torch_allocated_bytes = torch.xpu.memory_stats( ++ # )["allocated_bytes.all.current"] ++ # total_allocated_bytes = self.xpu_get_mem_info( ++ # )[1] - self.xpu_get_mem_info()[0] + -+ def initialize(self): ++ # non_torch_allocations = total_allocated_bytes - torch_allocated_bytes ++ # if non_torch_allocations > 0: ++ # peak_memory += non_torch_allocations ++ available_kv_cache_memory = ( ++ total_gpu_memory * self.cache_config.gpu_memory_utilization - ++ peak_memory) - 128 * 1024 * 1024 ++ ++ return int(available_kv_cache_memory) ++ ++ def init_device(self): + if self.device_config.device.type == "xpu" and current_platform.is_xpu( + ): + self.device = torch.device(f"xpu:{self.local_rank}") @@ -39668,11 +15801,11 @@ index 000000000..8f6fdb11e + "is not already initialized") + else: + # oneapi 2025 will use pidfd as default -+ # ENV_CCL_ZE_IPC_EXCHANGE = os.getenv("CCL_ZE_IPC_EXCHANGE", "drmfd") ++ ENV_CCL_ZE_IPC_EXCHANGE = os.getenv("CCL_ZE_IPC_EXCHANGE", "drmfd") + ENV_CCL_ATL_TRANSPORT = os.getenv("CCL_ATL_TRANSPORT", "ofi") + ENV_LOCAL_WORLD_SIZE = os.getenv("LOCAL_WORLD_SIZE", + str(parallel_config.world_size)) -+ # os.environ["CCL_ZE_IPC_EXCHANGE"] = ENV_CCL_ZE_IPC_EXCHANGE ++ os.environ["CCL_ZE_IPC_EXCHANGE"] = ENV_CCL_ZE_IPC_EXCHANGE + os.environ["CCL_ATL_TRANSPORT"] = ENV_CCL_ATL_TRANSPORT + os.environ["LOCAL_WORLD_SIZE"] = ENV_LOCAL_WORLD_SIZE + os.environ["LOCAL_RANK"] = str(local_rank) @@ -39686,34 +15819,12 @@ index 000000000..8f6fdb11e + ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, + parallel_config.pipeline_parallel_size) + # global all_reduce needed for overall oneccl warm up -+ torch.distributed.all_reduce(torch.zeros(1).xpu()) -diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py -index bff01320d..4d5d91808 100644 ---- a/vllm/worker/enc_dec_model_runner.py -+++ b/vllm/worker/enc_dec_model_runner.py -@@ -287,12 +287,11 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]): - seq_len, - self.mm_registry, - is_encoder_data=False) -- encoder_dummy_data \ -- = self.input_registry.dummy_data_for_profiling( -- self.model_config, -- seq_len, -- self.mm_registry, -- is_encoder_data=True) -+ encoder_dummy_data = self.input_registry \ -+ .dummy_data_for_profiling(self.model_config, -+ seq_len, -+ self.mm_registry, -+ is_encoder_data=True) - - # Having more tokens is over-conservative but otherwise fine - assert len( ++ # torch.distributed.all_reduce(torch.zeros(1).xpu()) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py -index 2b545d1b2..369940b6d 100644 +index 86e6d9752..ad80bf54e 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py -@@ -69,9 +69,9 @@ _NUM_WARMUP_ITERS = 2 +@@ -70,9 +70,9 @@ _NUM_WARMUP_ITERS = 2 TModelInputForGPU = 
TypeVar('TModelInputForGPU', bound="ModelInputForGPU") @@ -39726,177 +15837,11 @@ index 2b545d1b2..369940b6d 100644 @dataclass(frozen=True) -@@ -1136,7 +1136,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): - self.prompt_adapter_manager.create_prompt_adapter_manager( - self.model)) - -- if self.kv_cache_dtype == "fp8" and current_platform.is_rocm(): -+ if self.kv_cache_dtype == "fp8" and (current_platform.is_rocm() -+ or current_platform.is_cuda()): - # Currently only ROCm accepts kv-cache scaling factors - # via quantization_param_path and this will be deprecated - # in the future. -@@ -1425,10 +1426,15 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): - - # Prepare dummy inputs. These will be reused for all batch sizes. - max_batch_size = self.max_batchsize_to_capture -- input_tokens = torch.zeros(max_batch_size, dtype=torch.long).cuda() -- input_positions = torch.zeros(max_batch_size, dtype=torch.long).cuda() -+ input_tokens = torch.zeros(max_batch_size, -+ dtype=torch.long, -+ device=self.device) -+ input_positions = torch.zeros(max_batch_size, -+ dtype=torch.long, -+ device=self.device) - if self.model_config.uses_mrope: -- input_positions = torch.tile(input_positions, (3, 1)) -+ input_positions = torch.tile(input_positions, -+ (3, 1)).cuda(device=self.device) - # Prepare dummy previous_hidden_states only if needed by the model. - # This is used by draft models such as EAGLE. - previous_hidden_states = None -@@ -1447,8 +1453,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): - dtype=self.model_config.dtype, - device=self.device) - -- with self.attn_state.graph_capture( -- max_batch_size), graph_capture() as graph_capture_context: -+ with self.attn_state.graph_capture(max_batch_size), graph_capture( -+ self.device) as graph_capture_context: - # NOTE: Capturing the largest batch size first may help reduce the - # memory usage of CUDA graph. - for virtual_engine in range( -@@ -1548,10 +1554,12 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): - """ - # During the decode phase encoder_input_ids and encoder_positions are - # unset. Do the same thing for graph capture. 
-- capture_inputs["encoder_input_ids"] = torch.tensor( -- [], dtype=torch.long).cuda() -- capture_inputs["encoder_positions"] = torch.tensor( -- [], dtype=torch.long).cuda() -+ capture_inputs["encoder_input_ids"] = torch.tensor([], -+ dtype=torch.long, -+ device=self.device) -+ capture_inputs["encoder_positions"] = torch.tensor([], -+ dtype=torch.long, -+ device=self.device) - - @property - def vocab_size(self) -> int: -diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py -index cd4770202..c7abad7e0 100644 ---- a/vllm/worker/model_runner_base.py -+++ b/vllm/worker/model_runner_base.py -@@ -12,7 +12,6 @@ from torch import is_tensor - from vllm.config import VllmConfig - from vllm.logger import init_logger - from vllm.model_executor.layers.sampler import SamplerOutput --from vllm.platforms import current_platform - from vllm.sequence import IntermediateTensors, SequenceGroupMetadata - - if TYPE_CHECKING: -@@ -265,13 +264,13 @@ class ModelRunnerBase(ABC, Generic[T]): - """ - raise NotImplementedError - -- @current_platform.inference_mode() - def execute_model( - self, - model_input: T, - kv_caches: Optional[List[torch.Tensor]], -- intermediate_tensors: Optional[IntermediateTensors], -+ intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, -+ **kwargs, - ) -> Optional[List[SamplerOutput]]: - """ - Execute the model on the given input. -diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py -index 65d9bab0e..dee63a75c 100644 ---- a/vllm/worker/multi_step_model_runner.py -+++ b/vllm/worker/multi_step_model_runner.py -@@ -544,6 +544,7 @@ class MultiStepModelRunner(GPUModelRunnerBase[StatefulModelInput]): - model_input.record_step_event(current_stream) - - if get_pp_group().is_last_rank and self.is_driver_worker: -+ assert isinstance(output, list) - assert len( - output - ) == 1, "MultiStepModelRunner requires single-step base_models" -diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py -index 9a054eb8a..7bdb7f0e2 100644 ---- a/vllm/worker/tpu_model_runner.py -+++ b/vllm/worker/tpu_model_runner.py -@@ -126,8 +126,10 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]): - logger.warning( - "The max_model_len (%d) is too large. This may degrade the " - "performance due to the insufficient smem size. 
Consider " -- "setting --max-model-len to a smaller value.", -- self.model_config.max_model_len) -+ "setting --max-model-len to a smaller value, like %d.", -+ self.model_config.max_model_len, -+ self.model_config.max_model_len / -+ (block_table_size / smem_size)) - - def load_model(self) -> None: - self.device = self.device_config.device -diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py -index 3ac7fb8df..249b3ed2d 100644 ---- a/vllm/worker/worker_base.py -+++ b/vllm/worker/worker_base.py -@@ -11,7 +11,6 @@ from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group - from vllm.logger import init_logger - from vllm.lora.request import LoRARequest - from vllm.model_executor.layers.sampler import SamplerOutput --from vllm.platforms import current_platform - from vllm.sequence import ExecuteModelRequest, IntermediateTensors - from vllm.utils import (enable_trace_function_call_for_thread, - resolve_obj_by_qualname, update_environment_variables) -@@ -44,6 +43,8 @@ class WorkerBase(ABC): - self.prompt_adapter_config = vllm_config.prompt_adapter_config - self.observability_config = vllm_config.observability_config - self.kv_transfer_config = vllm_config.kv_transfer_config -+ from vllm.platforms import current_platform -+ self.current_platform = current_platform - - @abstractmethod - def init_device(self) -> None: -@@ -74,17 +75,17 @@ class WorkerBase(ABC): - """ - raise NotImplementedError - -- @current_platform.inference_mode() - def start_worker_execution_loop(self) -> None: - """Execute model loop in parallel worker. - - You can stop the loop by executing a driver worker with an empty output. - See `stop_remote_worker_execution_loop` for more details. - """ -- while True: -- output = self.execute_model(execute_model_req=None) -- if output is None: -- return None -+ with self.current_platform.inference_mode(): -+ while True: -+ output = self.execute_model(execute_model_req=None) -+ if output is None: -+ return None - - @abstractmethod - def execute_model( -@@ -352,6 +353,7 @@ class LocalOrDistributedWorkerBase(WorkerBase): - model_execute_time = time.perf_counter() - start_time - if not get_pp_group().is_last_rank: - # output is IntermediateTensors -+ assert isinstance(output, IntermediateTensors) - if (self.observability_config is not None - and self.observability_config.collect_model_execute_time): - output.tensors["model_execute_time"] = torch.tensor( diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py -index 9cf253875..34d098486 100644 +index 9d49b4385..67f07f5b1 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py -@@ -3,8 +3,8 @@ import time +@@ -5,8 +5,8 @@ import time import weakref from collections import defaultdict from dataclasses import dataclass @@ -39907,8 +15852,8 @@ index 9cf253875..34d098486 100644 import torch import torch.nn as nn -@@ -14,9 +14,15 @@ from vllm.config import VllmConfig - from vllm.distributed import get_pp_group +@@ -17,9 +17,15 @@ from vllm.distributed import get_pp_group + from vllm.forward_context import set_forward_context from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger +from vllm.lora.layers import LoRAMapping @@ -39923,9 +15868,9 @@ index 9cf253875..34d098486 100644 from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, MultiModalKwargs, MultiModalPlaceholderMap, MultiModalRegistry) -@@ -24,19 +30,25 @@ from vllm.sampling_params import SamplingParams +@@ -27,19 +33,25 @@ from vllm.sampling_params import 
SamplingParams from vllm.sequence import IntermediateTensors, SequenceGroupMetadata - from vllm.utils import DeviceMemoryProfiler, make_tensor_with_pad + from vllm.utils import DeviceMemoryProfiler, GiB_bytes, make_tensor_with_pad from vllm.worker.model_runner import AttentionMetadata, SamplingMetadata +from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.worker.model_runner_base import ( @@ -39950,7 +15895,7 @@ index 9cf253875..34d098486 100644 TModelInputForXPU = TypeVar('TModelInputForXPU', bound="ModelInputForXPU") -@@ -48,6 +60,8 @@ class ModelInputForXPU(ModelRunnerInputBase): +@@ -51,6 +63,8 @@ class ModelInputForXPU(ModelRunnerInputBase): """ input_tokens: Optional[torch.Tensor] = None input_positions: Optional[torch.Tensor] = None @@ -39959,7 +15904,7 @@ index 9cf253875..34d098486 100644 attn_metadata: Optional["AttentionMetadata"] = None multi_modal_kwargs: Optional[BatchedTensorInputs] = None virtual_engine: Optional[int] = None -@@ -59,6 +73,9 @@ class ModelInputForXPU(ModelRunnerInputBase): +@@ -62,6 +76,9 @@ class ModelInputForXPU(ModelRunnerInputBase): tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, @@ -39969,7 +15914,7 @@ index 9cf253875..34d098486 100644 } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) -@@ -87,6 +104,9 @@ class ModelInputForXPUWithSamplingMetadata(ModelInputForXPU): +@@ -90,6 +107,9 @@ class ModelInputForXPUWithSamplingMetadata(ModelInputForXPU): tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, @@ -39979,7 +15924,7 @@ index 9cf253875..34d098486 100644 } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) _add_sampling_metadata_broadcastable_dict(tensor_dict, -@@ -109,7 +129,7 @@ class ModelInputForXPUWithSamplingMetadata(ModelInputForXPU): +@@ -112,7 +132,7 @@ class ModelInputForXPUWithSamplingMetadata(ModelInputForXPU): class ModelInputForXPUBuilder(ModelRunnerInputBuilderBase[ModelInputForXPU]): def __init__(self, @@ -39987,8 +15932,8 @@ index 9cf253875..34d098486 100644 + runner: "XPUModelRunnerBase", finished_requests_ids: Optional[List[str]] = None) -> None: super().__init__() - self.seq_group_metadata_list: List[SequenceGroupMetadata] = [] -@@ -119,38 +139,284 @@ class ModelInputForXPUBuilder(ModelRunnerInputBuilderBase[ModelInputForXPU]): + self.runner = runner +@@ -121,6 +141,10 @@ class ModelInputForXPUBuilder(ModelRunnerInputBuilderBase[ModelInputForXPU]): self.sliding_window = self.runner.sliding_window self.block_size = self.runner.block_size self.device = self.runner.device @@ -39997,7 +15942,9 @@ index 9cf253875..34d098486 100644 + # Multi-modal data support + self.multi_modal_input_mapper = self.runner.multi_modal_input_mapper - def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): + def prepare(self, + finished_requests_ids: Optional[List[str]] = None) -> None: +@@ -130,33 +154,275 @@ class ModelInputForXPUBuilder(ModelRunnerInputBuilderBase[ModelInputForXPU]): self.seq_group_metadata_list.append(seq_group_metadata) def build(self) -> ModelInputForXPU: @@ -40103,7 +16050,7 @@ index 9cf253875..34d098486 100644 + input_positions.append(position) + if is_prompt: + mm_data = seq_group_metadata.multi_modal_data -+ if mm_data and not self.runner.model_is_mrope: ++ if mm_data and not self.runner.model_is_mrope and not self.runner.mm_registry.has_processor(self.runner.model_config): + mm_kwargs = self.multi_modal_input_mapper(mm_data) + else: + mm_kwargs = mm_data @@ -40116,19 
+16063,18 @@ index 9cf253875..34d098486 100644 + "mrope embedding type requires multi-modal input mapper " + "returns 'image_grid_thw' or 'video_grid_thw'.") + ++ second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) + hf_config = self.runner.model_config.hf_config + token_ids = seq_data.get_token_ids() + temp_mrope_input_positions, mrope_position_delta = \ + MRotaryEmbedding.get_input_positions( + token_ids, ++ hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, -+ image_token_id=hf_config.image_token_id, -+ video_token_id=hf_config.video_token_id, -+ vision_start_token_id=hf_config.vision_start_token_id, -+ vision_end_token_id=hf_config.vision_end_token_id, -+ spatial_merge_size=hf_config.vision_config.spatial_merge_size, -+ context_len=seq_data.get_num_computed_tokens(), ++ second_per_grid_ts=second_per_grid_ts, ++ seq_len=seq_len, ++ context_len=context_len, + ) + seq_data.mrope_position_delta = mrope_position_delta + if mrope_input_positions is None: @@ -40253,6 +16199,7 @@ index 9cf253875..34d098486 100644 + seq_lens=seq_lens, # 3 + seqlen_q=torch.tensor([]), # 4 + multi_modal_placeholder_index_maps=None, ++ enable_kv_scales_calculation=False, + # max_seqlen=max_seqlen, # 5 + max_seqlen=max(query_lens), + seq_lens_tensor=seq_lens_tensor, # 9 @@ -40288,7 +16235,7 @@ index 9cf253875..34d098486 100644 assert len(seq_group_metadata_list) > 0 input_tokens: List[int] = [] input_positions: List[int] = [] -@@ -160,6 +426,9 @@ class ModelInputForXPUBuilder(ModelRunnerInputBuilderBase[ModelInputForXPU]): +@@ -166,6 +432,9 @@ class ModelInputForXPUBuilder(ModelRunnerInputBuilderBase[ModelInputForXPU]): multi_modal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) @@ -40298,7 +16245,7 @@ index 9cf253875..34d098486 100644 for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt -@@ -178,29 +447,55 @@ class ModelInputForXPUBuilder(ModelRunnerInputBuilderBase[ModelInputForXPU]): +@@ -184,29 +453,55 @@ class ModelInputForXPUBuilder(ModelRunnerInputBuilderBase[ModelInputForXPU]): # Token position ids # NOTE(woosuk): Here we assume that the first token in the prompt # is always the first token in the sequence. 
@@ -40376,7 +16323,7 @@ index 9cf253875..34d098486 100644 if seq_group_metadata.block_tables is None: # During memory profiling, the block tables are not initialized -@@ -269,26 +564,34 @@ class ModelInputForXPUBuilder(ModelRunnerInputBuilderBase[ModelInputForXPU]): +@@ -276,26 +571,34 @@ class ModelInputForXPUBuilder(ModelRunnerInputBuilderBase[ModelInputForXPU]): ) multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) @@ -40414,7 +16361,7 @@ index 9cf253875..34d098486 100644 for seq_id in seq_ids: seq_data = seq_group_metadata.seq_data[seq_id] -@@ -308,6 +611,8 @@ class ModelInputForXPUBuilder(ModelRunnerInputBuilderBase[ModelInputForXPU]): +@@ -315,6 +618,8 @@ class ModelInputForXPUBuilder(ModelRunnerInputBuilderBase[ModelInputForXPU]): block_offset = position % self.block_size slot = block_number * self.block_size + block_offset slot_mapping.append(slot) @@ -40423,7 +16370,7 @@ index 9cf253875..34d098486 100644 if self.sliding_window is not None: sliding_window_blocks = (self.sliding_window // -@@ -351,17 +656,14 @@ class ModelInputForXPUBuilder(ModelRunnerInputBuilderBase[ModelInputForXPU]): +@@ -359,17 +664,14 @@ class ModelInputForXPUBuilder(ModelRunnerInputBuilderBase[ModelInputForXPU]): num_prefills=0, block_tables=block_tables, ) @@ -40447,39 +16394,20 @@ index 9cf253875..34d098486 100644 def __init__( self, -@@ -402,6 +704,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): +@@ -410,6 +712,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): # Lazy initialization. self.model: nn.Module # Set after init_Model + # Set after load_model. + self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None ++ ++ from vllm.model_executor.models.utils import set_cpu_offload_max_bytes ++ set_cpu_offload_max_bytes( ++ int(self.cache_config.cpu_offload_gb * 1024**3)) self.sampling_metadata_cache: SamplingMetadataCache = \ SamplingMetadataCache() \ -@@ -415,16 +719,74 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): - logger.info("Loading model weights took %.4f GB", - self.model_memory_usage / float(2**30)) - -+ if self.lora_config: -+ assert supports_lora(self.model), "Model does not support LoRA" -+ assert not supports_multimodal( -+ self.model -+ ), "To be tested: Multi-modal model with LoRA settings." -+ -+ self.lora_manager = LRUCacheWorkerLoRAManager( -+ self.scheduler_config.max_num_seqs, -+ self.scheduler_config.max_num_batched_tokens, -+ self.vocab_size, -+ self.lora_config, -+ self.device, -+ self.model.embedding_modules, -+ self.model.embedding_padding_modules, -+ max_position_embeddings=self.model.config. -+ max_position_embeddings, -+ ) -+ self.model = self.lora_manager.create_lora_manager(self.model) -+ - @property +@@ -432,6 +740,15 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): def vocab_size(self) -> int: return self.model_config.get_vocab_size() @@ -40493,19 +16421,12 @@ index 9cf253875..34d098486 100644 + return rope_scaling.get("type", None) == "mrope" or rope_scaling.get("mrope_section", None) is not None + @torch.inference_mode() -- def profile_run(self) -> None: -+ def profile_run(self, num_batched_tokens=-1, num_seqs=-1) -> None: + def profile_run(self) -> None: # Enable top-k sampling to reflect the accurate memory usage. 
- sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) +@@ -439,6 +756,30 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens -+ assert (num_batched_tokens == -1 or num_batched_tokens > 0) -+ assert (num_seqs == -1 or num_seqs > 0) max_num_seqs = self.scheduler_config.max_num_seqs -+ if num_batched_tokens != -1: -+ max_num_batched_tokens = num_batched_tokens -+ if num_seqs != -1: -+ max_num_seqs = num_seqs -+ + + # This represents the maximum number of different requests + # that will have unique loras, an therefore the max amount of memory + # consumption create dummy lora request copies from the lora request @@ -40529,10 +16450,38 @@ index 9cf253875..34d098486 100644 + dummy_lora_requests[idx % len(dummy_lora_requests)] + for idx in range(max_num_seqs) + ] - ++ # Profile memory usage with max_num_sequences sequences and the total # number of tokens equal to max_num_batched_tokens. -@@ -466,7 +828,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): + seqs: List[SequenceGroupMetadata] = [] +@@ -448,6 +789,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): + # To exercise the worst scenario for GPU memory consumption, + # the number of seqs (batch_size) is chosen to maximize the number + # of images processed. ++ ''' + max_mm_tokens = self.mm_registry.get_max_multimodal_tokens( + self.model_config) + if max_mm_tokens > 0: +@@ -461,8 +803,18 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): + "Computed max_num_seqs (%s) to be less than 1. " + "Setting it to the minimum value of 1.", expr) + max_num_seqs = 1 ++ ''' + + batch_size = 0 ++ import os ++ self_max_num_batched_tokens = os.getenv("IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS", None) ++ if self_max_num_batched_tokens is not None: ++ max_num_batched_tokens = int(self_max_num_batched_tokens) ++ self_max_num_seqs = os.getenv("IPEX_LLM_SELF_MAX_NUM_SEQS", None) ++ if self_max_num_seqs is not None: ++ max_num_seqs = int(self_max_num_seqs) ++ else: ++ max_num_seqs = 1 + for group_id in range(max_num_seqs): + seq_len = (max_num_batched_tokens // max_num_seqs + + (group_id < max_num_batched_tokens % max_num_seqs)) +@@ -479,11 +831,14 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): seq_data={group_id: dummy_data.seq_data}, sampling_params=sampling_params, block_tables=None, @@ -40542,18 +16491,18 @@ index 9cf253875..34d098486 100644 multi_modal_data=dummy_data.multi_modal_data, multi_modal_placeholders=dummy_data.multi_modal_placeholders) seqs.append(seq) -@@ -477,9 +840,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): - # it by reference, rather by specializing on the value ``None``. - # the `dtype` argument does not matter, and we use `float32` as - # a placeholder (it has wide hardware support). 
-- kv_caches = [ -- torch.tensor([], dtype=torch.float32, device=self.device) -- ] * num_layers + ++ num_layers = self.model_config.get_num_layers(self.parallel_config) + kv_caches = [None] * num_layers finished_requests_ids = [seq.request_id for seq in seqs] model_input = self.prepare_model_input( seqs, finished_requests_ids=finished_requests_ids) -@@ -493,21 +854,35 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): +@@ -493,25 +848,39 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): + batch_size=batch_size, + dtype=self.model_config.dtype, + device=self.device) +- self.execute_model(model_input, None, intermediate_tensors) ++ self.execute_model(model_input, kv_caches, intermediate_tensors) torch.xpu.synchronize() return @@ -40599,7 +16548,7 @@ index 9cf253875..34d098486 100644 """Helper method to prepare the model input based on a given sequence group. Prepares metadata needed for the base model forward pass but not metadata for possible additional steps, e.g., sampling. -@@ -519,6 +894,22 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): +@@ -524,6 +893,22 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): return builder.build() # type: ignore @@ -40622,7 +16571,7 @@ index 9cf253875..34d098486 100644 def prepare_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], -@@ -558,6 +949,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): +@@ -563,6 +948,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): raise ValueError( "XPUModelRunner does not support multi-step execution.") @@ -40635,7 +16584,7 @@ index 9cf253875..34d098486 100644 model_executable = self.model if (self.observability_config is not None and self.observability_config.collect_model_forward_time): -@@ -607,3 +1004,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): +@@ -612,3 +1003,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): output.model_forward_time = model_forward_time return [output] @@ -41539,105 +17488,205 @@ index 000000000..6ad951824 + assert worker_input is not None + return model_input, worker_input, kwargs \ No newline at end of file +diff --git a/vllm/worker/xpu_pooling_model_runner.py b/vllm/worker/xpu_pooling_model_runner.py +new file mode 100644 +index 000000000..550bf81e8 +--- /dev/null ++++ b/vllm/worker/xpu_pooling_model_runner.py +@@ -0,0 +1,135 @@ ++import dataclasses ++from typing import Any, Dict, List, Optional, Tuple, Type, Union ++ ++import torch ++ ++from vllm.forward_context import set_forward_context ++from vllm.model_executor.pooling_metadata import PoolingMetadata ++from vllm.multimodal import MultiModalKwargs ++from vllm.pooling_params import PoolingParams ++from vllm.sequence import (IntermediateTensors, PoolerOutput, SequenceData, ++ SequenceGroupMetadata) ++# from vllm.worker.cpu_model_runner import (CPUModelRunnerBase, ModelInputForCPU, ++ # ModelInputForCPUBuilder) ++from vllm.worker.xpu_model_runner import ModelInputForXPU, XPUModelRunnerBase, ModelInputForXPUBuilder ++ ++ ++@dataclasses.dataclass(frozen=True) ++class ModelInputForXPUWithPoolingMetadata(ModelInputForXPU): ++ """ ++ Used by the CPUPoolingModelRunner. 
++ """ ++ pooling_metadata: Optional["PoolingMetadata"] = None ++ ++ ++class XPUPoolingModelRunner( ++ XPUModelRunnerBase[ModelInputForXPUWithPoolingMetadata]): ++ _model_input_cls: Type[ModelInputForXPUWithPoolingMetadata] = ( ++ ModelInputForXPUWithPoolingMetadata) ++ _builder_cls: Type[ModelInputForXPUBuilder] = ModelInputForXPUBuilder ++ ++ @torch.inference_mode() ++ def execute_model( ++ self, ++ model_input: ModelInputForXPUWithPoolingMetadata, ++ kv_caches: List[torch.Tensor], ++ intermediate_tensors: Optional[IntermediateTensors] = None, ++ num_steps: int = 1, ++ ) -> Optional[Union[List[PoolerOutput], IntermediateTensors]]: ++ if num_steps > 1: ++ raise ValueError( ++ "Currently multi-step worker does not support multi-steps...") ++ ++ # num_layers = self.model_config.get_num_layers(self.parallel_config) ++ # use an empty tensor instead of `None`` to force Dynamo to pass ++ # it by reference, rather by specializing on the value ``None``. ++ # the `dtype` argument does not matter, and we use `float32` as ++ # a placeholder (it has wide hardware support). ++ # TODO: check if we need float16... ++ # kv_caches = [ ++ # torch.tensor([], dtype=torch.float32, device=self.device) ++ # for _ in range(num_layers) ++ # ] ++ ++ model_executable = self.model ++ cross_enc_kwargs = {} ++ # if model_input.token_type_ids is not None: ++ # cross_enc_kwargs["token_type_ids"] = model_input.token_type_ids ++ execute_model_kwargs = { ++ "input_ids": ++ model_input.input_tokens, ++ "positions": ++ model_input.input_positions, ++ # "kv_caches": ++ # kv_caches, ++ # "attn_metadata": ++ # model_input.attn_metadata, ++ **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, ++ device=self.device), ++ **cross_enc_kwargs, ++ "intermediate_tensors": ++ intermediate_tensors, ++ } ++ ++ with set_forward_context(model_input.attn_metadata, self.vllm_config): ++ hidden_states = model_executable(**execute_model_kwargs) ++ ++ # Only perform pooling in the driver worker. ++ if not self.is_driver_worker: ++ return [] ++ ++ return [ ++ self.model.pooler(hidden_states=hidden_states, ++ pooling_metadata=model_input.pooling_metadata) ++ ] ++ ++ def make_model_input_from_broadcasted_tensor_dict( ++ self, ++ tensor_dict: Dict[str, ++ Any]) -> ModelInputForXPUWithPoolingMetadata: ++ return ModelInputForXPUWithPoolingMetadata.from_broadcasted_tensor_dict( ++ tensor_dict, ++ attn_backend=self.attn_backend, ++ ) ++ ++ def prepare_model_input( ++ self, ++ seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], ++ virtual_engine: int = 0, ++ finished_requests_ids: Optional[List[str]] = None ++ ) -> ModelInputForXPUWithPoolingMetadata: ++ assert seq_group_metadata_list is not None ++ model_input = self._prepare_model_input_tensors( ++ seq_group_metadata_list, finished_requests_ids) ++ # Prepare PoolingMetadata. 
++ assert model_input.seq_lens is not None ++ pooling_metadata = self._prepare_pooling(seq_group_metadata_list, ++ model_input.seq_lens) ++ ++ return dataclasses.replace(model_input, ++ virtual_engine=virtual_engine, ++ pooling_metadata=pooling_metadata) ++ ++ def _prepare_pooling( ++ self, ++ seq_group_metadata_list: List[SequenceGroupMetadata], ++ prompt_lens: List[int], ++ ) -> PoolingMetadata: ++ """Prepare PoolingMetadata for the sequence group metadata list.""" ++ seq_groups: List[Tuple[List[int], PoolingParams]] = [] ++ for i, seq_group_metadata in enumerate(seq_group_metadata_list): ++ seq_ids = list(seq_group_metadata.seq_data.keys()) ++ pooling_params = seq_group_metadata.pooling_params ++ seq_groups.append((seq_ids, pooling_params)) ++ ++ seq_data: Dict[int, SequenceData] = {} ++ for seq_group_metadata in seq_group_metadata_list: ++ seq_data.update(seq_group_metadata.seq_data) ++ ++ pooling_metadata = PoolingMetadata( ++ seq_groups=seq_groups, ++ seq_data=seq_data, ++ prompt_lens=prompt_lens, ++ ) ++ ++ return pooling_metadata diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py -index 129566605..fb7962dfe 100644 +index 3aea0d741..060bb10ab 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py -@@ -3,7 +3,8 @@ import gc +@@ -2,9 +2,10 @@ + """A XPU worker class.""" + import gc import os - from typing import List, Optional, Tuple +-from typing import List, Optional, Tuple ++from typing import List, Optional, Tuple, Type --import intel_extension_for_pytorch # noqa: F401 -+# import intel_extension_for_pytorch # noqa: F401 + import intel_extension_for_pytorch # noqa: F401 +# TODO: handle case for oneccl_bindings for dual cards import oneccl_bindings_for_pytorch # noqa: F401 import torch import torch.distributed -@@ -16,13 +17,13 @@ from vllm.model_executor import set_random_seed - from vllm.platforms import current_platform +@@ -19,7 +20,8 @@ from vllm.platforms import current_platform from vllm.worker.cache_engine import CacheEngine from vllm.worker.worker import Worker --from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase -+from vllm.worker.worker_base import WorkerBase - from vllm.worker.xpu_model_runner import XPUModelRunner + from vllm.worker.worker_base import LoRANotSupportedWorkerBase, WorkerBase +-from vllm.worker.xpu_model_runner import XPUModelRunner ++from vllm.worker.xpu_model_runner import XPUModelRunner, XPUModelRunnerBase ++from vllm.worker.xpu_pooling_model_runner import XPUPoolingModelRunner logger = init_logger(__name__) +@@ -56,8 +58,12 @@ class XPUWorker(LoRANotSupportedWorkerBase, Worker): + if parallel_config and is_driver_worker: + assert rank % parallel_config.tensor_parallel_size == 0, \ + "Driver worker should be rank 0 of tensor parallel group." ++ ModelRunnerClass: Type[XPUModelRunnerBase] = XPUModelRunner ++ model_config = self.model_config ++ if model_config.task == "embed": ++ ModelRunnerClass = XPUPoolingModelRunner + +- self.model_runner = XPUModelRunner( # type: ignore ++ self.model_runner = ModelRunnerClass( # type: ignore + vllm_config=vllm_config, + kv_cache_dtype=self.cache_config.cache_dtype, + is_driver_worker=is_driver_worker, +@@ -65,7 +71,7 @@ class XPUWorker(LoRANotSupportedWorkerBase, Worker): + # Uninitialized cache engine. Will be initialized by + # initialize_cache. 
+ self.cache_engine: List[CacheEngine] +- self.gpu_cache: Optional[List[List[torch.Tensor]]] ++ self.gpu_cache: Optional[List[List[torch.Tensor]]] = None --class XPUWorker(LoraNotSupportedWorkerBase, Worker): -+class XPUWorker(Worker): - """A worker class that executes (a partition of) the model on a GPU. - - Each worker is associated with a single XPU device. The worker is -@@ -97,16 +98,74 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker): - """ + def init_device(self) -> None: + if self.device_config.device.type == "xpu" and current_platform.is_xpu( +@@ -100,6 +106,7 @@ class XPUWorker(LoRANotSupportedWorkerBase, Worker): # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. -+ flag = int(os.getenv("IPEX_LLM_FIND_MAX_LENGTH", -1)) -+ if flag != -1: -+ assert flag > 0 -+ torch.xpu.empty_cache() -+ before_memory = torch.xpu.memory_reserved() -+ max_num_batched_tokens = flag -+ max_num_seqs = 1 -+ support_input = [] -+ support_kv_cache = [] -+ while True: -+ print(f"Profiling with max_num_batched_tokens {max_num_batched_tokens}...") -+ self.model_runner.profile_run(max_num_batched_tokens, max_num_seqs) -+ torch.xpu.synchronize() -+ used_memory = torch.xpu.memory_reserved() -+ total_gpu_memory = torch.xpu.get_device_properties( -+ self.local_rank).total_memory -+ free_gpu_memory = total_gpu_memory - used_memory -+ peak_memory = self.init_gpu_memory - free_gpu_memory -+ assert peak_memory > 0 -+ cache_block_size = self.get_cache_block_size_bytes() -+ num_gpu_blocks = int( -+ (total_gpu_memory * self.cache_config.gpu_memory_utilization - -+ peak_memory) // cache_block_size) -+ num_cpu_blocks = int(self.cache_config.swap_space_bytes // -+ cache_block_size) -+ num_gpu_blocks = max(num_gpu_blocks, 0) -+ num_cpu_blocks = max(num_cpu_blocks, 0) -+ gc.collect() -+ torch.xpu.empty_cache() -+ # Begin to handle data... -+ if num_gpu_blocks == 0: -+ break -+ kv_cache_support_length = num_gpu_blocks * self.cache_config.block_size -+ # Too long input... -+ if max_num_batched_tokens > kv_cache_support_length: -+ break -+ support_input.append(max_num_batched_tokens) -+ support_kv_cache.append(kv_cache_support_length) -+ max_num_batched_tokens += 250 -+ -+ print(f"Recommended max input length: {support_input[len(support_input) - 1]}") -+ print(f"{'input length':<15} {'kv cache length':<15}") -+ print("-" * 30) -+ -+ for inp, kv in zip(support_input, support_kv_cache): -+ print(f"{inp:<15} {kv:<15}") torch.xpu.empty_cache() + before_memory = torch.xpu.memory_reserved() # Execute a forward pass with dummy inputs to profile the memory usage # of the model. -- self.model_runner.profile_run() -+ self_max_num_batched_tokens = os.getenv("IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS", None) -+ if self_max_num_batched_tokens is not None: -+ # If this get set, then profile using max input length -+ max_num_batched_tokens = int(self_max_num_batched_tokens) -+ self_max_num_seqs = os.getenv("IPEX_LLM_SELF_MAX_NUM_SEQS", None) -+ if self_max_num_seqs is not None: -+ max_num_seqs = int(self_max_num_seqs) -+ else: -+ max_num_seqs = 1 -+ self.model_runner.profile_run(max_num_batched_tokens, max_num_seqs) -+ else: -+ self.model_runner.profile_run() - +@@ -108,7 +115,7 @@ class XPUWorker(LoRANotSupportedWorkerBase, Worker): # Calculate the number of blocks that can be allocated with the # profiled peak memory. 
torch.xpu.synchronize() @@ -41646,7 +17695,7 @@ index 129566605..fb7962dfe 100644 total_gpu_memory = torch.xpu.get_device_properties( self.local_rank).total_memory free_gpu_memory = total_gpu_memory - used_memory -@@ -130,6 +189,20 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker): +@@ -132,6 +139,20 @@ class XPUWorker(LoRANotSupportedWorkerBase, Worker): num_cpu_blocks = max(num_cpu_blocks, 0) gc.collect() torch.xpu.empty_cache() @@ -41667,14 +17716,18 @@ index 129566605..fb7962dfe 100644 return num_gpu_blocks, num_cpu_blocks def _warm_up_model(self) -> None: -@@ -175,4 +248,10 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker): +@@ -177,9 +198,10 @@ class XPUWorker(LoRANotSupportedWorkerBase, Worker): parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) # global all_reduce needed for overall oneccl warm up - torch.distributed.all_reduce(torch.zeros(1).xpu()) +- + # torch.distributed.all_reduce(torch.zeros(1).xpu()) + from vllm.distributed.parallel_state import get_pp_group -+ if parallel_config.pipeline_parallel_size > 1: + if parallel_config.pipeline_parallel_size > 1: +- # Add pp group init to avoid +- # p2p communication as the first call +- get_pp_group().all_reduce(torch.zeros(1).xpu()) + # torch-ccl xpu need a collective API warm up + # before calling send/recv API + get_pp_group().all_gather(torch.zeros(1).xpu()) From 3b9ae0f3100d64dd3550352fd04037d2c57f3392 Mon Sep 17 00:00:00 2001 From: xiangyuT Date: Tue, 29 Apr 2025 10:57:55 +0800 Subject: [PATCH 15/19] fix --- python/llm/src/ipex_llm/vllm/xpu/model_convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py index a4f4a5d1903..70c01f458d3 100644 --- a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py +++ b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py @@ -102,7 +102,7 @@ def _ipex_llm_load_model(self) -> None: "chatglm" in self.vllm_config.model_config.model.lower()) and \ "gptq" not in self.model_config.model.lower() and \ "awq" not in self.model_config.model.lower() and \ - "qwen3moe" not in self.model_config.model.lower(): + "qwen3" not in self.model_config.model.lower(): self.model.apply(padding_mlp) from ipex_llm import optimize_model not_convert_last_mlp = os.getenv("IPEX_LLM_NOT_CONVERT_LAST_MLP", None) From 3ad4f03988a68ccb2090996787c5ff208194ed53 Mon Sep 17 00:00:00 2001 From: xiangyuT Date: Tue, 29 Apr 2025 11:07:43 +0800 Subject: [PATCH 16/19] update readme for vllm --- docker/llm/serving/xpu/docker/start-vllm-service.sh | 1 + docs/mddocs/DockerGuides/vllm_docker_quickstart.md | 5 ++++- python/llm/example/GPU/vLLM-Serving/README.md | 13 ++++++++++--- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/docker/llm/serving/xpu/docker/start-vllm-service.sh b/docker/llm/serving/xpu/docker/start-vllm-service.sh index 924ddb53ce7..84aab23ba1c 100644 --- a/docker/llm/serving/xpu/docker/start-vllm-service.sh +++ b/docker/llm/serving/xpu/docker/start-vllm-service.sh @@ -32,6 +32,7 @@ export TORCH_LLM_ALLREDUCE=0 export CCL_SAME_STREAM=1 export CCL_BLOCKING_WAIT=0 +export VLLM_USE_V1=0 export IPEX_LLM_LOWBIT=$LOAD_IN_LOW_BIT source /opt/intel/1ccl-wks/setvars.sh diff --git a/docs/mddocs/DockerGuides/vllm_docker_quickstart.md b/docs/mddocs/DockerGuides/vllm_docker_quickstart.md index a4959d6f31c..33fccbbef53 100644 --- a/docs/mddocs/DockerGuides/vllm_docker_quickstart.md +++ b/docs/mddocs/DockerGuides/vllm_docker_quickstart.md @@ -782,6 +782,9 @@ export 
USE_XETLA=OFF export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2 export TORCH_LLM_ALLREDUCE=0 +export VLLM_USE_V1=0 +export IPEX_LLM_LOWBIT=fp8 + source /opt/intel/1ccl-wks/setvars.sh python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \ @@ -793,7 +796,7 @@ python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \ --device xpu \ --dtype float16 \ --enforce-eager \ - --load-in-low-bit fp8 \ + --load-in-low-bit $IPEX_LLM_LOWBIT \ --max-model-len 2048 \ --max-num-batched-tokens 4000 \ --api-key \ diff --git a/python/llm/example/GPU/vLLM-Serving/README.md b/python/llm/example/GPU/vLLM-Serving/README.md index 333708ffe0a..7533049f78f 100644 --- a/python/llm/example/GPU/vLLM-Serving/README.md +++ b/python/llm/example/GPU/vLLM-Serving/README.md @@ -50,9 +50,14 @@ pip install --pre --upgrade "ipex-llm[xpu_2.6]" --extra-index-url https://pytorc pip install setuptools-scm pip install --upgrade cmake # cd to your workdir -git clone -b 0.6.6 https://github.com/analytics-zoo/vllm.git +git clone -b 0.8.3 https://github.com/analytics-zoo/vllm.git cd vllm -VLLM_TARGET_DEVICE=xpu pip install --no-build-isolation -v /llm/vllm +pip install setuptools-scm==8.2.0 setuptools==78.1.0 +pip install --upgrade cmake +pip install -v -r requirements/xpu.txt +VLLM_TARGET_DEVICE=xpu python setup.py install +pip install intel-extension-for-pytorch==2.6.10+xpu --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ +pip uninstall -y oneccl oneccl-devel # For Qwen model support pip install transformers_stream_generator einops tiktoken pip install ray @@ -93,6 +98,8 @@ For vLLM, you can start the service using the following command: model="YOUR_MODEL_PATH" served_model_name="YOUR_MODEL_NAME" export VLLM_RPC_TIMEOUT=100000 +export VLLM_USE_V1=0 +export IPEX_LLM_LOWBIT=fp8 # You may need to adjust the value of # --max-model-len, --max-num-batched-tokens, --max-num-seqs @@ -107,7 +114,7 @@ python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \ --device xpu \ --dtype float16 \ --enforce-eager \ - --load-in-low-bit sym_int4 \ + --load-in-low-bit $IPEX_LLM_LOWBIT \ --max-model-len 4096 \ --max-num-batched-tokens 10240 \ --max-num-seqs 12 \ From e5a526a2e65f2ff839ba34fa5c254208b2117dd2 Mon Sep 17 00:00:00 2001 From: xiangyuT Date: Tue, 29 Apr 2025 11:08:02 +0800 Subject: [PATCH 17/19] finish dockerfile --- docker/llm/serving/xpu/docker/Dockerfile | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docker/llm/serving/xpu/docker/Dockerfile b/docker/llm/serving/xpu/docker/Dockerfile index 7b5eaeaad5e..505ebe70137 100644 --- a/docker/llm/serving/xpu/docker/Dockerfile +++ b/docker/llm/serving/xpu/docker/Dockerfile @@ -184,10 +184,6 @@ RUN set -eux && \ pip install mpi4py fastapi uvicorn openai && \ pip install ray numba -# Install ipex-llm, should remove before merge. 
-RUN git clone -b vllm_083_0407 https://github.com/xiangyuT/ipex-llm.git /llm/ipex-llm && \ - cp -r /llm/ipex-llm/python/llm/src/ipex_llm/vllm/xpu /usr/local/lib/python3.11/dist-packages/ipex_llm/vllm && \ - cp /llm/ipex-llm/python/llm/src/ipex_llm/transformers/convert.py /usr/local/lib/python3.11/dist-packages/ipex_llm/transformers/convert.py WORKDIR /llm/ ENTRYPOINT ["bash", "/llm/start-vllm-service.sh"] From 3d299dac964a6ed1512df38c5a3073bbf30269f0 Mon Sep 17 00:00:00 2001 From: xiangyuT Date: Tue, 29 Apr 2025 12:50:46 +0800 Subject: [PATCH 18/19] update vllm --- .../llm/serving/xpu/docker/vllm_for_multi_arc.patch | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docker/llm/serving/xpu/docker/vllm_for_multi_arc.patch b/docker/llm/serving/xpu/docker/vllm_for_multi_arc.patch index eb80d89c9c9..f2fc7a078d9 100644 --- a/docker/llm/serving/xpu/docker/vllm_for_multi_arc.patch +++ b/docker/llm/serving/xpu/docker/vllm_for_multi_arc.patch @@ -13247,7 +13247,7 @@ index a7800d415..26af87512 100644 # Split concatenated embeddings for each image item. diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py new file mode 100644 -index 000000000..9c14038e6 +index 000000000..338fffed9 --- /dev/null +++ b/vllm/model_executor/models/qwen3.py @@ -0,0 +1,329 @@ @@ -13387,11 +13387,11 @@ index 000000000..9c14038e6 + # Add qk-norm + q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, + self.head_dim) -+ q_by_head = self.q_norm.forward_native(q_by_head) ++ q_by_head = self.q_norm.forward_xpu(q_by_head.contiguous()) + q = q_by_head.view(q.shape) + k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, + self.head_dim) -+ k_by_head = self.k_norm.forward_native(k_by_head) ++ k_by_head = self.k_norm.forward_xpu(k_by_head.contiguous()) + k = k_by_head.view(k.shape) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) @@ -13582,7 +13582,7 @@ index 000000000..9c14038e6 + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py new file mode 100644 -index 000000000..390bb7adf +index 000000000..bda3de2d3 --- /dev/null +++ b/vllm/model_executor/models/qwen3_moe.py @@ -0,0 +1,531 @@ @@ -13813,12 +13813,12 @@ index 000000000..390bb7adf + # Add qk-norm + q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, + self.head_dim) -+ q_by_head = self.q_norm.forward_native(q_by_head) ++ q_by_head = self.q_norm.forward_xpu(q_by_head.contiguous()) + q = q_by_head.view(q.shape) + + k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, + self.head_dim) -+ k_by_head = self.k_norm.forward_native(k_by_head) ++ k_by_head = self.k_norm.forward_xpu(k_by_head.contiguous()) + k = k_by_head.view(k.shape) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) From e8a264d6de419f89cfd9ac8ca545fa1b932dc8ca Mon Sep 17 00:00:00 2001 From: xiangyuT Date: Wed, 30 Apr 2025 13:57:39 +0800 Subject: [PATCH 19/19] refine --- python/llm/src/ipex_llm/vllm/xpu/model_convert.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py index 70c01f458d3..3d3315ab817 100644 --- a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py +++ b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py @@ -142,7 +142,6 @@ def _ipex_llm_load_model(self) -> None: self.model_memory_usage = m.consumed_memory logger = init_logger(__name__) - logger.info(self.model) 
logger.info("Loading model weights took %.4f GB", self.model_memory_usage / float(2**30))