 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest, ErrorResponse
-from vllm.entrypoints.openai.serving_engine import BaseModelPath
+from vllm.entrypoints.openai.serving_engine import BaseModelPath, LoRAModulePath
+
 
 from utils import DummyRequest, JobInput, BatchSize, create_error_response
 from constants import DEFAULT_MAX_CONCURRENCY, DEFAULT_BATCH_SIZE, DEFAULT_BATCH_SIZE_GROWTH_FACTOR, DEFAULT_MIN_BATCH_SIZE
@@ -128,21 +129,32 @@ async def _initialize_engines(self):
         self.base_model_paths = [
             BaseModelPath(name=self.engine_args.model, model_path=self.engine_args.model)
         ]
+
+        lora_modules = os.getenv('LORA_MODULES', None)
+        if lora_modules is not None:
+            try:
+                lora_modules = json.loads(lora_modules)
+                lora_modules = [LoRAModulePath(**lora_modules)]
+            except:
+                lora_modules = None
+
+
+
         self.chat_engine = OpenAIServingChat(
             engine_client=self.llm,
             model_config=self.model_config,
             base_model_paths=self.base_model_paths,
             response_role=self.response_role,
             chat_template=self.tokenizer.tokenizer.chat_template,
-            lora_modules=None,
+            lora_modules=lora_modules,
             prompt_adapters=None,
             request_logger=None
         )
         self.completion_engine = OpenAIServingCompletion(
             engine_client=self.llm,
             model_config=self.model_config,
             base_model_paths=self.base_model_paths,
-            lora_modules=[],
+            lora_modules=lora_modules,
             prompt_adapters=None,
             request_logger=None
         )
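A note on the new env handling in this hunk: the `json.loads` plus `LoRAModulePath(**...)` pair expects `LORA_MODULES` to decode to a single JSON object whose keys match `LoRAModulePath`'s fields (roughly `name` and `path` in the vLLM versions this code targets); a JSON array would raise inside the `try` and silently fall back to no adapters. A minimal sketch of setting the variable from Python, with a hypothetical adapter name and path:

    import json, os

    # Hypothetical adapter; the keys must line up with LoRAModulePath's constructor.
    os.environ["LORA_MODULES"] = json.dumps(
        {"name": "my-lora", "path": "/runpod-volume/loras/my-lora"}
    )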
@@ -158,9 +170,6 @@ async def generate(self, openai_request: JobInput):
 
     async def _handle_model_request(self):
         models = await self.chat_engine.show_available_models()
-        fixed_model = models.data[0]
-        fixed_model.id = self.served_model_name
-        models.data = [fixed_model]
         return models.model_dump()
 
     async def _handle_chat_or_completion_request(self, openai_request: JobInput):
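On the second hunk: with the hard-coded single-entry override removed, `_handle_model_request` returns whatever `show_available_models()` reports, which (assuming the LoRA wiring above succeeded) covers the base model plus each registered adapter rather than only `served_model_name`. A hedged sketch of the resulting ids, run inside an async context and with illustrative names:

    models = await self.chat_engine.show_available_models()
    print([m.id for m in models.data])
    # e.g. ['meta-llama/Llama-3.1-8B-Instruct', 'my-lora']  (both ids illustrative)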