1313import asyncio
1414import contextlib
1515import importlib .util
16+ import inspect
1617import os
1718import traceback
1819
2425import uvloop
2526
2627if TYPE_CHECKING :
28+ import socket
29+
2730 from vllm .engine .async_llm_engine import AsyncLLMEngine
2831 from vllm .engine .protocol import AsyncEngineClient
2932
4750else :
4851 # Third Party
4952 from vllm_tgis_adapter .grpc import run_grpc_server
53+ from vllm_tgis_adapter .http import build_http_server
5054 from vllm_tgis_adapter .tgis_utils .args import (
5155 EnvVarArgumentParser ,
5256 add_tgis_args ,
6064async def run_http_server (
6165 args : argparse .Namespace ,
6266 engine : AsyncLLMEngine | AsyncEngineClient ,
67+ sock : socket .socket | None = None ,
6368 ** uvicorn_kwargs , # noqa: ANN003
6469) -> None :
6570 # modified copy of vllm.entrypoints.openai.api_server.run_server that
@@ -81,6 +86,10 @@ async def run_http_server(
8186 }
8287 serve_kwargs .update (uvicorn_kwargs )
8388
89+ # only pass `sock` when serve_http accepts it (expected for vllm >= 0.7.3)
90+ if "sock" in inspect .getfullargspec (serve_http ).args :
91+ serve_kwargs ["sock" ] = sock
92+
8493 shutdown_coro = await serve_http (app , ** serve_kwargs )
8594
8695 # launcher.serve_http returns a shutdown coroutine to await
@@ -94,19 +103,27 @@ async def start_servers(args: argparse.Namespace) -> None:
94103 """
95104 loop = asyncio .get_running_loop ()
96105
106+ # workaround to make sure that we bind the port before the engine is set up.
107+ # This avoids race conditions with ray.
108+ # see https://github.com/vllm-project/vllm/issues/8204
109+ sock_addr = (args .host or "" , args .port )
110+ sock = api_server .create_server_socket (sock_addr )
111+
97112 tasks : list [asyncio .Task ] = []
98113 async with api_server .build_async_engine_client (args ) as engine :
99114 add_logging_wrappers (engine )
100115
116+ vllm_server = await build_http_server (args , engine )
117+
101118 http_server_task = loop .create_task (
102- run_http_server (args , engine ),
119+ run_http_server (args , engine , sock ),
103120 name = "http_server" ,
104121 )
105122 # The http server task will catch interrupt signals for us
106123 tasks .append (http_server_task )
107124
108125 grpc_server_task = loop .create_task (
109- run_grpc_server (args , engine ),
126+ run_grpc_server (args , engine , vllm_server ),
110127 name = "grpc_server" ,
111128 )
112129 tasks .append (grpc_server_task )
0 commit comments