
Commit 10f3b22

fix: OpenAI LLM prewarm

1 parent 491bee2 · commit 10f3b22

2 files changed: 188 additions, 1 deletion

livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py

Lines changed: 24 additions & 1 deletion
@@ -14,6 +14,7 @@
 
 from __future__ import annotations
 
+import asyncio
 import os
 from dataclasses import asdict, dataclass
 from typing import Any, Literal
@@ -22,7 +23,7 @@
 import httpx
 
 import openai
-from livekit.agents import llm
+from livekit.agents import llm, utils
 from livekit.agents.inference.llm import LLMStream as _LLMStream
 from livekit.agents.llm import ToolChoice, utils as llm_utils
 from livekit.agents.llm.chat_context import ChatContext
@@ -157,6 +158,7 @@ def __init__(
                 ),
             ),
         )
+        self._prewarm_task: asyncio.Task[None] | None = None
 
     @property
     def model(self) -> str:
@@ -912,6 +914,27 @@ def chat(
             extra_kwargs=extra,
         )
 
+    def prewarm(self) -> None:
+        """Pre-warm the HTTP connection pool to reduce first-request latency.
+        """
+
+        async def _prewarm_impl() -> None:
+            try:
+                await self._client.get("/", cast_to=str)
+            except Exception:
+                pass
+
+        # Cancel any existing prewarm task before creating a new one
+        if self._prewarm_task is not None and not self._prewarm_task.done():
+            self._prewarm_task.cancel()
+
+        self._prewarm_task = asyncio.create_task(_prewarm_impl())
+
+    async def aclose(self) -> None:
+        """Clean up resources including any pending prewarm tasks."""
+        if self._prewarm_task is not None:
+            await utils.aio.gracefully_cancel(self._prewarm_task)
+
 
 class LLMStream(_LLMStream):
     def __init__(
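Taken together, prewarm() fires a throwaway GET against the client's base URL (the response, most likely a 404, is discarded) purely so the underlying HTTP client opens the TCP/TLS connection before the first real request, and aclose() uses utils.aio.gracefully_cancel to cancel and await any still-pending task. A minimal usage sketch, mirroring the test file below (the model name and prompt are illustrative, not part of this commit):

import asyncio

from livekit.agents import llm
from livekit.plugins import openai


async def main() -> None:
    model = openai.LLM(model="gpt-4o-mini")
    model.prewarm()  # fire-and-forget: warms the connection in the background

    chat_ctx = llm.ChatContext()
    chat_ctx.add_message(role="user", content="Hello!")

    # The first chat() call can reuse the pre-established connection
    async for chunk in model.chat(chat_ctx=chat_ctx):
        if chunk.delta and chunk.delta.content:
            print(chunk.delta.content, end="", flush=True)

    await model.aclose()  # cancels the prewarm task if it is still pending


asyncio.run(main())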

tests/test_llm_prewarm.py

Lines changed: 164 additions & 0 deletions
"""
Test LLM prewarming functionality (Issue #3240).

This test suite verifies that the prewarm() method reduces first-request latency
by pre-establishing HTTP connections to the LLM service.
"""

from __future__ import annotations

import asyncio
import os
import time

import pytest

from livekit.agents import llm
from livekit.plugins import openai


pytestmark = pytest.mark.skipif(
    not os.environ.get("OPENAI_API_KEY"),
    reason="OPENAI_API_KEY not set",
)


@pytest.mark.asyncio
async def test_llm_prewarm_reduces_latency():
    """Test that prewarming reduces time to first token (TTFT).

    This test verifies that calling prewarm() before making an LLM request
    reduces the latency of the first request by pre-establishing the HTTP
    connection.
    """
    # Test 1: WITHOUT prewarming
    llm_no_prewarm = openai.LLM(model="gpt-4o-mini")

    chat_ctx = llm.ChatContext()
    chat_ctx.add_message(role="user", content="Say 'test' in one word only")

    start = time.perf_counter()
    stream = llm_no_prewarm.chat(chat_ctx=chat_ctx)

    # Measure time to first chunk
    ttft_no_prewarm = 0.0
    async for chunk in stream:
        if chunk.delta and chunk.delta.content:
            ttft_no_prewarm = time.perf_counter() - start
            break

    # Fully consume the stream to avoid leaks
    async for _ in stream:
        pass

    await llm_no_prewarm.aclose()

    # Test 2: WITH prewarming
    llm_with_prewarm = openai.LLM(model="gpt-4o-mini")
    llm_with_prewarm.prewarm()

    # Give the prewarm task a moment to establish the connection
    await asyncio.sleep(0.3)

    chat_ctx = llm.ChatContext()
    chat_ctx.add_message(role="user", content="Say 'test' in one word only")

    start = time.perf_counter()
    stream = llm_with_prewarm.chat(chat_ctx=chat_ctx)

    # Measure time to first chunk
    ttft_with_prewarm = 0.0
    async for chunk in stream:
        if chunk.delta and chunk.delta.content:
            ttft_with_prewarm = time.perf_counter() - start
            break

    # Fully consume the stream to avoid leaks
    async for _ in stream:
        pass

    await llm_with_prewarm.aclose()

    # We don't assert a specific improvement because network conditions vary,
    # but we print the results for visibility.
    print("Prewarm Test Results:")
    print(f"  Without prewarm: {ttft_no_prewarm:.3f}s")
    print(f"  With prewarm:    {ttft_with_prewarm:.3f}s")

    if ttft_with_prewarm < ttft_no_prewarm:
        improvement = ttft_no_prewarm - ttft_with_prewarm
        improvement_pct = (improvement / ttft_no_prewarm) * 100
        print(f"  Improvement: {improvement:.3f}s ({improvement_pct:.1f}% faster)")
    else:
        print("  No improvement detected (network conditions may vary)")

    # The test passes if both requests succeeded; we don't strictly assert
    # latency improvements due to network variability.
    assert ttft_no_prewarm > 0
    assert ttft_with_prewarm > 0


@pytest.mark.asyncio
async def test_llm_prewarm_task_cleanup():
    """Test that the prewarm task is properly cleaned up on aclose()."""
    llm_instance = openai.LLM(model="gpt-4o-mini")

    # Start prewarming
    llm_instance.prewarm()

    # Verify the task was created
    assert llm_instance._prewarm_task is not None

    # Close immediately (should cancel the prewarm task gracefully)
    await llm_instance.aclose()

    # Task should be completed or cancelled
    assert llm_instance._prewarm_task.done() or llm_instance._prewarm_task.cancelled()


@pytest.mark.asyncio
async def test_llm_prewarm_idempotent():
    """Test that calling prewarm() multiple times doesn't cause issues."""
    llm_instance = openai.LLM(model="gpt-4o-mini")

    # Call prewarm multiple times
    llm_instance.prewarm()
    first_task = llm_instance._prewarm_task

    # Calling prewarm again cancels the first task and creates a new one
    llm_instance.prewarm()
    second_task = llm_instance._prewarm_task

    # Both tasks should exist
    assert first_task is not None
    assert second_task is not None

    # Clean up; aclose() cancels any still-pending prewarm task
    await llm_instance.aclose()


@pytest.mark.asyncio
async def test_llm_works_without_prewarm():
    """Test that the LLM works normally even without calling prewarm()."""
    llm_instance = openai.LLM(model="gpt-4o-mini")

    # Don't call prewarm() at all
    chat_ctx = llm.ChatContext()
    chat_ctx.add_message(role="user", content="Say 'hello' in one word")

    stream = llm_instance.chat(chat_ctx=chat_ctx)

    # Should still work fine
    response_received = False
    async for chunk in stream:
        if chunk.delta and chunk.delta.content:
            response_received = True
            break

    # Fully consume the stream to avoid leaks
    async for _ in stream:
        pass

    await llm_instance.aclose()

    assert response_received, "Should receive a response even without prewarm"
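To exercise the suite locally, something like the following should work, assuming pytest and pytest-asyncio are installed (the @pytest.mark.asyncio markers imply the latter); -s keeps the latency printouts visible, and without OPENAI_API_KEY the whole module is skipped by the pytestmark guard:

    OPENAI_API_KEY=... pytest -s tests/test_llm_prewarm.py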
