
Commit dc482f1

Add a metric to track request failures
Signed-off-by: Harshal Patil <[email protected]>
1 parent 27bebcd commit dc482f1

File tree: 7 files changed, +257 -5 lines changed

docs/design/v1/metrics.md

Lines changed: 4 additions & 2 deletions

@@ -31,6 +31,7 @@ In v0, the following metrics are exposed via a Prometheus-compatible `/metrics`
 - `vllm:prompt_tokens_total` (Counter)
 - `vllm:generation_tokens_total` (Counter)
 - `vllm:request_success_total` (Counter)
+- `vllm:request_failed_total` (Counter)
 - `vllm:request_prompt_tokens` (Histogram)
 - `vllm:request_generation_tokens` (Histogram)
 - `vllm:time_to_first_token_seconds` (Histogram)
@@ -75,6 +76,7 @@ The subset of metrics exposed in the Grafana dashboard gives us an indication of
 - `vllm:request_prompt_tokens` - Request prompt length
 - `vllm:request_generation_tokens` - request generation length
 - `vllm:request_success_total` - Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached
+- `vllm:request_failed_total` - Number of failed requests, including both engine-level failures (aborted/ignored by the scheduler) and API-level failures (validation errors, invalid parameters, etc.)
 - `vllm:request_queue_time_seconds` - Queue Time
 - `vllm:request_prefill_time_seconds` - Requests Prefill Time
 - `vllm:request_decode_time_seconds` - Requests Decode Time
@@ -597,7 +599,7 @@ see:
 - [Inference
   Perf](https://github.com/kubernetes-sigs/wg-serving/tree/main/proposals/013-inference-perf)
 - <gh-issue:5041> and <gh-pr:12726>.
-
+
 This is a non-trivial topic. Consider this comment from Rob:
 
 > I think this metric should focus on trying to estimate what the max
@@ -678,7 +680,7 @@ v0 has support for OpenTelemetry tracing:
   post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f)
 - [IBM product
   docs](https://www.ibm.com/docs/en/instana-observability/current?topic=mgaa-monitoring-large-language-models-llms-vllm-public-preview)
-
+
 OpenTelemetry has a [Gen AI Working
   Group](https://github.com/open-telemetry/community/blob/main/projects/gen-ai.md).
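
For a quick check outside Grafana, the new counter can be read straight from the server's `/metrics` endpoint. A minimal sketch, assuming a vLLM OpenAI-compatible server listening on localhost:8000; the URL and the choice to sum over label sets are illustrative, not part of this commit:

# Minimal sketch: scrape /metrics and sum vllm:request_failed_total across
# label sets (the counter carries labels such as model_name).
# Assumes a vLLM server on localhost:8000; adjust the URL for your deployment.
import requests
from prometheus_client.parser import text_string_to_metric_families

resp = requests.get("http://localhost:8000/metrics", timeout=5)
resp.raise_for_status()

failed_total = 0.0
for family in text_string_to_metric_families(resp.text):
    for sample in family.samples:
        if sample.name == "vllm:request_failed_total":
            failed_total += sample.value

print(f"vllm:request_failed_total = {failed_total}")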

examples/online_serving/prometheus_grafana/grafana.json

Lines changed: 103 additions & 2 deletions

@@ -1177,6 +1177,107 @@
       "title": "Finish Reason",
       "type": "timeseries"
     },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "description": "Number of failed requests tracked by vLLM",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 1
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 32
+      },
+      "id": 17,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "increase(vllm:request_failed_total{model_name=\"$model_name\"}[$__rate_interval])",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "interval": "",
+          "legendFormat": "Failed Requests",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        }
+      ],
+      "title": "Failed Requests",
+      "type": "timeseries"
+    },
     {
       "datasource": {
         "default": false,
@@ -1341,7 +1442,7 @@
         "h": 8,
         "w": 12,
         "x": 0,
-        "y": 40
+        "y": 48
       },
       "id": 15,
       "options": {
@@ -1454,7 +1555,7 @@
         "h": 8,
         "w": 12,
         "x": 12,
-        "y": 40
+        "y": 48
       },
       "id": 16,
       "options": {

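The new panel charts `increase(vllm:request_failed_total{model_name="$model_name"}[$__rate_interval])`. To spot-check the same data outside Grafana, the expression can be sent to the Prometheus HTTP API directly. A minimal sketch, assuming Prometheus is scraping the vLLM server and is reachable at localhost:9090; the fixed 5m window stands in for Grafana's `$__rate_interval`, and the model_name value is illustrative:

# Minimal sketch: evaluate the panel's PromQL expression via the Prometheus
# HTTP API. Assumes Prometheus at localhost:9090 scrapes the vLLM server;
# "my-model" and the 5m window are illustrative stand-ins.
import requests

query = 'increase(vllm:request_failed_total{model_name="my-model"}[5m])'
resp = requests.get("http://localhost:9090/api/v1/query",
                    params={"query": query},
                    timeout=5)
resp.raise_for_status()

for series in resp.json()["data"]["result"]:
    _timestamp, value = series["value"]
    print(series["metric"].get("model_name", "<none>"), value)
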
tests/entrypoints/openai/test_metrics.py

Lines changed: 87 additions & 0 deletions

@@ -114,6 +114,7 @@ async def client(server):
         ("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)
     ],
     "vllm:request_success": [("_total", _NUM_REQUESTS)],
+    # Note: vllm:request_failed tested separately in test_failed_requests_metric
 }
 
 
@@ -213,6 +214,7 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
     "vllm:prompt_tokens_total",
     "vllm:generation_tokens_total",
     "vllm:request_success_total",
+    "vllm:request_failed_total",
     "vllm:cache_config_info",
     # labels in cache_config_info
     "block_size",
@@ -239,6 +241,7 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
     "vllm:iteration_tokens_total",
     "vllm:cache_config_info",
     "vllm:request_success_total",
+    "vllm:request_failed_total",
     "vllm:request_prompt_tokens_sum",
     "vllm:request_prompt_tokens_bucket",
     "vllm:request_prompt_tokens_count",
@@ -343,3 +346,87 @@ def is_server_up(url):
     assert response.status_code == HTTPStatus.OK
 
     proc.wait()
+
+
+@pytest.mark.asyncio
+async def test_failed_requests_metric(server: RemoteOpenAIServer,
+                                      client: openai.AsyncClient,
+                                      use_v1: bool):
+    """Test that failed requests are properly counted in metrics."""
+    response = requests.get(server.url_for("metrics"))
+    assert response.status_code == HTTPStatus.OK
+
+    initial_failed_count = 0
+    for family in text_string_to_metric_families(response.text):
+        if family.name == "vllm:request_failed_total":
+            for sample in family.samples:
+                if sample.name == "vllm:request_failed_total":
+                    initial_failed_count = sample.value
+                    break
+
+    total_failed_attempts = 3
+    actual_failed_attempts = 0
+
+    # Attempt 1: Use a non-existent model (should fail with 404)
+    try:
+        await client.completions.create(model="non-existent-model-name",
+                                        prompt="Test prompt",
+                                        max_tokens=10)
+    except Exception:
+        actual_failed_attempts += 1
+
+    # Attempt 2: Use invalid temperature (outside valid range)
+    try:
+        await client.completions.create(
+            model=MODEL_NAME,
+            prompt="Test prompt",
+            max_tokens=10,
+            temperature=-1.0  # Invalid temperature
+        )
+    except Exception:
+        actual_failed_attempts += 1
+
+    # Attempt 3: Use max_tokens that exceeds model limit
+    try:
+        await client.completions.create(
+            model=MODEL_NAME,
+            prompt="Test prompt",
+            max_tokens=100000  # Very large number that should cause failure
+        )
+    except Exception:
+        actual_failed_attempts += 1
+
+    assert actual_failed_attempts == total_failed_attempts, (
+        f"Expected {total_failed_attempts} failed attempts, "
+        f"got {actual_failed_attempts}")
+
+    time.sleep(1)
+
+    response = requests.get(server.url_for("metrics"))
+    assert response.status_code == HTTPStatus.OK
+
+    found_failed_metric = False
+    final_failed_count = 0
+    for family in text_string_to_metric_families(response.text):
+        if family.name == "vllm:request_failed_total":
+            found_failed_metric = True
+            for sample in family.samples:
+                if sample.name == "vllm:request_failed_total":
+                    final_failed_count = sample.value
+                    break
+            break
+
+    assert found_failed_metric, (
+        "vllm:request_failed_total metric not found in metrics output")
+
+    print(f"Initial failed count: {initial_failed_count}, "
+          f"Final failed count: {final_failed_count}")
+    print(f"Failed request attempts: {actual_failed_attempts}")
+
+    assert final_failed_count >= initial_failed_count, (
+        f"Expected failed count to be at least {initial_failed_count}, "
+        f"but got {final_failed_count}")
+
+    if actual_failed_attempts == total_failed_attempts:
+        print(f"Successfully tested failed request metric tracking with "
+              f"{actual_failed_attempts} failed attempts")

vllm/engine/llm_engine.py

Lines changed: 9 additions & 1 deletion

@@ -1285,7 +1285,7 @@ def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]:
             engine = LLMEngine.from_engine_args(engine_args)
             example_inputs = [(0, "What is LLM?",
                SamplingParams(temperature=0.0))]
-
+
             # Start the engine with an event loop
             while True:
                 if example_inputs:
@@ -1690,6 +1690,7 @@ def _get_stats(self,
         max_num_generation_tokens_requests: List[int] = []
         max_tokens_requests: List[int] = []
         finished_reason_requests: List[str] = []
+        failed_requests: List[str] = []
 
         # LoRA requests
         running_lora_adapters = dict(
@@ -1817,6 +1818,12 @@
                     SequenceStatus.get_finished_reason(seq.status)
                     for seq in seq_group.get_finished_seqs()
                 ])
+                failed_requests.extend([
+                    SequenceStatus.get_finished_reason(seq.status)
+                    for seq in seq_group.get_finished_seqs()
+                    if seq.status in (SequenceStatus.FINISHED_ABORTED,
+                                      SequenceStatus.FINISHED_IGNORED)
+                ])
 
                 # Number of generation tokens.
                 # num_batched_tokens equals the number of prompt_tokens plus the
@@ -1878,6 +1885,7 @@
             n_requests=n_requests,
             max_tokens_requests=max_tokens_requests,
             finished_reason_requests=finished_reason_requests,
+            failed_requests=failed_requests,
             max_lora=str(max_lora_stat),
             waiting_lora_adapters=list(waiting_lora_adapters.keys()),
             running_lora_adapters=list(running_lora_adapters.keys()))
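
The engine-level half of the metric comes from the filter added above: a finished sequence is counted as failed only when its status is FINISHED_ABORTED or FINISHED_IGNORED, while sequences that stopped normally or hit their length cap keep feeding `vllm:request_success_total`. A minimal, self-contained sketch of that partitioning, using a simplified stand-in for vLLM's SequenceStatus enum (the member set and comments here are illustrative, not the real class):

from enum import Enum, auto
from typing import List, Tuple


# Simplified stand-in for vllm.sequence.SequenceStatus; the real enum has
# more members plus helpers such as get_finished_reason().
class SeqStatus(Enum):
    FINISHED_STOPPED = auto()        # EOS / stop string -> success
    FINISHED_LENGTH_CAPPED = auto()  # hit max_tokens -> success
    FINISHED_ABORTED = auto()        # aborted (e.g. client disconnect) -> failed
    FINISHED_IGNORED = auto()        # ignored by the scheduler -> failed


_FAILED = {SeqStatus.FINISHED_ABORTED, SeqStatus.FINISHED_IGNORED}


def partition_finished(statuses: List[SeqStatus]) -> Tuple[int, int]:
    """Return (num_success, num_failed), mirroring the filter in _get_stats."""
    num_failed = sum(status in _FAILED for status in statuses)
    return len(statuses) - num_failed, num_failed


if __name__ == "__main__":
    batch = [SeqStatus.FINISHED_STOPPED, SeqStatus.FINISHED_ABORTED,
             SeqStatus.FINISHED_IGNORED]
    print(partition_finished(batch))  # -> (1, 2)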

vllm/engine/metrics.py

Lines changed: 30 additions & 0 deletions

@@ -21,6 +21,12 @@
 if TYPE_CHECKING:
     from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics
 
+try:
+    from vllm.entrypoints.openai.api_server import (
+        get_api_failed_requests_count)
+except ImportError:
+    get_api_failed_requests_count = None
+
 logger = init_logger(__name__)
 
 prometheus_client.disable_created_metrics()
@@ -267,6 +273,10 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
             name="vllm:request_success_total",
             documentation="Count of successfully processed requests.",
             labelnames=labelnames + [Metrics.labelname_finish_reason])
+        self.counter_request_failed = self._counter_cls(
+            name="vllm:request_failed_total",
+            documentation="Count of failed requests (engine and API level).",
+            labelnames=labelnames)
 
         # Speculative decoding stats
         self.gauge_spec_decode_draft_acceptance_rate = self._gauge_cls(
@@ -547,6 +557,8 @@ def __init__(self, local_interval: float, labels: Dict[str, str],
         self.labels = labels
         self.metrics = self._metrics_cls(labelnames=list(labels.keys()),
                                          vllm_config=vllm_config)
+        # Track previous API failed count for incremental logging
+        self.last_api_failed_count = 0
 
     def _log_gauge(self, gauge, data: Union[int, float]) -> None:
         # Convenience function for logging to gauge.
@@ -646,6 +658,24 @@ def _log_prometheus(self, stats: Stats) -> None:
         self._log_counter_labels(self.metrics.counter_request_success,
                                  finished_reason_counter,
                                  Metrics.labelname_finish_reason)
+        # Log failed requests (engine-level failures)
+        engine_failed_count = len(stats.failed_requests)
+        # Clear the list to avoid double-counting on subsequent calls
+        stats.failed_requests.clear()
+
+        # Log API-level failures (if available)
+        api_failed_increment = 0
+        if get_api_failed_requests_count is not None:
+            current_api_failed_count = get_api_failed_requests_count()
+            # Only count the incremental increase since last collection
+            api_failed_increment = current_api_failed_count - self.last_api_failed_count
+            self.last_api_failed_count = current_api_failed_count
+
+        # Log total failed requests (engine + API increments)
+        total_failed_increment = engine_failed_count + api_failed_increment
+        if total_failed_increment > 0:
+            self._log_counter(self.metrics.counter_request_failed,
+                              total_failed_increment)
         self._log_histogram(self.metrics.histogram_num_prompt_tokens_request,
                             stats.num_prompt_tokens_requests)
         self._log_histogram(
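
The logger imports `get_api_failed_requests_count` from the OpenAI api_server behind a try/except so the engine still works when the API server module is absent, and on each logging interval it adds only the delta since the previous interval, which keeps the Prometheus counter monotonic. The api_server change itself is not among the diffs shown on this page (presumably it is the seventh changed file); a hypothetical sketch of the helper it is assumed to provide:

# Hypothetical sketch of the API-server-side helper that metrics.py imports.
# The actual vllm/entrypoints/openai/api_server.py change is not shown in the
# diffs above; names and structure here are assumptions for illustration.
import threading

_api_failed_requests = 0
_api_failed_lock = threading.Lock()


def record_api_failed_request() -> None:
    """Called from request exception handlers (validation errors, 404s, ...)."""
    global _api_failed_requests
    with _api_failed_lock:
        _api_failed_requests += 1


def get_api_failed_requests_count() -> int:
    """Running total of API-level failures since process start."""
    with _api_failed_lock:
        return _api_failed_requests

With a running total like this on the API side, the `current - last` delta computed in `_log_prometheus` above is what actually lands on `vllm:request_failed_total`, together with the engine-level failures carried in `Stats.failed_requests`.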

vllm/engine/metrics_types.py

Lines changed: 1 addition & 0 deletions

@@ -63,6 +63,7 @@ class Stats:
     max_num_generation_tokens_requests: List[int]
     max_tokens_requests: List[int]
     finished_reason_requests: List[str]
+    failed_requests: List[str]
     waiting_lora_adapters: List[str]
     running_lora_adapters: List[str]
     max_lora: str
