6 changes: 4 additions & 2 deletions docs/design/v1/metrics.md
@@ -31,6 +31,7 @@ In v0, the following metrics are exposed via a Prometheus-compatible `/metrics`
- `vllm:prompt_tokens_total` (Counter)
- `vllm:generation_tokens_total` (Counter)
- `vllm:request_success_total` (Counter)
- `vllm:request_failed_total` (Counter)
- `vllm:request_prompt_tokens` (Histogram)
- `vllm:request_generation_tokens` (Histogram)
- `vllm:time_to_first_token_seconds` (Histogram)
@@ -75,6 +76,7 @@ The subset of metrics exposed in the Grafana dashboard gives us an indication of
- `vllm:request_prompt_tokens` - Request prompt length
- `vllm:request_generation_tokens` - request generation length
- `vllm:request_success_total` - Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached
- `vllm:request_failed_total` - Number of failed requests, including both engine-level failures (requests aborted or ignored by the scheduler) and API-level failures (validation errors, invalid parameters, etc.)
- `vllm:request_queue_time_seconds` - Queue Time
- `vllm:request_prefill_time_seconds` - Requests Prefill Time
- `vllm:request_decode_time_seconds` - Requests Decode Time
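As a quick sanity check outside Grafana, the new counter can also be read straight from the server's `/metrics` endpoint. A minimal sketch, assuming a vLLM server on `localhost:8000` and using the same `prometheus_client` parser as the tests in this PR (the parser reports counter families without the `_total` suffix; only the samples carry it):

```python
import requests
from prometheus_client.parser import text_string_to_metric_families

# Scrape the Prometheus-compatible endpoint exposed by the vLLM server.
# The URL is an assumption; point it at wherever the server is running.
metrics_text = requests.get("http://localhost:8000/metrics").text

failed_total = 0.0
for family in text_string_to_metric_families(metrics_text):
    # Counter families are reported without the "_total" suffix;
    # the individual samples keep it.
    if family.name == "vllm:request_failed":
        for sample in family.samples:
            if sample.name == "vllm:request_failed_total":
                failed_total += sample.value

print(f"vllm:request_failed_total = {failed_total}")
```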
@@ -597,7 +599,7 @@ see:
- [Inference
Perf](https://github.com/kubernetes-sigs/wg-serving/tree/main/proposals/013-inference-perf)
- <gh-issue:5041> and <gh-pr:12726>.

This is a non-trivial topic. Consider this comment from Rob:

> I think this metric should focus on trying to estimate what the max
@@ -678,7 +680,7 @@ v0 has support for OpenTelemetry tracing:
post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f)
- [IBM product
docs](https://www.ibm.com/docs/en/instana-observability/current?topic=mgaa-monitoring-large-language-models-llms-vllm-public-preview)

OpenTelemetry has a [Gen AI Working
Group](https://github.com/open-telemetry/community/blob/main/projects/gen-ai.md).

105 changes: 103 additions & 2 deletions examples/online_serving/prometheus_grafana/grafana.json
@@ -1177,6 +1177,107 @@
"title": "Finish Reason",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Number of failed requests tracked by vLLM",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 1
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 32
},
"id": 17,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "increase(vllm:request_failed_total{model_name=\"$model_name\"}[$__rate_interval])",
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
"interval": "",
"legendFormat": "Failed Requests",
"range": true,
"refId": "A",
"useBackend": false
}
],
"title": "Failed Requests",
"type": "timeseries"
},
{
"datasource": {
"default": false,
@@ -1341,7 +1442,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 40
"y": 48
},
"id": 15,
"options": {
@@ -1454,7 +1555,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 40
"y": 48
},
"id": 16,
"options": {
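The panel added above plots `increase(vllm:request_failed_total{model_name="$model_name"}[$__rate_interval])`, i.e. failures accumulated per interval rather than the raw counter value. The same expression can be checked against the Prometheus HTTP API; a rough sketch, assuming Prometheus is reachable at `localhost:9090`, a fixed 5-minute window in place of `$__rate_interval`, and a placeholder model name:

```python
import requests

# Evaluate the panel's expression via the Prometheus HTTP API.
# The Prometheus address, model name, and 5m window are assumptions.
query = 'increase(vllm:request_failed_total{model_name="facebook/opt-125m"}[5m])'
resp = requests.get(
    "http://localhost:9090/api/v1/query",
    params={"query": query},
)
resp.raise_for_status()

for series in resp.json()["data"]["result"]:
    timestamp, value = series["value"]
    print(f"{series['metric']} -> {float(value):.1f} failed requests in the last 5m")
```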
87 changes: 87 additions & 0 deletions tests/entrypoints/openai/test_metrics.py
@@ -114,6 +114,7 @@ async def client(server):
("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)
],
"vllm:request_success": [("_total", _NUM_REQUESTS)],
# Note: vllm:request_failed tested separately in test_failed_requests_metric
}


@@ -213,6 +214,7 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
"vllm:prompt_tokens_total",
"vllm:generation_tokens_total",
"vllm:request_success_total",
"vllm:request_failed_total",
"vllm:cache_config_info",
# labels in cache_config_info
"block_size",
@@ -239,6 +241,7 @@
"vllm:iteration_tokens_total",
"vllm:cache_config_info",
"vllm:request_success_total",
"vllm:request_failed_total",
"vllm:request_prompt_tokens_sum",
"vllm:request_prompt_tokens_bucket",
"vllm:request_prompt_tokens_count",
@@ -343,3 +346,87 @@ def is_server_up(url):
assert response.status_code == HTTPStatus.OK

proc.wait()


@pytest.mark.asyncio
async def test_failed_requests_metric(server: RemoteOpenAIServer,
client: openai.AsyncClient,
use_v1: bool):
"""Test that failed requests are properly counted in metrics."""
response = requests.get(server.url_for("metrics"))
assert response.status_code == HTTPStatus.OK

initial_failed_count = 0
for family in text_string_to_metric_families(response.text):
        # prometheus_client reports counter families without the "_total"
        # suffix; only the individual samples carry it.
        if family.name == "vllm:request_failed":
for sample in family.samples:
if sample.name == "vllm:request_failed_total":
initial_failed_count = sample.value
break

total_failed_attempts = 3
actual_failed_attempts = 0

# Attempt 1: Use a non-existent model (should fail with 404)
try:
await client.completions.create(model="non-existent-model-name",
prompt="Test prompt",
max_tokens=10)
except Exception:
actual_failed_attempts += 1

# Attempt 2: Use invalid temperature (outside valid range)
try:
await client.completions.create(
model=MODEL_NAME,
prompt="Test prompt",
max_tokens=10,
temperature=-1.0 # Invalid temperature
)
except Exception:
actual_failed_attempts += 1

# Attempt 3: Use max_tokens that exceeds model limit
try:
await client.completions.create(
model=MODEL_NAME,
prompt="Test prompt",
            max_tokens=100000  # exceeds the model's max context length, so validation should fail
)
except Exception:
actual_failed_attempts += 1

assert actual_failed_attempts == total_failed_attempts, (
f"Expected {total_failed_attempts} failed attempts, "
f"got {actual_failed_attempts}")

    # Give the stat loggers a moment to flush the failure counts.
    time.sleep(1)

response = requests.get(server.url_for("metrics"))
assert response.status_code == HTTPStatus.OK

found_failed_metric = False
final_failed_count = 0
for family in text_string_to_metric_families(response.text):
        if family.name == "vllm:request_failed":
found_failed_metric = True
for sample in family.samples:
if sample.name == "vllm:request_failed_total":
final_failed_count = sample.value
break
break

assert found_failed_metric, (
"vllm:request_failed_total metric not found in metrics output")

print(f"Initial failed count: {initial_failed_count}, "
f"Final failed count: {final_failed_count}")
print(f"Failed request attempts: {actual_failed_attempts}")

    # API-level failures are only folded into the Prometheus counter when the
    # engine's stat loggers next run, so assert that the counter did not
    # decrease rather than requiring an exact increment.
    assert final_failed_count >= initial_failed_count, (
        f"Expected failed count to be at least {initial_failed_count}, "
        f"but got {final_failed_count}")

    print(f"Successfully tested failed request metric tracking with "
          f"{actual_failed_attempts} failed attempts")
10 changes: 9 additions & 1 deletion vllm/engine/llm_engine.py
@@ -1285,7 +1285,7 @@ def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]:
engine = LLMEngine.from_engine_args(engine_args)
example_inputs = [(0, "What is LLM?",
SamplingParams(temperature=0.0))]

# Start the engine with an event loop
while True:
if example_inputs:
@@ -1690,6 +1690,7 @@ def _get_stats(self,
max_num_generation_tokens_requests: List[int] = []
max_tokens_requests: List[int] = []
finished_reason_requests: List[str] = []
failed_requests: List[str] = []

# LoRA requests
running_lora_adapters = dict(
@@ -1817,6 +1818,12 @@
SequenceStatus.get_finished_reason(seq.status)
for seq in seq_group.get_finished_seqs()
])
failed_requests.extend([
SequenceStatus.get_finished_reason(seq.status)
for seq in seq_group.get_finished_seqs()
if seq.status in (SequenceStatus.FINISHED_ABORTED,
SequenceStatus.FINISHED_IGNORED)
])

# Number of generation tokens.
# num_batched_tokens equals the number of prompt_tokens plus the
@@ -1878,6 +1885,7 @@ def _get_stats(self,
n_requests=n_requests,
max_tokens_requests=max_tokens_requests,
finished_reason_requests=finished_reason_requests,
failed_requests=failed_requests,
max_lora=str(max_lora_stat),
waiting_lora_adapters=list(waiting_lora_adapters.keys()),
running_lora_adapters=list(running_lora_adapters.keys()))
31 changes: 31 additions & 0 deletions vllm/engine/metrics.py
@@ -21,6 +21,12 @@
if TYPE_CHECKING:
from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics

try:
from vllm.entrypoints.openai.api_server import (
get_api_failed_requests_count)
except ImportError:
get_api_failed_requests_count = None

logger = init_logger(__name__)

prometheus_client.disable_created_metrics()
@@ -267,6 +273,10 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
name="vllm:request_success_total",
documentation="Count of successfully processed requests.",
labelnames=labelnames + [Metrics.labelname_finish_reason])
self.counter_request_failed = self._counter_cls(
name="vllm:request_failed_total",
documentation="Count of failed requests (engine and API level).",
labelnames=labelnames)

# Speculative decoding stats
self.gauge_spec_decode_draft_acceptance_rate = self._gauge_cls(
@@ -547,6 +557,8 @@ def __init__(self, local_interval: float, labels: Dict[str, str],
self.labels = labels
self.metrics = self._metrics_cls(labelnames=list(labels.keys()),
vllm_config=vllm_config)
# Track previous API failed count for incremental logging
self.last_api_failed_count = 0

def _log_gauge(self, gauge, data: Union[int, float]) -> None:
# Convenience function for logging to gauge.
@@ -646,6 +658,25 @@ def _log_prometheus(self, stats: Stats) -> None:
self._log_counter_labels(self.metrics.counter_request_success,
finished_reason_counter,
Metrics.labelname_finish_reason)
# Log failed requests (engine-level failures)
engine_failed_count = len(stats.failed_requests)
# Clear the list to avoid double-counting on subsequent calls
stats.failed_requests.clear()

# Log API-level failures (if available)
api_failed_increment = 0
if get_api_failed_requests_count is not None:
current_api_failed_count = get_api_failed_requests_count()
# Only count the incremental increase since last collection
api_failed_increment = (current_api_failed_count -
self.last_api_failed_count)
self.last_api_failed_count = current_api_failed_count

# Log total failed requests (engine + API increments)
total_failed_increment = engine_failed_count + api_failed_increment
if total_failed_increment > 0:
self._log_counter(self.metrics.counter_request_failed,
total_failed_increment)
self._log_histogram(self.metrics.histogram_num_prompt_tokens_request,
stats.num_prompt_tokens_requests)
self._log_histogram(
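The logger above reads `get_api_failed_requests_count()` and adds only the delta since the previous read, so API-level failures are not double-counted across logging intervals. The `api_server` side of the PR is not shown in this section; a minimal sketch of a counter with a compatible interface, assuming a module-level count guarded by a lock (the helper name `record_api_failed_request` is illustrative, not the PR's actual API):

```python
import threading

# Hypothetical API-side failure counter matching the interface that
# metrics.py imports above. The real implementation lives in
# vllm/entrypoints/openai/api_server.py and may differ; this sketch only
# illustrates the monotonic-total-plus-delta pattern used by the logger.
_api_failed_requests = 0
_api_failed_lock = threading.Lock()


def record_api_failed_request() -> None:
    """Called from request exception handlers when validation fails."""
    global _api_failed_requests
    with _api_failed_lock:
        _api_failed_requests += 1


def get_api_failed_requests_count() -> int:
    """Return the total number of API-level failures seen so far."""
    with _api_failed_lock:
        return _api_failed_requests
```

Exposing a monotonically increasing total and letting the Prometheus logger compute the increment keeps the API layer free of any direct Prometheus dependency.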
1 change: 1 addition & 0 deletions vllm/engine/metrics_types.py
@@ -63,6 +63,7 @@ class Stats:
max_num_generation_tokens_requests: List[int]
max_tokens_requests: List[int]
finished_reason_requests: List[str]
failed_requests: List[str]
waiting_lora_adapters: List[str]
running_lora_adapters: List[str]
max_lora: str