6 changes: 4 additions & 2 deletions docs/design/v1/metrics.md
@@ -31,6 +31,7 @@ In v0, the following metrics are exposed via a Prometheus-compatible `/metrics`
- `vllm:prompt_tokens_total` (Counter)
- `vllm:generation_tokens_total` (Counter)
- `vllm:request_success_total` (Counter)
- `vllm:request_failed_total` (Counter)
- `vllm:request_prompt_tokens` (Histogram)
- `vllm:request_generation_tokens` (Histogram)
- `vllm:time_to_first_token_seconds` (Histogram)
@@ -75,6 +76,7 @@ The subset of metrics exposed in the Grafana dashboard gives us an indication of
- `vllm:request_prompt_tokens` - Request prompt length
- `vllm:request_generation_tokens` - request generation length
- `vllm:request_success_total` - Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached
- `vllm:request_failed_total` - Number of failed requests, including both engine-level failures (requests aborted or ignored by the scheduler) and API-level failures (validation errors, invalid parameters, etc.)
- `vllm:request_queue_time_seconds` - Queue Time
- `vllm:request_prefill_time_seconds` - Requests Prefill Time
- `vllm:request_decode_time_seconds` - Requests Decode Time
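As a quick sanity check outside Grafana, the new counter can also be read straight from the server's `/metrics` endpoint. A minimal sketch, assuming a vLLM server on `localhost:8000` and using the same `prometheus_client` parser as the tests in this PR (the parser reports counter families without the `_total` suffix; only the samples carry it):

```python
import requests
from prometheus_client.parser import text_string_to_metric_families

# Scrape the Prometheus-compatible endpoint exposed by the vLLM server.
# The URL is an assumption; point it at wherever the server is running.
metrics_text = requests.get("http://localhost:8000/metrics").text

failed_total = 0.0
for family in text_string_to_metric_families(metrics_text):
    # Counter families are reported without the "_total" suffix;
    # the individual samples keep it.
    if family.name == "vllm:request_failed":
        for sample in family.samples:
            if sample.name == "vllm:request_failed_total":
                failed_total += sample.value

print(f"vllm:request_failed_total = {failed_total}")
```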
@@ -597,7 +599,7 @@ see:
- [Inference
Perf](https://github.com/kubernetes-sigs/wg-serving/tree/main/proposals/013-inference-perf)
- <gh-issue:5041> and <gh-pr:12726>.

This is a non-trivial topic. Consider this comment from Rob:

> I think this metric should focus on trying to estimate what the max
@@ -678,7 +680,7 @@ v0 has support for OpenTelemetry tracing:
post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f)
- [IBM product
docs](https://www.ibm.com/docs/en/instana-observability/current?topic=mgaa-monitoring-large-language-models-llms-vllm-public-preview)

OpenTelemetry has a [Gen AI Working
Group](https://github.com/open-telemetry/community/blob/main/projects/gen-ai.md).

105 changes: 103 additions & 2 deletions examples/online_serving/prometheus_grafana/grafana.json
@@ -1177,6 +1177,107 @@
"title": "Finish Reason",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Number of failed requests tracked by vLLM",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 1
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 32
},
"id": 17,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "increase(vllm:request_failed_total{model_name=\"$model_name\"}[$__rate_interval])",
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
"interval": "",
"legendFormat": "Failed Requests",
"range": true,
"refId": "A",
"useBackend": false
}
],
"title": "Failed Requests",
"type": "timeseries"
},
{
"datasource": {
"default": false,
@@ -1341,7 +1442,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 40
"y": 48
},
"id": 15,
"options": {
@@ -1454,7 +1555,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 40
"y": 48
},
"id": 16,
"options": {
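The panel added above plots `increase(vllm:request_failed_total{model_name="$model_name"}[$__rate_interval])`, i.e. failures accumulated per interval rather than the raw counter value. The same expression can be checked against the Prometheus HTTP API; a rough sketch, assuming Prometheus is reachable at `localhost:9090`, a fixed 5-minute window in place of `$__rate_interval`, and a placeholder model name:

```python
import requests

# Evaluate the panel's expression via the Prometheus HTTP API.
# The Prometheus address, model name, and 5m window are assumptions.
query = 'increase(vllm:request_failed_total{model_name="facebook/opt-125m"}[5m])'
resp = requests.get(
    "http://localhost:9090/api/v1/query",
    params={"query": query},
)
resp.raise_for_status()

for series in resp.json()["data"]["result"]:
    timestamp, value = series["value"]
    print(f"{series['metric']} -> {float(value):.1f} failed requests in the last 5m")
```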
87 changes: 87 additions & 0 deletions tests/entrypoints/openai/test_metrics.py
@@ -114,6 +114,7 @@ async def client(server):
("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)
],
"vllm:request_success": [("_total", _NUM_REQUESTS)],
# Note: vllm:request_failed tested separately in test_failed_requests_metric
}


@@ -213,6 +214,7 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
"vllm:prompt_tokens_total",
"vllm:generation_tokens_total",
"vllm:request_success_total",
"vllm:request_failed_total",
"vllm:cache_config_info",
# labels in cache_config_info
"block_size",
@@ -239,6 +241,7 @@
"vllm:iteration_tokens_total",
"vllm:cache_config_info",
"vllm:request_success_total",
"vllm:request_failed_total",
"vllm:request_prompt_tokens_sum",
"vllm:request_prompt_tokens_bucket",
"vllm:request_prompt_tokens_count",
@@ -343,3 +346,87 @@ def is_server_up(url):
assert response.status_code == HTTPStatus.OK

proc.wait()


@pytest.mark.asyncio
async def test_failed_requests_metric(server: RemoteOpenAIServer,
client: openai.AsyncClient,
use_v1: bool):
"""Test that failed requests are properly counted in metrics."""
response = requests.get(server.url_for("metrics"))
assert response.status_code == HTTPStatus.OK

initial_failed_count = 0
for family in text_string_to_metric_families(response.text):
        # prometheus_client reports counter families without the "_total"
        # suffix; only the individual samples carry it.
        if family.name == "vllm:request_failed":
for sample in family.samples:
if sample.name == "vllm:request_failed_total":
initial_failed_count = sample.value
break

total_failed_attempts = 3
actual_failed_attempts = 0

# Attempt 1: Use a non-existent model (should fail with 404)
try:
await client.completions.create(model="non-existent-model-name",
prompt="Test prompt",
max_tokens=10)
except Exception:
actual_failed_attempts += 1

# Attempt 2: Use invalid temperature (outside valid range)
try:
await client.completions.create(
model=MODEL_NAME,
prompt="Test prompt",
max_tokens=10,
temperature=-1.0 # Invalid temperature
)
except Exception:
actual_failed_attempts += 1

# Attempt 3: Use max_tokens that exceeds model limit
try:
await client.completions.create(
model=MODEL_NAME,
prompt="Test prompt",
            max_tokens=100000  # exceeds the model's max context length, so validation should fail
)
except Exception:
actual_failed_attempts += 1

assert actual_failed_attempts == total_failed_attempts, (
f"Expected {total_failed_attempts} failed attempts, "
f"got {actual_failed_attempts}")

    # Give the stat loggers a moment to flush the failure counts.
    time.sleep(1)

response = requests.get(server.url_for("metrics"))
assert response.status_code == HTTPStatus.OK

found_failed_metric = False
final_failed_count = 0
for family in text_string_to_metric_families(response.text):
        if family.name == "vllm:request_failed":
found_failed_metric = True
for sample in family.samples:
if sample.name == "vllm:request_failed_total":
final_failed_count = sample.value
break
break

assert found_failed_metric, (
"vllm:request_failed_total metric not found in metrics output")

print(f"Initial failed count: {initial_failed_count}, "
f"Final failed count: {final_failed_count}")
print(f"Failed request attempts: {actual_failed_attempts}")

    # API-level failures are only folded into the Prometheus counter when the
    # engine's stat loggers next run, so assert that the counter did not
    # decrease rather than requiring an exact increment.
    assert final_failed_count >= initial_failed_count, (
        f"Expected failed count to be at least {initial_failed_count}, "
        f"but got {final_failed_count}")

    print(f"Successfully tested failed request metric tracking with "
          f"{actual_failed_attempts} failed attempts")
10 changes: 9 additions & 1 deletion vllm/engine/llm_engine.py
@@ -1285,7 +1285,7 @@ def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]:
engine = LLMEngine.from_engine_args(engine_args)
example_inputs = [(0, "What is LLM?",
SamplingParams(temperature=0.0))]

# Start the engine with an event loop
while True:
if example_inputs:
@@ -1690,6 +1690,7 @@ def _get_stats(self,
max_num_generation_tokens_requests: List[int] = []
max_tokens_requests: List[int] = []
finished_reason_requests: List[str] = []
failed_requests: List[str] = []

# LoRA requests
running_lora_adapters = dict(
@@ -1817,6 +1818,12 @@
SequenceStatus.get_finished_reason(seq.status)
for seq in seq_group.get_finished_seqs()
])
failed_requests.extend([
SequenceStatus.get_finished_reason(seq.status)
for seq in seq_group.get_finished_seqs()
if seq.status in (SequenceStatus.FINISHED_ABORTED,
SequenceStatus.FINISHED_IGNORED)
])

# Number of generation tokens.
# num_batched_tokens equals the number of prompt_tokens plus the
@@ -1878,6 +1885,7 @@ def _get_stats(self,
n_requests=n_requests,
max_tokens_requests=max_tokens_requests,
finished_reason_requests=finished_reason_requests,
failed_requests=failed_requests,
max_lora=str(max_lora_stat),
waiting_lora_adapters=list(waiting_lora_adapters.keys()),
running_lora_adapters=list(running_lora_adapters.keys()))
31 changes: 31 additions & 0 deletions vllm/engine/metrics.py
@@ -21,6 +21,12 @@
if TYPE_CHECKING:
from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics

try:
from vllm.entrypoints.openai.api_server import (
get_api_failed_requests_count)
except ImportError:
get_api_failed_requests_count = None

logger = init_logger(__name__)

prometheus_client.disable_created_metrics()
@@ -267,6 +273,10 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
name="vllm:request_success_total",
documentation="Count of successfully processed requests.",
labelnames=labelnames + [Metrics.labelname_finish_reason])
self.counter_request_failed = self._counter_cls(
name="vllm:request_failed_total",
documentation="Count of failed requests (engine and API level).",
labelnames=labelnames)

# Speculative decoding stats
self.gauge_spec_decode_draft_acceptance_rate = self._gauge_cls(
@@ -547,6 +557,8 @@ def __init__(self, local_interval: float, labels: Dict[str, str],
self.labels = labels
self.metrics = self._metrics_cls(labelnames=list(labels.keys()),
vllm_config=vllm_config)
# Track previous API failed count for incremental logging
self.last_api_failed_count = 0

def _log_gauge(self, gauge, data: Union[int, float]) -> None:
# Convenience function for logging to gauge.
@@ -646,6 +658,25 @@ def _log_prometheus(self, stats: Stats) -> None:
self._log_counter_labels(self.metrics.counter_request_success,
finished_reason_counter,
Metrics.labelname_finish_reason)
# Log failed requests (engine-level failures)
engine_failed_count = len(stats.failed_requests)
# Clear the list to avoid double-counting on subsequent calls
stats.failed_requests.clear()

# Log API-level failures (if available)
api_failed_increment = 0
if get_api_failed_requests_count is not None:
current_api_failed_count = get_api_failed_requests_count()
# Only count the incremental increase since last collection
api_failed_increment = (current_api_failed_count -
self.last_api_failed_count)
self.last_api_failed_count = current_api_failed_count

# Log total failed requests (engine + API increments)
total_failed_increment = engine_failed_count + api_failed_increment
if total_failed_increment > 0:
self._log_counter(self.metrics.counter_request_failed,
total_failed_increment)
self._log_histogram(self.metrics.histogram_num_prompt_tokens_request,
stats.num_prompt_tokens_requests)
self._log_histogram(
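The logger above reads `get_api_failed_requests_count()` and adds only the delta since the previous read, so API-level failures are not double-counted across logging intervals. The `api_server` side of the PR is not shown in this section; a minimal sketch of a counter with a compatible interface, assuming a module-level count guarded by a lock (the helper name `record_api_failed_request` is illustrative, not the PR's actual API):

```python
import threading

# Hypothetical API-side failure counter matching the interface that
# metrics.py imports above. The real implementation lives in
# vllm/entrypoints/openai/api_server.py and may differ; this sketch only
# illustrates the monotonic-total-plus-delta pattern used by the logger.
_api_failed_requests = 0
_api_failed_lock = threading.Lock()


def record_api_failed_request() -> None:
    """Called from request exception handlers when validation fails."""
    global _api_failed_requests
    with _api_failed_lock:
        _api_failed_requests += 1


def get_api_failed_requests_count() -> int:
    """Return the total number of API-level failures seen so far."""
    with _api_failed_lock:
        return _api_failed_requests
```

Exposing a monotonically increasing total and letting the Prometheus logger compute the increment keeps the API layer free of any direct Prometheus dependency.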
1 change: 1 addition & 0 deletions vllm/engine/metrics_types.py
@@ -63,6 +63,7 @@ class Stats:
max_num_generation_tokens_requests: List[int]
max_tokens_requests: List[int]
finished_reason_requests: List[str]
failed_requests: List[str]
waiting_lora_adapters: List[str]
running_lora_adapters: List[str]
max_lora: str