diff --git a/docs/design/v1/metrics.md b/docs/design/v1/metrics.md index 7156ee9dd3ec..84c434212616 100644 --- a/docs/design/v1/metrics.md +++ b/docs/design/v1/metrics.md @@ -31,6 +31,7 @@ In v0, the following metrics are exposed via a Prometheus-compatible `/metrics` - `vllm:prompt_tokens_total` (Counter) - `vllm:generation_tokens_total` (Counter) - `vllm:request_success_total` (Counter) +- `vllm:request_failed_total` (Counter) - `vllm:request_prompt_tokens` (Histogram) - `vllm:request_generation_tokens` (Histogram) - `vllm:time_to_first_token_seconds` (Histogram) @@ -75,6 +76,7 @@ The subset of metrics exposed in the Grafana dashboard gives us an indication of - `vllm:request_prompt_tokens` - Request prompt length - `vllm:request_generation_tokens` - request generation length - `vllm:request_success_total` - Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached +- `vllm:request_failed_total` - Number of failed requests including both engine-level failures (aborted/ignored by scheduler) and API-level failures (validation errors, invalid parameters, etc.) - `vllm:request_queue_time_seconds` - Queue Time - `vllm:request_prefill_time_seconds` - Requests Prefill Time - `vllm:request_decode_time_seconds` - Requests Decode Time @@ -597,7 +599,7 @@ see: - [Inference Perf](https://github.com/kubernetes-sigs/wg-serving/tree/main/proposals/013-inference-perf) - and . - + This is a non-trivial topic. Consider this comment from Rob: > I think this metric should focus on trying to estimate what the max @@ -678,7 +680,7 @@ v0 has support for OpenTelemetry tracing: post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f) - [IBM product docs](https://www.ibm.com/docs/en/instana-observability/current?topic=mgaa-monitoring-large-language-models-llms-vllm-public-preview) - + OpenTelemetry has a [Gen AI Working Group](https://github.com/open-telemetry/community/blob/main/projects/gen-ai.md).
diff --git a/examples/online_serving/prometheus_grafana/grafana.json b/examples/online_serving/prometheus_grafana/grafana.json index fbe96b48e799..d199ce1c16b7 100644 --- a/examples/online_serving/prometheus_grafana/grafana.json +++ b/examples/online_serving/prometheus_grafana/grafana.json @@ -1177,6 +1177,107 @@ "title": "Finish Reason", "type": "timeseries" }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of failed requests tracked by vLLM", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 17, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "increase(vllm:request_failed_total{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "interval": "", + "legendFormat": "Failed Requests", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Failed Requests", + "type": "timeseries" + }, { "datasource": { "default": false, @@ -1341,7 +1442,7 @@ "h": 8, "w": 12, "x": 0, - "y": 40 + "y": 48 }, "id": 15, "options": { @@ -1454,7 +1555,7 @@ "h": 8, "w": 12, "x": 12, - "y": 40 + "y": 48 }, "id": 16, "options": { diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 42f7b098f917..72f12dc727ec 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -114,6 +114,7 @@ async def client(server): ("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST) ], "vllm:request_success": [("_total", _NUM_REQUESTS)], + # Note: vllm:request_failed tested separately in test_failed_requests_metric } @@ -213,6 +214,7 @@ async def test_metrics_counts(server: RemoteOpenAIServer, "vllm:prompt_tokens_total", "vllm:generation_tokens_total", "vllm:request_success_total", + "vllm:request_failed_total", "vllm:cache_config_info", # labels in cache_config_info "block_size", @@ -239,6 +241,7 @@ async def test_metrics_counts(server: RemoteOpenAIServer, "vllm:iteration_tokens_total", "vllm:cache_config_info", "vllm:request_success_total", + "vllm:request_failed_total", "vllm:request_prompt_tokens_sum", "vllm:request_prompt_tokens_bucket", "vllm:request_prompt_tokens_count", @@ -343,3 +346,87 @@ def is_server_up(url): assert response.status_code == HTTPStatus.OK proc.wait() + + +@pytest.mark.asyncio +async def 
test_failed_requests_metric(server: RemoteOpenAIServer, + client: openai.AsyncClient, + use_v1: bool): + """Test that failed requests are properly counted in metrics.""" + response = requests.get(server.url_for("metrics")) + assert response.status_code == HTTPStatus.OK + + initial_failed_count = 0 + for family in text_string_to_metric_families(response.text): + if family.name == "vllm:request_failed": + for sample in family.samples: + if sample.name == "vllm:request_failed_total": + initial_failed_count = sample.value + break + + total_failed_attempts = 3 + actual_failed_attempts = 0 + + # Attempt 1: Use a non-existent model (should fail with 404) + try: + await client.completions.create(model="non-existent-model-name", + prompt="Test prompt", + max_tokens=10) + except Exception: + actual_failed_attempts += 1 + + # Attempt 2: Use invalid temperature (outside valid range) + try: + await client.completions.create( + model=MODEL_NAME, + prompt="Test prompt", + max_tokens=10, + temperature=-1.0 # Invalid temperature + ) + except Exception: + actual_failed_attempts += 1 + + # Attempt 3: Use max_tokens that exceeds model limit + try: + await client.completions.create( + model=MODEL_NAME, + prompt="Test prompt", + max_tokens=100000 # Very large number that should cause failure + ) + except Exception: + actual_failed_attempts += 1 + + assert actual_failed_attempts == total_failed_attempts, ( + f"Expected {total_failed_attempts} failed attempts, " + f"got {actual_failed_attempts}") + + time.sleep(1) + + response = requests.get(server.url_for("metrics")) + assert response.status_code == HTTPStatus.OK + + found_failed_metric = False + final_failed_count = 0 + for family in text_string_to_metric_families(response.text): + if family.name == "vllm:request_failed": + found_failed_metric = True + for sample in family.samples: + if sample.name == "vllm:request_failed_total": + final_failed_count = sample.value + break + break + + assert found_failed_metric, ( + "vllm:request_failed_total metric not found in metrics output") + + print(f"Initial failed count: {initial_failed_count}, " + f"Final failed count: {final_failed_count}") + print(f"Failed request attempts: {actual_failed_attempts}") + + assert final_failed_count >= initial_failed_count, ( + f"Expected failed count to be at least {initial_failed_count}, " + f"but got {final_failed_count}") + + if actual_failed_attempts == total_failed_attempts: + print(f"Successfully tested failed request metric tracking with " + f"{actual_failed_attempts} failed attempts") diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 5ca3ebe91d12..f8f3546b0fb7 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1285,7 +1285,7 @@ def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]: engine = LLMEngine.from_engine_args(engine_args) example_inputs = [(0, "What is LLM?", SamplingParams(temperature=0.0))] - + # Start the engine with an event loop while True: if example_inputs: @@ -1690,6 +1690,7 @@ def _get_stats(self, max_num_generation_tokens_requests: List[int] = [] max_tokens_requests: List[int] = [] finished_reason_requests: List[str] = [] + failed_requests: List[str] = [] # LoRA requests running_lora_adapters = dict( @@ -1817,6 +1818,12 @@ def _get_stats(self, SequenceStatus.get_finished_reason(seq.status) for seq in seq_group.get_finished_seqs() ]) + failed_requests.extend([ + SequenceStatus.get_finished_reason(seq.status) + for seq in seq_group.get_finished_seqs() + if seq.status in 
(SequenceStatus.FINISHED_ABORTED, + SequenceStatus.FINISHED_IGNORED) + ]) # Number of generation tokens. # num_batched_tokens equals the number of prompt_tokens plus the @@ -1878,6 +1885,7 @@ def _get_stats(self, n_requests=n_requests, max_tokens_requests=max_tokens_requests, finished_reason_requests=finished_reason_requests, + failed_requests=failed_requests, max_lora=str(max_lora_stat), waiting_lora_adapters=list(waiting_lora_adapters.keys()), running_lora_adapters=list(running_lora_adapters.keys())) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 34b48f83b643..f8ef20ed73da 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -21,6 +21,12 @@ if TYPE_CHECKING: from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics +try: + from vllm.entrypoints.openai.api_server import ( + get_api_failed_requests_count) +except ImportError: + get_api_failed_requests_count = None + logger = init_logger(__name__) prometheus_client.disable_created_metrics() @@ -267,6 +273,10 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig): name="vllm:request_success_total", documentation="Count of successfully processed requests.", labelnames=labelnames + [Metrics.labelname_finish_reason]) + self.counter_request_failed = self._counter_cls( + name="vllm:request_failed_total", + documentation="Count of failed requests (engine and API level).", + labelnames=labelnames) # Speculative decoding stats self.gauge_spec_decode_draft_acceptance_rate = self._gauge_cls( @@ -547,6 +557,8 @@ def __init__(self, local_interval: float, labels: Dict[str, str], self.labels = labels self.metrics = self._metrics_cls(labelnames=list(labels.keys()), vllm_config=vllm_config) + # Track previous API failed count for incremental logging + self.last_api_failed_count = 0 def _log_gauge(self, gauge, data: Union[int, float]) -> None: # Convenience function for logging to gauge. 
@@ -646,6 +658,25 @@ def _log_prometheus(self, stats: Stats) -> None: self._log_counter_labels(self.metrics.counter_request_success, finished_reason_counter, Metrics.labelname_finish_reason) + # Log failed requests (engine-level failures) + engine_failed_count = len(stats.failed_requests) + # Clear the list to avoid double-counting on subsequent calls + stats.failed_requests.clear() + + # Log API-level failures (if available) + api_failed_increment = 0 + if get_api_failed_requests_count is not None: + current_api_failed_count = get_api_failed_requests_count() + # Only count the incremental increase since last collection + api_failed_increment = (current_api_failed_count - + self.last_api_failed_count) + self.last_api_failed_count = current_api_failed_count + + # Log total failed requests (engine + API increments) + total_failed_increment = engine_failed_count + api_failed_increment + if total_failed_increment > 0: + self._log_counter(self.metrics.counter_request_failed, + total_failed_increment) self._log_histogram(self.metrics.histogram_num_prompt_tokens_request, stats.num_prompt_tokens_requests) self._log_histogram( diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index 9e6d5ef29bed..661f7cc46d4c 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -63,6 +63,7 @@ class Stats: max_num_generation_tokens_requests: List[int] max_tokens_requests: List[int] finished_reason_requests: List[str] + failed_requests: List[str] waiting_lora_adapters: List[str] running_lora_adapters: List[str] max_lora: str diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 2da89b4f5944..c43f6cf72f6a 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -10,6 +10,7 @@ import signal import socket import tempfile +import threading import uuid from argparse import Namespace from collections.abc import AsyncIterator @@ -110,6 +111,24 @@ _running_tasks: set[asyncio.Task] = set() +# Global counter for API-level failed requests +_api_failed_requests_counter = 0 +_api_failed_requests_lock = threading.Lock() + + +def increment_api_failed_requests(): + """Thread-safe increment of the API-level failed requests counter.""" + global _api_failed_requests_counter + with _api_failed_requests_lock: + _api_failed_requests_counter += 1 + + +def get_api_failed_requests_count(): + """Thread-safe getter for the API-level failed requests counter.""" + global _api_failed_requests_counter + with _api_failed_requests_lock: + return _api_failed_requests_counter + @asynccontextmanager async def lifespan(app: FastAPI): @@ -1057,6 +1076,8 @@ def build_app(args: Namespace) -> FastAPI: @app.exception_handler(HTTPException) async def http_exception_handler(_: Request, exc: HTTPException): + # Track API-level failed requests + increment_api_failed_requests() err = ErrorResponse(message=exc.detail, type=HTTPStatus(exc.status_code).phrase, code=exc.status_code) @@ -1065,6 +1086,8 @@ async def http_exception_handler(_: Request, exc: HTTPException): @app.exception_handler(RequestValidationError) async def validation_exception_handler(_: Request, exc: RequestValidationError): + # Track API-level failed requests + increment_api_failed_requests() exc_str = str(exc) errors_str = str(exc.errors())
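
For anyone reviewing the new counter by hand, the sketch below is a minimal, self-contained way to read `vllm:request_failed_total` from a running server's `/metrics` endpoint, using the same `prometheus_client` parser the test relies on. It is illustrative only and not part of the patch: the base URL, the helper name `read_failed_counter`, and the invalid-model request used to trigger an API-level failure are assumptions, and the counter only reaches Prometheus on the engine's next stats-logging pass, so the value may lag the failure.

```python
# Illustrative sketch (not part of the patch). Assumes a vLLM OpenAI server
# is listening on http://localhost:8000 and that the counter is named
# vllm:request_failed_total as introduced above.
import requests
from prometheus_client.parser import text_string_to_metric_families

BASE_URL = "http://localhost:8000"  # assumed local deployment


def read_failed_counter(base_url: str = BASE_URL) -> float:
    """Return the current value of vllm:request_failed_total (0.0 if absent)."""
    text = requests.get(f"{base_url}/metrics", timeout=5).text
    for family in text_string_to_metric_families(text):
        # prometheus_client strips the "_total" suffix from counter family
        # names, so the family is "vllm:request_failed" while each sample
        # keeps the full "vllm:request_failed_total" name.
        if family.name == "vllm:request_failed":
            return sum(sample.value for sample in family.samples
                       if sample.name == "vllm:request_failed_total")
    return 0.0


if __name__ == "__main__":
    before = read_failed_counter()
    # Trigger an API-level failure: an unknown model name should be rejected
    # by the server and routed through the HTTPException handler above.
    requests.post(f"{BASE_URL}/v1/completions",
                  json={"model": "no-such-model", "prompt": "hi"},
                  timeout=5)
    after = read_failed_counter()
    print(f"failed requests: before={before} after={after}")
```

Because the export is driven by the engine's stats loop, `after` may not reflect the failure immediately; polling `/metrics` for a few seconds, as the test's `time.sleep(1)` hints at, is the more robust check.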