[FIX] Suppression of stacktrace on a shutdown (#187)

wallashss · tjohnson31415 · web-flow · commit 2d2bae2bd01a · 2025-06-03T20:15:00.000Z
---------

Signed-off-by: Wallas Santos &lt;wallashss@ibm.com&gt;
Co-authored-by: Travis Johnson &lt;tsjohnso@us.ibm.com&gt;
diff --git a/tests/spyre_util.py b/tests/spyre_util.py
@@ -174,6 +174,9 @@ def generate_spyre_vllm_output(model: str, prompts: list[str],
         str(val) for val in warmup_batch_size)
     os.environ['VLLM_SPYRE_DYNAMO_BACKEND'] = backend
     os.environ['VLLM_USE_V1'] = "1" if vllm_version == "V1" else "0"
+    # Allows running the multiprocess V1 engine without dumping meaningless
+    # logs at engine shutdown in this context.
+    os.environ['VLLM_SPYRE_OVERRIDE_SIGNALS_HANDLER'] = "1"
 
     vllm_model = LLM(model=model,
                      tokenizer=model,
diff --git a/vllm_spyre/envs.py b/vllm_spyre/envs.py
@@ -10,6 +10,7 @@
     VLLM_SPYRE_RM_PADDED_BLOCKS: bool = False
     VLLM_SPYRE_PERF_METRIC_LOGGING_ENABLED: int = 0
     VLLM_SPYRE_PERF_METRIC_LOGGING_DIR: str = "/tmp"
+    VLLM_SPYRE_OVERRIDE_SIGNALS_HANDLER: bool = False
 
 # --8<-- [start:env-vars-definition]
 environment_variables: dict[str, Callable[[], Any]] = {
@@ -68,6 +69,12 @@
     # logs are written to /tmp.
     "VLLM_SPYRE_PERF_METRIC_LOGGING_DIR":
     lambda: os.getenv("VLLM_SPYRE_PERF_METRIC_LOGGING_DIR", "/tmp"),
+
+    # If enabled (the default), override the signal handlers for vllm-spyre
+    # on vLLM V1 + torch_sendnn backend so that the engine can shut down
+    # gracefully. Set to 0 to disable.
+    "VLLM_SPYRE_OVERRIDE_SIGNALS_HANDLER":
+    lambda: bool(int(os.getenv("VLLM_SPYRE_OVERRIDE_SIGNALS_HANDLER", "1"))),
 }
 # --8<-- [end:env-vars-definition]
 
diff --git a/vllm_spyre/v1/worker/spyre_worker.py b/vllm_spyre/v1/worker/spyre_worker.py
@@ -2,6 +2,7 @@
 import json
 import os
 import platform
+import signal
 import time
 from typing import Optional, Union, cast
 
@@ -406,6 +407,8 @@ def _warmup_spyre_dynamic_size(self, special_token_ids):
         logger.info("Warmup finished.")
         logger.info("Warmup took %.3fs", warmup_total_t)
 
+        maybe_override_signals_handler()
+
     def _warmup_spyre_fixed_size(self, prompt_len, num_decode_tokens,
                                  special_token_ids, batch_size):
 
@@ -524,6 +527,7 @@ def _warmup_spyre_fixed_size(self, prompt_len, num_decode_tokens,
         logger.info(
             "Warmup took %.3fs (for prompt length %d and max output tokens %d)",
             warmup_total_t, prompt_len, num_decode_tokens)
+        maybe_override_signals_handler()
 
     def _warmup_model_forward_pass(
         self,
@@ -566,3 +570,27 @@ def execute_model(
     ) -> Optional[ModelRunnerOutput]:
         output = self.model_runner.execute_model(scheduler_output)
         return output if self.is_driver_worker else None
+
+
+# Ref: https://github.com/vllm-project/vllm/blob/5fbbfe9a4c13094ad72ed3d6b4ef208a7ddc0fd7/vllm/v1/executor/multiproc_executor.py#L446 # noqa: E501
+# TODO: review this in the future
+# This setup is a workaround to suppress logs that are dumped at engine
+# shutdown (V1 only) when vLLM runs with multiprocessing. The undesired
+# behavior happens because g3log from the Spyre runtime overrides vLLM's
+# signal handler when it starts a process for the engine code. Therefore,
+# the engine does not get a chance to shut down gracefully.
+def maybe_override_signals_handler():
+    if not (envs.VLLM_USE_V1 and envs.VLLM_ENABLE_V1_MULTIPROCESSING
+            and envs_spyre.VLLM_SPYRE_OVERRIDE_SIGNALS_HANDLER):
+        return  # all three flags must be truthy, otherwise do nothing
+
+    shutdown_requested = False  # flips to True on the first signal handled
+
+    def signal_handler(signum, frame):
+        nonlocal shutdown_requested
+        if not shutdown_requested:
+            shutdown_requested = True
+            raise SystemExit()  # only the first signal exits; repeats no-op
+
+    signal.signal(signal.SIGTERM, signal_handler)
+    signal.signal(signal.SIGINT, signal_handler)