 import torch
 import torch.distributed
 import torch.nn as nn
+import habana_frameworks.torch as htorch
 from vllm.tasks import SupportedTask
 from vllm_gaudi.extension.profiler import HabanaMemoryProfiler, format_bytes
 
@@ -77,6 +78,9 @@ def __init__(
         self.gc_track_recompiles = bool(
             "PT_HPU_METRICS_GC_DETAILS" in os.environ
             and bool_helper(os.getenv("PT_HPU_METRICS_GC_DETAILS")))
+
+        # Track sleep state per resource so wake_up() can restore weights
+        # and the KV cache independently via its `tags` argument
+        self.model_sleeping = False
+        self.kv_cache_sleeping = False
 
     def init_profiler(self):
         """Initialize the profiler."""
@@ -273,6 +277,88 @@ def profile(self, is_start: bool = True):
         else:
             self.profiler.stop()
 
+    def sleep(self, level: int = 1) -> None:
+        """Put the worker into sleep mode to reduce memory usage.
+
+        Unlike GPU workers, which use custom memory allocators, HPU workers
+        take a simpler approach: move the model to CPU and discard the KV cache.
+
+        Args:
+            level (int): Sleep level (kept for interface compatibility;
+                level 1 operations are always performed).
+        """
+
+        # Not supported in lazy mode unless eager execution is enforced
+        assert not htorch.utils.internal.is_lazy(
+        ) or self.model_config.enforce_eager, "Sleep mode is supported only for torch.compile mode"
+
+        # Handle model - if the model was loaded, move it to CPU
+        if self.model_sleeping:
+            logger.warning("Model is already in sleep mode, skipping moving it to CPU")
+        elif hasattr(self.model_runner, "model") and self.model_runner.model is not None:
+            with HabanaMemoryProfiler() as m:
+                self.model_runner.model.to("cpu")
+                torch.hpu.synchronize()
+            msg = f"Moving model to CPU for sleep mode took {m.get_summary_string()}"
+            logger.info(msg)
+            self.model_sleeping = True
+        else:
+            logger.warning("Model was not loaded yet, skipping moving it to CPU")
+
+        # Handle KV cache - discard it
+        if self.kv_cache_sleeping:
+            logger.warning("KV cache is already in sleep mode, skipping discarding it")
+        else:
+            with HabanaMemoryProfiler() as m:
+                for ve in range(self.parallel_config.pipeline_parallel_size):
+                    del self.cache_engine[ve].gpu_cache
+                    del self.cache_engine[ve].cpu_cache
+                self.cache_engine.clear()
+                self.hpu_cache.clear()
+                self.hpu_cache = None
+                # Swap per-layer KV cache references for empty placeholder
+                # tensors so no layer keeps the freed HPU memory alive
+                for layer_name in self.compilation_config.static_forward_context:
+                    self.compilation_config.static_forward_context[layer_name].kv_cache.clear()
+                    self.compilation_config.static_forward_context[layer_name].kv_cache = [
+                        torch.tensor([]) for _ in range(self.parallel_config.pipeline_parallel_size)
+                    ]
+                torch.hpu.synchronize()
+            msg = f"Discarding KV cache for sleep mode took {m.get_summary_string()}"
+            logger.info(msg)
+            self.kv_cache_sleeping = True
+
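+    # Caller-side sketch of entering sleep mode (hypothetical; `worker` is
+    # an assumed, already-initialized instance, not defined in this diff):
+    #
+    #     worker.sleep()  # weights moved to CPU, KV cache freed on HPU
+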
+    def wake_up(self, tags: list[str] | None = None) -> None:
+        """Wake up the worker from sleep mode.
+
+        Moves the model back to HPU and/or reinitializes the KV cache,
+        depending on the requested tags.
+
+        Args:
+            tags: Optional list of resources to restore ("weights",
+                "kv_cache"); defaults to both.
+        """
+        assert not htorch.utils.internal.is_lazy(
+        ) or self.model_config.enforce_eager, "Sleep mode is supported only for torch.compile mode"
+
+        if tags is None:
+            tags = ["weights", "kv_cache"]
+
+        # Handle model - if the model was loaded, move it back to HPU
+        if "weights" in tags:
+            if not self.model_sleeping:
+                logger.warning("Model is not in sleep mode, skipping moving it to HPU")
+            elif hasattr(self.model_runner, "model") and self.model_runner.model is not None:
+                with HabanaMemoryProfiler() as m:
+                    self.model_runner.model.to(self.device)
+                    torch.hpu.synchronize()
+                msg = f"Waking up model, moving it back to HPU took {m.get_summary_string()}"
+                logger.info(msg)
+                self.model_sleeping = False
+            else:
+                logger.warning("Model was not loaded yet, skipping moving it to HPU")
+
+        # Handle KV cache - reinitialize it
+        if "kv_cache" in tags:
+            if not self.kv_cache_sleeping:
+                logger.warning("KV cache is not in sleep mode, skipping reinitializing it")
+            else:
+                with HabanaMemoryProfiler() as m:
+                    self._init_cache_engine()
+                    torch.hpu.synchronize()
+                msg = f"Waking up KV cache, reinitializing it took {m.get_summary_string()}"
+                logger.info(msg)
+                self.kv_cache_sleeping = False
+
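+    # Selective wake-up sketch (hypothetical; mirrors the `tags` handling
+    # above, with `worker` an assumed instance not defined in this diff):
+    #
+    #     worker.wake_up(tags=["weights"])   # restore weights only
+    #     worker.wake_up(tags=["kv_cache"])  # rebuild the KV cache only
+    #     worker.wake_up()                   # default: restore both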
 
 def init_worker_distributed_environment(
     parallel_config: ParallelConfig,
@@ -297,7 +383,6 @@ def init_worker_distributed_environment(
 
 @contextmanager
 def track_graph_compile(name: str):
-    import habana_frameworks.torch as htorch
     from habana_frameworks.torch.hpu.metrics import metric_localcontext
     with metric_localcontext("graph_compilation") as gc:
         yield