
Commit 247016b

Fix the insufficient-memory bug when creating cuBLAS handles in the multi-GPU case, and reduce the memory fraction in the KV cache config.
Signed-off-by: Wangshanshan <[email protected]>
1 parent bbf1175 commit 247016b

File tree: 2 files changed, +34 −24 lines


cpp/tensorrt_llm/common/opUtils.cpp

Lines changed: 30 additions & 20 deletions
@@ -238,28 +238,35 @@ class PerCudaCtxPerThreadSingletonCreator
     std::unordered_map<CacheKey, std::weak_ptr<T>, hash<CacheKey>> mObservers;
 };
 
+// Helper function to log memory usage - returns the memory values for potential error handling
+static std::pair<size_t, size_t> logMemoryUsage(char const* operation, CUcontext ctx)
+{
+    size_t free_mem = 0, total_mem = 0;
+    TLLM_CUDA_CHECK(cudaMemGetInfo(&free_mem, &total_mem));
+
+    TLLM_LOG_DEBUG("%s: Context=%p, Free Memory=%zu MB (%.1f%%), Total=%zu MB", operation, ctx,
+        free_mem / (1024 * 1024), (float) free_mem / total_mem * 100.0, total_mem / (1024 * 1024));
+
+    return {free_mem, total_mem};
+}
+
 } // namespace
 
 std::shared_ptr<cublasHandle_t> getCublasHandle()
 {
     static PerCudaCtxPerThreadSingletonCreator<cublasHandle_t> creator(
         []() -> auto
         {
-            size_t free_mem = 0, total_mem = 0;
-            cudaMemGetInfo(&free_mem, &total_mem);
-
-            CUcontext ctx;
-            cuCtxGetCurrent(&ctx);
+            CUcontext ctx = getCurrentCudaCtx();
+            auto [free_mem, total_mem] = logMemoryUsage("Creating cublas handle", ctx);
 
-            TLLM_LOG_DEBUG("Creating cublas handle: Context=%p, Free Memory=%zu MB (%.1f%%), Total=%zu MB", ctx,
-                free_mem / (1024 * 1024), (float) free_mem / total_mem * 100.0, total_mem / (1024 * 1024));
-
-            auto handle = std::unique_ptr<cublasHandle_t>(new cublasHandle_t);
+            auto handle = std::make_unique<cublasHandle_t>();
 
             cublasStatus_t status = cublasCreate(handle.get());
 
             if (status != CUBLAS_STATUS_SUCCESS)
             {
+                // Re-fetch memory info for error message (memory state might have changed)
                 cudaMemGetInfo(&free_mem, &total_mem);
                 TLLM_THROW(
                     "Failed to create cublas handle. "
@@ -273,7 +280,11 @@ std::shared_ptr<cublasHandle_t> getCublasHandle()
         },
         [](cublasHandle_t* handle)
         {
-            TLLM_CUDA_CHECK(cublasDestroy(*handle));
+            cublasStatus_t status = cublasDestroy(*handle);
+            if (status != CUBLAS_STATUS_SUCCESS)
+            {
+                TLLM_LOG_WARNING("Failed to destroy cublas handle. Status: %d", status);
+            }
             delete handle;
         });
     return creator();
@@ -284,21 +295,16 @@ std::shared_ptr<cublasLtHandle_t> getCublasLtHandle()
     static PerCudaCtxPerThreadSingletonCreator<cublasLtHandle_t> creator(
         []() -> auto
         {
-            size_t free_mem = 0, total_mem = 0;
-            cudaMemGetInfo(&free_mem, &total_mem);
-
-            CUcontext ctx;
-            cuCtxGetCurrent(&ctx);
-
-            TLLM_LOG_DEBUG("Creating cublasLt handle: Context=%p, Free Memory=%zu MB (%.1f%%), Total=%zu MB", ctx,
-                free_mem / (1024 * 1024), (float) free_mem / total_mem * 100.0, total_mem / (1024 * 1024));
+            CUcontext ctx = getCurrentCudaCtx();
+            auto [free_mem, total_mem] = logMemoryUsage("Creating cublasLt handle", ctx);
 
-            auto handle = std::unique_ptr<cublasLtHandle_t>(new cublasLtHandle_t);
+            auto handle = std::make_unique<cublasLtHandle_t>();
 
             cublasStatus_t status = cublasLtCreate(handle.get());
 
             if (status != CUBLAS_STATUS_SUCCESS)
             {
+                // Re-fetch memory info for error message (memory state might have changed)
                 cudaMemGetInfo(&free_mem, &total_mem);
                 TLLM_THROW(
                     "Failed to create cublasLt handle. "
@@ -312,7 +318,11 @@ std::shared_ptr<cublasLtHandle_t> getCublasLtHandle()
         },
         [](cublasLtHandle_t* handle)
        {
-            TLLM_CUDA_CHECK(cublasLtDestroy(*handle));
+            cublasStatus_t status = cublasLtDestroy(*handle);
+            if (status != CUBLAS_STATUS_SUCCESS)
+            {
+                TLLM_LOG_WARNING("Failed to destroy cublasLt handle. Status: %d", status);
+            }
             delete handle;
         });
     return creator();
tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 4 additions & 4 deletions
@@ -164,8 +164,8 @@ def test_fp8(self, fp8kv, attn_backend, torch_compile):
         if fp8kv:
             pytorch_config["kv_cache_config"] = KvCacheConfig(
                 dtype="fp8",
-                max_tokens=
-                100000,  # Limit tokens to prevent no room for cublas/cublasLt handles
+                free_gpu_memory_fraction=
+                0.8,  # Prevent cublas/cublasLt handle allocation memory insufficient errors
             )
         with LLM(
                 f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8",
@@ -202,8 +202,8 @@ def test_fp8_4gpus(self, tp_size, pp_size, fp8kv, attn_backend,
         if fp8kv:
             pytorch_config["kv_cache_config"] = KvCacheConfig(
                 dtype="fp8",
-                max_tokens=
-                100000,  # Limit tokens to prevent no room for cublas/cublasLt handles
+                free_gpu_memory_fraction=
+                0.8,  # Prevent cublas/cublasLt handle allocation memory insufficient errors
             )
         with LLM(
                 f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8",
