
Commit bbf1175

Fix the cuBLAS-handle insufficient-memory bug in the multi-GPU case and reduce max_tokens in the KV cache config.
Signed-off-by: Wangshanshan <[email protected]>
Parent: 50824ed

File tree: 2 files changed (+5, -4 lines)

docker/Makefile (1 addition, 2 deletions)

@@ -121,7 +121,7 @@ endef
 	@echo "Pulling docker image: $(IMAGE_WITH_TAG)"
 	docker pull $(IMAGE_WITH_TAG)
 
-DOCKER_RUN_OPTS ?= --rm -it --ipc=host --ulimit stack=67108864 $(if $(filter 0,$(IS_ROOTLESS)),--ulimit memlock=-1) --privileged
+DOCKER_RUN_OPTS ?= --rm -it --ipc=host --ulimit stack=67108864 $(if $(filter 0,$(IS_ROOTLESS)),--ulimit memlock=-1)
 DOCKER_RUN_ARGS ?=
 # Check if NVIDIA_VISIBLE_DEVICES is set and not empty
 NVIDIA_VISIBLE_DEVICES_VAL = $(shell echo $$NVIDIA_VISIBLE_DEVICES)
@@ -156,7 +156,6 @@ endif
 	docker run $(DOCKER_RUN_OPTS) $(DOCKER_RUN_ARGS) \
 		$(GPU_OPTS) \
 		--volume $(SOURCE_DIR):$(CODE_DIR) \
-		--volume /home/scratch.trt_llm_data/:/scratch.trt_llm_data/ \
 		$(EXTRA_VOLUMES) \
 		$(if $(and $(filter 1,$(LOCAL_USER)),$(shell [ -w "$(USER_CACHE_DIR)" ] && echo 1)),--volume $(USER_CACHE_DIR):/home/$(USER_NAME)/.cache:rw) \
 		--env "CCACHE_DIR=$(CCACHE_DIR)" \

tests/integration/defs/accuracy/test_llm_api_pytorch.py (4 additions, 2 deletions)

@@ -164,7 +164,8 @@ def test_fp8(self, fp8kv, attn_backend, torch_compile):
         if fp8kv:
             pytorch_config["kv_cache_config"] = KvCacheConfig(
                 dtype="fp8",
-                # max_tokens=100000, # Limit tokens to prevent no room for CUBLAS handles
+                max_tokens=
+                100000,  # Limit tokens to prevent no room for cublas/cublasLt handles
             )
         with LLM(
                 f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8",
@@ -201,7 +202,8 @@ def test_fp8_4gpus(self, tp_size, pp_size, fp8kv, attn_backend,
         if fp8kv:
             pytorch_config["kv_cache_config"] = KvCacheConfig(
                 dtype="fp8",
-                # max_tokens=100000, # Limit tokens to prevent no room for CUBLAS handles
+                max_tokens=
+                100000,  # Limit tokens to prevent no room for cublas/cublasLt handles
             )
         with LLM(
                 f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8",
