
Commit bbf1175

Fix the cuBLAS-handle insufficient-memory bug in the multi-GPU case and reduce max_tokens in the KV cache config.
Signed-off-by: Wangshanshan <[email protected]>
Parent: 50824ed

File tree: 2 files changed (+5, -4 lines)

docker/Makefile (1 addition, 2 deletions)

@@ -121,7 +121,7 @@ endef
 	@echo "Pulling docker image: $(IMAGE_WITH_TAG)"
 	docker pull $(IMAGE_WITH_TAG)
 
-DOCKER_RUN_OPTS ?= --rm -it --ipc=host --ulimit stack=67108864 $(if $(filter 0,$(IS_ROOTLESS)),--ulimit memlock=-1) --privileged
+DOCKER_RUN_OPTS ?= --rm -it --ipc=host --ulimit stack=67108864 $(if $(filter 0,$(IS_ROOTLESS)),--ulimit memlock=-1)
 DOCKER_RUN_ARGS ?=
 # Check if NVIDIA_VISIBLE_DEVICES is set and not empty
 NVIDIA_VISIBLE_DEVICES_VAL = $(shell echo $$NVIDIA_VISIBLE_DEVICES)
@@ -156,7 +156,6 @@ endif
 	docker run $(DOCKER_RUN_OPTS) $(DOCKER_RUN_ARGS) \
 		$(GPU_OPTS) \
 		--volume $(SOURCE_DIR):$(CODE_DIR) \
-		--volume /home/scratch.trt_llm_data/:/scratch.trt_llm_data/ \
 		$(EXTRA_VOLUMES) \
 		$(if $(and $(filter 1,$(LOCAL_USER)),$(shell [ -w "$(USER_CACHE_DIR)" ] && echo 1)),--volume $(USER_CACHE_DIR):/home/$(USER_NAME)/.cache:rw) \
 		--env "CCACHE_DIR=$(CCACHE_DIR)" \

tests/integration/defs/accuracy/test_llm_api_pytorch.py (4 additions, 2 deletions)

@@ -164,7 +164,8 @@ def test_fp8(self, fp8kv, attn_backend, torch_compile):
         if fp8kv:
             pytorch_config["kv_cache_config"] = KvCacheConfig(
                 dtype="fp8",
-                # max_tokens=100000, # Limit tokens to prevent no room for CUBLAS handles
+                max_tokens=
+                100000,  # Limit tokens to prevent no room for cublas/cublasLt handles
             )
         with LLM(
                 f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8",
@@ -201,7 +202,8 @@ def test_fp8_4gpus(self, tp_size, pp_size, fp8kv, attn_backend,
         if fp8kv:
             pytorch_config["kv_cache_config"] = KvCacheConfig(
                 dtype="fp8",
-                # max_tokens=100000, # Limit tokens to prevent no room for CUBLAS handles
+                max_tokens=
+                100000,  # Limit tokens to prevent no room for cublas/cublasLt handles
             )
         with LLM(
                 f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8",
