
Commit f87ad6b

Update TensorRT-LLM backend (triton-inference-server#439)

Parent: c6ac3d8
File tree

6 files changed: +17 -6 lines

README.md

Lines changed: 5 additions & 2 deletions

@@ -205,8 +205,11 @@ and postprocessing models together.
 This model can also be used to chain the preprocessing,
 tensorrt_llm and postprocessing models together.
 
-The BLS model has an optional
-parameter `accumulate_tokens` which can be used in streaming mode to call the
+When using the BLS model instead of the ensemble, you should set the number of model instances to
+the maximum batch size supported by the TRT engine to allow concurrent request execution. This
+can be done by modifying the `count` value in the `instance_group` section of the BLS model `config.pbtxt`.
+
+The BLS model has an optional parameter `accumulate_tokens` which can be used in streaming mode to call the
 postprocessing model with all accumulated tokens, instead of only one token.
 This might be necessary for certain tokenizers.

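Since the new README text directs users to edit the `count` value in the BLS model's `config.pbtxt`, a minimal sketch of what that `instance_group` section might look like follows (the count of 64 is illustrative and stands in for your engine's maximum batch size; `KIND_CPU` is assumed because the BLS model is a Python model that runs on CPU instances):

instance_group [
  {
    count: 64
    kind: KIND_CPU
  }
]
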
ci/L0_backend_trtllm/generate_engines.sh

Lines changed: 7 additions & 1 deletion

@@ -45,13 +45,19 @@ function build_tensorrt_engine_inflight_batcher {
     local OUTPUT_DIR=inflight_${NUM_GPUS}_gpu/
     # ./c-model/gpt2/ must already exist (it will if build_base_model
     # has already been run)
+    extra_args=""
+    # If no nvlink, disable custom all reduce.
+    if [ "$(nvidia-smi nvlink -s | wc -l)" -eq "0" ] || [ $(nvidia-smi nvlink --status | grep inActive | wc -l) -ge 1 ]; then
+        extra_args+="--use_custom_all_reduce=disable"
+    fi
     trtllm-build --checkpoint_dir "${GPT_MODEL_DIR}" \
         --gpt_attention_plugin float16 \
         --remove_input_padding enable \
         --paged_kv_cache enable \
         --gemm_plugin float16 \
         --workers "${NUM_GPUS}" \
-        --output_dir "${OUTPUT_DIR}"
+        --output_dir "${OUTPUT_DIR}" \
+        ${extra_args}
     cd ${BASE_DIR}
 }

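The new check treats two cases as "no usable NVLink": `nvidia-smi nvlink -s` printing nothing at all, and any link being reported as inActive. A hedged sketch of the same detection factored into a reusable bash helper (assuming `nvidia-smi` is on PATH and that "inActive" is the status string printed for unconnected links):

# Returns success only when NVLink is present and every link is active.
nvlink_is_usable() {
    local status
    status="$(nvidia-smi nvlink --status 2>/dev/null)"
    # Empty output means the GPU reports no NVLink connections at all.
    [ -n "$status" ] || return 1
    # Any inActive link also rules out the custom all-reduce path.
    ! grep -q "inActive" <<< "$status"
}

extra_args=""
nvlink_is_usable || extra_args+="--use_custom_all_reduce=disable"

Disabling custom all-reduce in these cases is the conservative choice: the custom kernel relies on fast peer-to-peer links, so falling back to the default path avoids building an engine that assumes interconnect bandwidth the machine does not have.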
ci/L0_backend_trtllm/test.sh

Lines changed: 2 additions & 0 deletions

@@ -186,6 +186,8 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do
         exit 1
     fi
 
+    wait_for_server_terminated ${SERVER_PID[@]}
+
     # inflight batching OFF (V1)
     # streaming OFF
     SERVER_LOG="./${NUM_GPU}gpu_v1_no_streaming_server.log"

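This added call makes the loop block until the previous server instance has fully exited before launching the next configuration, avoiding port and GPU-memory contention between runs. The helper's actual implementation is not part of this diff; a purely hypothetical sketch of what such a function might do (polling each PID until the process disappears) could look like:

# Hypothetical helper: block until every given PID has exited.
# The repository's real wait_for_server_terminated may differ.
wait_for_server_terminated() {
    local pid
    for pid in "$@"; do
        # kill -0 only probes for process existence; it sends no signal.
        while kill -0 "$pid" 2>/dev/null; do
            sleep 1
        done
    done
}
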
inflight_batcher_llm/CMakeLists.txt

Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@ include(${TRTLLM_DIR}/cpp/cmake/modules/find_library_create_target.cmake)
 
 project(tritontensorrtllmbackend LANGUAGES C CXX)
 
-add_compile_options("-D_GLIBCXX_USE_CXX11_ABI=0")
+add_compile_options("-D_GLIBCXX_USE_CXX11_ABI=0" "-DENABLE_MULTI_DEVICE=1")
 #
 # Options
 #

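Because `add_compile_options` appears before any targets are defined, both defines apply to every compilation unit in the project, so code guarded by `ENABLE_MULTI_DEVICE` is compiled in everywhere. One way to confirm the flag actually reaches the compiler, sketched under the assumption that the project configures cleanly from the repository root (paths are illustrative):

# Export compile commands during configuration, then look for the new define:
cmake -S inflight_batcher_llm -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
grep -m1 "ENABLE_MULTI_DEVICE" build/compile_commands.json
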
tensorrt_llm

Submodule tensorrt_llm updated 200 files

tools/version.txt

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-b95f14b34c5b22194c87be2f30b07269ebb2f328
+656d96117a2deee3e26d444363b4d1aa1799cca2
