
Commit f87ad6b

Update TensorRT-LLM backend (triton-inference-server#439)

Parent: c6ac3d8
File tree

6 files changed: +17 -6 lines

README.md

Lines changed: 5 additions & 2 deletions

@@ -205,8 +205,11 @@ and postprocessing models together.
 This model can also be used to chain the preprocessing,
 tensorrt_llm and postprocessing models together.
 
-The BLS model has an optional
-parameter `accumulate_tokens` which can be used in streaming mode to call the
+When using the BLS model instead of the ensemble, you should set the number of model instances to
+the maximum batch size supported by the TRT engine to allow concurrent request execution. This
+can be done by modifying the `count` value in the `instance_group` section of the BLS model `config.pbtxt`.
+
+The BLS model has an optional parameter `accumulate_tokens` which can be used in streaming mode to call the
 postprocessing model with all accumulated tokens, instead of only one token.
 This might be necessary for certain tokenizers.

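Since the new README text directs users to edit the `count` value in the BLS model's `config.pbtxt`, a minimal sketch of what that `instance_group` section might look like follows (the count of 64 is illustrative and stands in for your engine's maximum batch size; `KIND_CPU` is assumed because the BLS model is a Python model that runs on CPU instances):

instance_group [
  {
    count: 64
    kind: KIND_CPU
  }
]
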
ci/L0_backend_trtllm/generate_engines.sh

Lines changed: 7 additions & 1 deletion

@@ -45,13 +45,19 @@ function build_tensorrt_engine_inflight_batcher {
     local OUTPUT_DIR=inflight_${NUM_GPUS}_gpu/
     # ./c-model/gpt2/ must already exist (it will if build_base_model
     # has already been run)
+    extra_args=""
+    # If no nvlink, disable custom all reduce.
+    if [ "$(nvidia-smi nvlink -s | wc -l)" -eq "0" ] || [ $(nvidia-smi nvlink --status | grep inActive | wc -l) -ge 1 ]; then
+        extra_args+="--use_custom_all_reduce=disable"
+    fi
     trtllm-build --checkpoint_dir "${GPT_MODEL_DIR}" \
         --gpt_attention_plugin float16 \
         --remove_input_padding enable \
         --paged_kv_cache enable \
         --gemm_plugin float16 \
         --workers "${NUM_GPUS}" \
-        --output_dir "${OUTPUT_DIR}"
+        --output_dir "${OUTPUT_DIR}" \
+        ${extra_args}
     cd ${BASE_DIR}
 }

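The new check treats two cases as "no usable NVLink": `nvidia-smi nvlink -s` printing nothing at all, and any link being reported as inActive. A hedged sketch of the same detection factored into a reusable bash helper (assuming `nvidia-smi` is on PATH and that "inActive" is the status string printed for unconnected links):

# Returns success only when NVLink is present and every link is active.
nvlink_is_usable() {
    local status
    status="$(nvidia-smi nvlink --status 2>/dev/null)"
    # Empty output means the GPU reports no NVLink connections at all.
    [ -n "$status" ] || return 1
    # Any inActive link also rules out the custom all-reduce path.
    ! grep -q "inActive" <<< "$status"
}

extra_args=""
nvlink_is_usable || extra_args+="--use_custom_all_reduce=disable"

Disabling custom all-reduce in these cases is the conservative choice: the custom kernel relies on fast peer-to-peer links, so falling back to the default path avoids building an engine that assumes interconnect bandwidth the machine does not have.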
ci/L0_backend_trtllm/test.sh

Lines changed: 2 additions & 0 deletions

@@ -186,6 +186,8 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do
         exit 1
     fi
 
+    wait_for_server_terminated ${SERVER_PID[@]}
+
     # inflight batching OFF (V1)
     # streaming OFF
     SERVER_LOG="./${NUM_GPU}gpu_v1_no_streaming_server.log"

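This added call makes the loop block until the previous server instance has fully exited before launching the next configuration, avoiding port and GPU-memory contention between runs. The helper's actual implementation is not part of this diff; a purely hypothetical sketch of what such a function might do (polling each PID until the process disappears) could look like:

# Hypothetical helper: block until every given PID has exited.
# The repository's real wait_for_server_terminated may differ.
wait_for_server_terminated() {
    local pid
    for pid in "$@"; do
        # kill -0 only probes for process existence; it sends no signal.
        while kill -0 "$pid" 2>/dev/null; do
            sleep 1
        done
    done
}
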
inflight_batcher_llm/CMakeLists.txt

Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@ include(${TRTLLM_DIR}/cpp/cmake/modules/find_library_create_target.cmake)
 
 project(tritontensorrtllmbackend LANGUAGES C CXX)
 
-add_compile_options("-D_GLIBCXX_USE_CXX11_ABI=0")
+add_compile_options("-D_GLIBCXX_USE_CXX11_ABI=0" "-DENABLE_MULTI_DEVICE=1")
 #
 # Options
 #

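Because `add_compile_options` appears before any targets are defined, both defines apply to every compilation unit in the project, so code guarded by `ENABLE_MULTI_DEVICE` is compiled in everywhere. One way to confirm the flag actually reaches the compiler, sketched under the assumption that the project configures cleanly from the repository root (paths are illustrative):

# Export compile commands during configuration, then look for the new define:
cmake -S inflight_batcher_llm -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
grep -m1 "ENABLE_MULTI_DEVICE" build/compile_commands.json
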
tensorrt_llm

Submodule tensorrt_llm updated 200 files

tools/version.txt

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-b95f14b34c5b22194c87be2f30b07269ebb2f328
+656d96117a2deee3e26d444363b4d1aa1799cca2
