File tree Expand file tree Collapse file tree 6 files changed +17
-6
lines changed Expand file tree Collapse file tree 6 files changed +17
-6
lines changed Original file line number Diff line number Diff line change @@ -205,8 +205,11 @@ and postprocessing models together.
205
205
This model can also be used to chain the preprocessing,
206
206
tensorrt_llm and postprocessing models together.
207
207
208
- The BLS model has an optional
209
- parameter `accumulate_tokens` which can be used in streaming mode to call the
208
+ When using the BLS model instead of the ensemble, you should set the number of model instances to
209
+ the maximum batch size supported by the TRT engine to allow concurrent request execution. This
210
+ can be done by modifying the `count` value in the `instance_group` section of the BLS model `config.pbtxt`.
211
+
212
+ The BLS model has an optional parameter `accumulate_tokens` which can be used in streaming mode to call the
210
213
postprocessing model with all accumulated tokens, instead of only one token.
211
214
This might be necessary for certain tokenizers.
212
215
Original file line number Diff line number Diff line change @@ -45,13 +45,19 @@ function build_tensorrt_engine_inflight_batcher {
45
45
local OUTPUT_DIR=inflight_${NUM_GPUS} _gpu/
46
46
# ./c-model/gpt2/ must already exist (it will if build_base_model
47
47
# has already been run)
48
+ extra_args=" "
49
+ # If no nvlink, disable custom all reduce.
50
+ if [ " $( nvidia-smi nvlink -s | wc -l) " -eq " 0" ] || [ $( nvidia-smi nvlink --status | grep inActive | wc -l) -ge 1 ]; then
51
+ extra_args+=" --use_custom_all_reduce=disable"
52
+ fi
48
53
trtllm-build --checkpoint_dir " ${GPT_MODEL_DIR} " \
49
54
--gpt_attention_plugin float16 \
50
55
--remove_input_padding enable \
51
56
--paged_kv_cache enable \
52
57
--gemm_plugin float16 \
53
58
--workers " ${NUM_GPUS} " \
54
- --output_dir " ${OUTPUT_DIR} "
59
+ --output_dir " ${OUTPUT_DIR} " \
60
+ ${extra_args}
55
61
cd ${BASE_DIR}
56
62
}
57
63
Original file line number Diff line number Diff line change @@ -186,6 +186,8 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do
186
186
exit 1
187
187
fi
188
188
189
+ wait_for_server_terminated ${SERVER_PID[@]}
190
+
189
191
# inflight batching OFF (V1)
190
192
# streaming OFF
191
193
SERVER_LOG=" ./${NUM_GPU} gpu_v1_no_streaming_server.log"
Original file line number Diff line number Diff line change @@ -34,7 +34,7 @@ include(${TRTLLM_DIR}/cpp/cmake/modules/find_library_create_target.cmake)
34
34
35
35
project (tritontensorrtllmbackend LANGUAGES C CXX )
36
36
37
- add_compile_options ("-D_GLIBCXX_USE_CXX11_ABI=0" )
37
+ add_compile_options ("-D_GLIBCXX_USE_CXX11_ABI=0" "-DENABLE_MULTI_DEVICE=1" )
38
38
#
39
39
# Options
40
40
#
Original file line number Diff line number Diff line change 1
- b95f14b34c5b22194c87be2f30b07269ebb2f328
1
+ 656d96117a2deee3e26d444363b4d1aa1799cca2
You can’t perform that action at this time.
0 commit comments