
Commit c6ac3d8

Update TensorRT-LLM backend (triton-inference-server#431)
* Update TensorRT-LLM backend
1 parent bf5e900 commit c6ac3d8


8 files changed: +75, -62 lines changed


README.md

Lines changed: 1 addition & 1 deletion
@@ -299,7 +299,7 @@ The following table shows the fields that may need to be modified before deployment:
| `normalize_log_probs` | Optional (default=`true`). Set to `false` to skip normalization of `output_log_probs` |
| `enable_chunked_context` | Optional (default=`false`). Set to `true` to enable context chunking. |
| `gpu_device_ids` | Optional (default=unspecified). Comma-separated list of GPU IDs to use for this model. If not provided, the model will use all visible GPUs. |
-| `decoding_mode` | Optional. Set to one of the following: `{top_k, top_p, top_k_top_p, beam_search}` to select the decoding mode. The `top_k` mode exclusively uses the Top-K algorithm for sampling; the `top_p` mode exclusively uses the Top-P algorithm. The `top_k_top_p` mode employs both Top-K and Top-P algorithms, depending on the runtime sampling params of the request. Note that the `top_k_top_p` option requires more memory and has a longer runtime than `top_k` or `top_p` individually; therefore, it should be used only when necessary. `beam_search` uses the beam search algorithm. If not specified, the default is `top_k_top_p` if `max_beam_width == 1`; otherwise, `beam_search` is used. |
+| `decoding_mode` | Optional. Set to one of the following: `{top_k, top_p, top_k_top_p, beam_search, medusa}` to select the decoding mode. The `top_k` mode exclusively uses the Top-K algorithm for sampling; the `top_p` mode exclusively uses the Top-P algorithm. The `top_k_top_p` mode employs both Top-K and Top-P algorithms, depending on the runtime sampling params of the request. Note that the `top_k_top_p` option requires more memory and has a longer runtime than `top_k` or `top_p` individually; therefore, it should be used only when necessary. `beam_search` uses the beam search algorithm. If not specified, the default is `top_k_top_p` if `max_beam_width == 1`; otherwise, `beam_search` is used. When a Medusa model is used, the `medusa` decoding mode should be set. However, TensorRT-LLM detects a loaded Medusa model and overwrites the decoding mode to `medusa` with a warning. |
| `medusa_choices` | Optional. Specifies the Medusa choices tree, e.g. in the format "{0, 0, 0}, {0, 1}". By default, the mc_sim_7b_63 choices are used. |
| `lora_cache_optimal_adapter_size` | Optional (default=8). Optimal adapter size used to size cache pages. Typically, optimally sized adapters will fit exactly into 1 cache page. |
| `lora_cache_max_adapter_size` | Optional (default=64). Used to set the minimum size of a cache page. Pages must be at least large enough to fit a single module, single layer adapter_size `maxAdapterSize` row of weights. |
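
The table rows above describe fields in the tensorrt_llm model's config.pbtxt. For illustration only, the sketch below shows how `decoding_mode` and `medusa_choices` could be supplied using the same parameters-block structure that appears in the config.pbtxt diff further down; the values are placeholders for a hypothetical Medusa deployment, not defaults.

# Illustrative sketch only: placeholder values for a hypothetical Medusa deployment.
parameters: {
  key: "decoding_mode"
  value: {
    string_value: "medusa"
  }
}
parameters: {
  key: "medusa_choices"
  value: {
    string_value: "{0, 0, 0}, {0, 1}"
  }
}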

all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt

Lines changed: 12 additions & 0 deletions
@@ -350,6 +350,18 @@ parameters: {
    string_value: "${kv_cache_free_gpu_mem_fraction}"
  }
}
+parameters: {
+  key: "kv_cache_host_memory_bytes"
+  value: {
+    string_value: "${kv_cache_host_memory_bytes}"
+  }
+}
+parameters: {
+  key: "kv_cache_onboard_blocks"
+  value: {
+    string_value: "${kv_cache_onboard_blocks}"
+  }
+}
parameters: {
  key: "enable_trt_overlap"
  value: {
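
The two entries added above are template placeholders substituted at deployment time. As a hedged example (the size is illustrative, not a recommendation), a filled-in configuration that gives the KV cache a host-memory budget of about 45 GB and keeps block onboarding enabled could look like this:

# Example filled values; the size below is a placeholder chosen for illustration.
parameters: {
  key: "kv_cache_host_memory_bytes"
  value: {
    string_value: "45000000000"
  }
}
parameters: {
  key: "kv_cache_onboard_blocks"
  value: {
    string_value: "true"
  }
}

As the model_instance_state.cc change below shows, omitting either parameter only logs a warning: the backend falls back to no host cache memory (0 bytes) and to onboarding blocks (true).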

ci/L0_backend_trtllm/custom_metrics_verification_tests.py

Lines changed: 2 additions & 3 deletions
@@ -46,8 +46,7 @@
    "inflight_batcher_specific_metric=micro_batch_id": "MicroBatch ID",
    "inflight_batcher_specific_metric=generation_requests":
        "Generation Requests",
-   "inflight_batcher_specific_metric=terminated_requests":
-       "Terminated Requests",
+   "inflight_batcher_specific_metric=paused_requests": "Paused Requests",
    "v1_specific_metric=total_context_tokens": "Total Context Tokens",
    "v1_specific_metric=total_generation_tokens": "Total Generation Tokens",
    "v1_specific_metric=empty_generation_slots": "Empty Generation Slots",
@@ -109,7 +108,7 @@ def _base_test(self, stats_file, metrics_file, is_v1):
                    int(metrics[metric_key]))
            else:
                dt_log = datetime.strptime(stats[metric_key],
-                                          "%m-%d-%Y %H:%M:%S")
+                                          '%m-%d-%Y %H:%M:%S.%f')
                dt_curl = datetime.utcfromtimestamp(
                    int(metrics[metric_key]) // 1000000)
                difference = dt_log - dt_curl

ci/L0_backend_trtllm/generate_engines.sh

Lines changed: 0 additions & 25 deletions
@@ -55,26 +55,6 @@ function build_tensorrt_engine_inflight_batcher {
    cd ${BASE_DIR}
}

-function install_trt_llm {
-    # Install CMake
-    bash /opt/tritonserver/tensorrtllm_backend/tensorrt_llm/docker/common/install_cmake.sh
-    export PATH="/usr/local/cmake/bin:${PATH}"
-
-    # PyTorch needs to be built from source for aarch64
-    ARCH="$(uname -i)"
-    if [ "${ARCH}" = "aarch64" ]; then TORCH_INSTALL_TYPE="src_non_cxx11_abi"; \
-    else TORCH_INSTALL_TYPE="pypi"; fi && \
-    (cd /opt/tritonserver/tensorrtllm_backend/tensorrt_llm &&
-        bash docker/common/install_pytorch.sh $TORCH_INSTALL_TYPE &&
-        python3 ./scripts/build_wheel.py --trt_root="${TRT_ROOT}" &&
-        pip3 install ./build/tensorrt_llm*.whl)
-}
-
-# Install TRT LLM
-install_trt_llm
-
-# Install dependencies
-pip3 install -r ${TRTLLM_DIR}/requirements-dev.txt --extra-index-url https://pypi.ngc.nvidia.com
# Downgrade to legacy version to accommodate Triton CI runners
pip install pynvml==11.4.0

@@ -97,8 +77,3 @@ mv ${GPT_DIR}/inflight_*_gpu/ engines/
# Move the tokenizer into the CI directory
mkdir tokenizer
mv ${GPT_DIR}/gpt2/* tokenizer/
-
-# Now that the engines are generated, we should remove the
-# tensorrt_llm module to ensure the C++ backend tests are
-# not using it
-pip3 uninstall -y torch tensorrt_llm

ci/L0_backend_trtllm/test.sh

Lines changed: 33 additions & 31 deletions
@@ -306,6 +306,39 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do
    kill_server
    wait_for_server_terminated ${SERVER_PID[@]}

+    # World size must be 1 when using multi-model
+    if [ "${NUM_GPU}" == "0" ]; then
+        # Multi-model
+        SERVER_LOG="./${NUM_GPU}gpu_multi_model.log"
+        run_server "${SERVER_ARGS} --multi-model"
+        wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]}
+        if [ "$WAIT_RET" != "0" ]; then
+            # Cleanup
+            kill $SERVER_PID > /dev/null 2>&1 || true
+            echo -e "\n***\n*** Failed to start $SERVER\n***"
+            cat $SERVER_LOG
+            exit 1
+        fi
+        set -e
+
+        python3 ${TOOLS_DIR}/inflight_batcher_llm/end_to_end_test.py \
+            --max-input-len=500 \
+            --dataset=${DATASET}
+
+        if [ $? -ne 0 ]; then
+            cat $SERVER_LOG
+            echo -e "\n***\n*** Error executing inflight batching end-to-end test with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
+            kill_server
+            wait_for_server_terminated ${SERVER_PID[@]}
+            RET=1
+        fi
+        set +e
+
+        curl localhost:8002/metrics -o ${NUM_GPU}gpu_multi_model_metrics.out
+        kill_server
+        wait_for_server_terminated ${SERVER_PID[@]}
+    fi
+
    # inflight batching ON
    # streaming ON
    SERVER_LOG="./${NUM_GPU}gpu_IFB_streaming_server.log"

@@ -338,37 +371,6 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do
    kill_server
    wait_for_server_terminated ${SERVER_PID[@]}

-    # Multi-model
-    SERVER_LOG="./${NUM_GPU}gpu_multi_model.log"
-    run_server "${SERVER_ARGS} --multi-model"
-    wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]}
-    if [ "$WAIT_RET" != "0" ]; then
-        # Cleanup
-        kill $SERVER_PID > /dev/null 2>&1 || true
-        echo -e "\n***\n*** Failed to start $SERVER\n***"
-        cat $SERVER_LOG
-        exit 1
-    fi
-    set -e
-
-    set -e
-    python3 ${TOOLS_DIR}/inflight_batcher_llm/end_to_end_test.py \
-        --max-input-len=500 \
-        --dataset=${DATASET}
-
-    if [ $? -ne 0 ]; then
-        cat $SERVER_LOG
-        echo -e "\n***\n*** Error executing inflight batching end-to-end test with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
-        kill_server
-        wait_for_server_terminated ${SERVER_PID[@]}
-        RET=1
-    fi
-    set +e
-
-    curl localhost:8002/metrics -o ${NUM_GPU}gpu_IFB_no_stream_metrics.out
-    kill_server
-    wait_for_server_terminated ${SERVER_PID[@]}
-
done

# Verify TRT LLM statistics are being properly reported as custom metrics

inflight_batcher_llm/src/model_instance_state.cc

Lines changed: 25 additions & 0 deletions
@@ -210,6 +210,25 @@ ModelInstanceState::ModelInstanceState(
            "kv_cache_free_gpu_mem_fraction is not specified, will use default value of 0.9 or "
            "max_tokens_in_paged_kv_cache");
    }
+    std::optional<size_t> kvCacheHostCacheSize = std::nullopt;
+    try
+    {
+        kvCacheHostCacheSize = model_state_->GetParameter<size_t>("kv_cache_host_memory_bytes");
+    }
+    catch (std::exception const& e)
+    {
+        TLLM_LOG_WARNING("kv_cache_host_memory_bytes not set, defaulting to 0");
+    }
+    bool kvCacheOnboardBlocks = true;
+    try
+    {
+        kvCacheOnboardBlocks = model_state_->GetParameter<bool>("kv_cache_onboard_blocks");
+    }
+    catch (std::exception const& e)
+    {
+        // If parameter is not specified, just ignore
+        TLLM_LOG_WARNING("kv_cache_onboard_blocks not set, defaulting to true");
+    }

    bool enableTrtOverlap = false;
    try

@@ -288,6 +307,10 @@ ModelInstanceState::ModelInstanceState(
        {
            decodingMode = DecodingMode::BeamSearch();
        }
+        else if (decodingModeStr == "medusa")
+        {
+            decodingMode = DecodingMode::Medusa();
+        }
        else
        {
            throw std::runtime_error("");

@@ -368,6 +391,8 @@ ModelInstanceState::ModelInstanceState(
    optionalParams.kvCacheConfig.freeGpuMemoryFraction = kvCacheFreeGpuMemFraction;
    optionalParams.kvCacheConfig.maxAttentionWindow = maxAttentionWindow;
    optionalParams.kvCacheConfig.enableBlockReuse = enableKVCacheReuse;
+    optionalParams.kvCacheConfig.hostCacheSize = kvCacheHostCacheSize;
+    optionalParams.kvCacheConfig.onboardBlocks = kvCacheOnboardBlocks;
    optionalParams.enableTrtOverlap = enableTrtOverlap;
    optionalParams.normalizeLogProbs = normalizeLogProbs;
    optionalParams.enableChunkedContext = enableChunkedContext;

tensorrt_llm

Submodule tensorrt_llm updated 319 files

tools/version.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-d0c1a4fdcc47207417b7841e343b3c89ee512770
+b95f14b34c5b22194c87be2f30b07269ebb2f328
