
Commit e239adc: Update TensorRT-LLM backend (triton-inference-server#444)
1 parent: f87ad6b

23 files changed: +1291, -2281 lines

README.md

Lines changed: 9 additions & 6 deletions
@@ -70,10 +70,10 @@ The below commands will build the same Triton TRT-LLM container as the one on th
 # Prepare the TRT-LLM base image using the dockerfile from tensorrtllm_backend.
 cd tensorrtllm_backend
 # Specify the build args for the dockerfile.
-BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.01-py3-min
-TRT_VERSION=9.2.0.5
-TRT_URL_x86=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/9.2.0/tensorrt-9.2.0.5.linux.x86_64-gnu.cuda-12.2.tar.gz
-TRT_URL_ARM=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/9.2.0/tensorrt-9.2.0.5.Ubuntu-22.04.aarch64-gnu.cuda-12.2.tar.gz
+BASE_IMAGE=nvcr.io/nvidia/pytorch:24.02-py3
+TRT_VERSION=9.3.0.1
+TRT_URL_x86=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/9.3.0/tensorrt-9.3.0.1.linux.x86_64-gnu.cuda-12.2.tar.gz
+TRT_URL_ARM=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/9.3.0/tensorrt-9.3.0.1.ubuntu-22.04.aarch64-gnu.cuda-12.2.tar.gz

 docker build -t trtllm_base \
     --build-arg BASE_IMAGE="${BASE_IMAGE}" \

@@ -86,8 +86,8 @@ docker build -t trtllm_base \
 # endpoints can be removed if not needed. Please refer to the support matrix to
 # see the aligned versions: https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
 TRTLLM_BASE_IMAGE=trtllm_base
-TENSORRTLLM_BACKEND_REPO_TAG=v0.7.2
-PYTHON_BACKEND_REPO_TAG=r24.01
+TENSORRTLLM_BACKEND_REPO_TAG=rel
+PYTHON_BACKEND_REPO_TAG=r24.04

 cd server
 ./build.py -v --no-container-interactive --enable-logging --enable-stats --enable-tracing \

@@ -299,6 +299,9 @@ The following table shows the fields that may need to be modified before deployment:
 | `kv_cache_free_gpu_mem_fraction` | Optional (default=0.9). Set to a number between 0 and 1 to indicate the maximum fraction of GPU memory (after loading the model) that may be used for KV cache. |
 | `enable_trt_overlap` | Optional (default=`false`). Set to `true` to partition available requests into 2 'microbatches' that can be run concurrently to hide exposed CPU runtime. |
 | `exclude_input_in_output` | Optional (default=`false`). Set to `true` to return only completion tokens in a response. Set to `false` to return the prompt tokens concatenated with the generated tokens. |
+| `cancellation_check_period_ms` | Optional (default=100). How long, in milliseconds, the cancellation-check thread sleeps before the next check. The thread checks whether any currently active requests have been cancelled through Triton and prevents their further execution. |
+| `iter_stats_max_iterations` | Optional (default=executor::kDefaultIterStatsMaxIterations). The number of iteration stats to keep. |
+| `request_stats_max_iterations` | Optional (default=executor::kDefaultRequestStatsMaxIterations). The number of request stats to keep. |
 | `normalize_log_probs` | Optional (default=`true`). Set to `false` to skip normalization of `output_log_probs`. |
 | `enable_chunked_context` | Optional (default=`false`). Set to `true` to enable context chunking. |
 | `gpu_device_ids` | Optional (default=unspecified). Comma-separated list of GPU IDs to use for this model. If not provided, the model will use all visible GPUs. |
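
The three new stats/cancellation options surface as ${...} placeholders in the tensorrt_llm model's config.pbtxt (see the next file in this diff). A minimal sketch of setting them, assuming the tools/fill_template.py helper that ships in this repo and a hypothetical triton_model_repo layout; the values are illustrative only:

# Sketch: substitute the new placeholders in the model config before
# launching tritonserver. Path and values are hypothetical.
python3 tools/fill_template.py -i triton_model_repo/tensorrt_llm/config.pbtxt \
    "cancellation_check_period_ms:100,iter_stats_max_iterations:1000,request_stats_max_iterations:1000"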

all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt

Lines changed: 68 additions & 2 deletions
@@ -69,6 +69,13 @@ input [
     optional: true
     allow_ragged_batch: true
   },
+  {
+    name: "draft_acceptance_threshold"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
   {
     name: "end_id"
     data_type: TYPE_INT32

@@ -132,13 +139,41 @@ input [
     reshape: { shape: [ ] }
     optional: true
   },
+  {
+    name: "runtime_top_p_min"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "runtime_top_p_decay"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "runtime_top_p_reset_ids"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
   {
     name: "len_penalty"
     data_type: TYPE_FP32
     dims: [ 1 ]
     reshape: { shape: [ ] }
     optional: true
   },
+  {
+    name: "early_stopping"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
   {
     name: "repetition_penalty"
     data_type: TYPE_FP32

@@ -153,6 +188,13 @@ input [
     reshape: { shape: [ ] }
     optional: true
   },
+  {
+    name: "beam_search_diversity_rate"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
   {
     name: "presence_penalty"
     data_type: TYPE_FP32

@@ -338,6 +380,12 @@ parameters: {
     string_value: "${max_attention_window_size}"
   }
 }
+parameters: {
+  key: "sink_token_length"
+  value: {
+    string_value: "${sink_token_length}"
+  }
+}
 parameters: {
   key: "batch_scheduler_policy"
   value: {

@@ -374,6 +422,24 @@ parameters: {
     string_value: "${exclude_input_in_output}"
   }
 }
+parameters: {
+  key: "cancellation_check_period_ms"
+  value: {
+    string_value: "${cancellation_check_period_ms}"
+  }
+}
+parameters: {
+  key: "iter_stats_max_iterations"
+  value: {
+    string_value: "${iter_stats_max_iterations}"
+  }
+}
+parameters: {
+  key: "request_stats_max_iterations"
+  value: {
+    string_value: "${request_stats_max_iterations}"
+  }
+}
 parameters: {
   key: "enable_kv_cache_reuse"
   value: {

@@ -429,9 +495,9 @@ parameters: {
   }
 }
 parameters: {
-  key: "worker_path"
+  key: "executor_worker_path"
   value: {
-    string_value: "/opt/tritonserver/backends/tensorrtllm/triton_tensorrtllm_worker"
+    string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker"
   }
 }
 parameters: {
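
The new optional inputs above (draft_acceptance_threshold, runtime_top_p_min, runtime_top_p_decay, runtime_top_p_reset_ids, early_stopping, beam_search_diversity_rate) can be supplied per request like any other optional tensor. A sketch against Triton's HTTP inference API, assuming the model is deployed non-decoupled under the name tensorrt_llm on localhost:8000; the token IDs and parameter values are hypothetical:

# Sketch: pass two of the new optional sampling inputs in a request.
# Plain HTTP inference requires decoupled mode to be disabled;
# token IDs and values below are placeholders.
curl -s -X POST localhost:8000/v2/models/tensorrt_llm/infer -d '{
  "inputs": [
    {"name": "input_ids", "datatype": "INT32", "shape": [1, 4], "data": [1, 9038, 2501, 263]},
    {"name": "input_lengths", "datatype": "INT32", "shape": [1, 1], "data": [4]},
    {"name": "request_output_len", "datatype": "INT32", "shape": [1, 1], "data": [32]},
    {"name": "runtime_top_p_min", "datatype": "FP32", "shape": [1, 1], "data": [0.5]},
    {"name": "early_stopping", "datatype": "BOOL", "shape": [1, 1], "data": [true]}
  ]
}'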

dockerfile/Dockerfile.trt_llm_backend

Lines changed: 5 additions & 3 deletions
@@ -1,5 +1,5 @@
 ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver
-ARG BASE_TAG=24.02-py3
+ARG BASE_TAG=24.03-py3

 FROM ${BASE_IMAGE}:${BASE_TAG} as base

@@ -37,7 +37,9 @@ RUN bash /tmp/install_mpi4py.sh && rm /tmp/install_mpi4py.sh
 # Use "pypi" (default) for x86_64 arch and "src_non_cxx11_abi" for aarch64 arch
 ARG TORCH_INSTALL_TYPE="pypi"
 COPY tensorrt_llm/docker/common/install_pytorch.sh install_pytorch.sh
-RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh
+# Apply a PyTorch patch to support compiling from source with CUDA 12.4.
+COPY tensorrt_llm/docker/common/pytorch_pr_116072.patch /tmp/pytorch_pr_116072.patch
+RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh /tmp/pytorch_pr_116072.patch

 FROM dev as trt_llm_builder

@@ -64,4 +66,4 @@ RUN mkdir /opt/tritonserver/backends/tensorrtllm
 ENV LD_LIBRARY_PATH=/opt/tritonserver/backends/tensorrtllm:${LD_LIBRARY_PATH}
 COPY --from=trt_llm_backend_builder /app/inflight_batcher_llm/build/libtriton_tensorrtllm.so /opt/tritonserver/backends/tensorrtllm
 COPY --from=trt_llm_backend_builder /app/inflight_batcher_llm/build/libtriton_tensorrtllm_common.so /opt/tritonserver/backends/tensorrtllm
-COPY --from=trt_llm_backend_builder /app/inflight_batcher_llm/build/triton_tensorrtllm_worker /opt/tritonserver/backends/tensorrtllm
+COPY --from=trt_llm_backend_builder /app/inflight_batcher_llm/build/trtllmExecutorWorker /opt/tritonserver/backends/tensorrtllm
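
Putting the Dockerfile changes to use: a sketch of the container build, following the pattern the repo README uses. Run from a tensorrtllm_backend checkout with the tensorrt_llm submodule initialized; the image tag is a placeholder:

# Sketch: build the backend image from the updated Dockerfile.
DOCKER_BUILDKIT=1 docker build \
    -t triton_trt_llm \
    -f dockerfile/Dockerfile.trt_llm_backend .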

inflight_batcher_llm/CMakeLists.txt

Lines changed: 21 additions & 21 deletions
@@ -102,20 +102,14 @@ FetchContent_MakeAvailable(repo-common repo-core repo-backend)
 configure_file(src/libtriton_tensorrtllm.ldscript
                libtriton_tensorrtllm.ldscript COPYONLY)

-set(COMMON_SRCS
-    src/work_item.cc src/work_items_queue.cc src/model_instance_state.cc
-    src/model_state.cc src/utils.cc src/inference_answer.cc)
+set(COMMON_SRCS src/model_instance_state.cc src/model_state.cc src/utils.cc)

 add_library(triton-tensorrt-llm-common SHARED ${COMMON_SRCS})

-set(BACKEND_SRCS src/libtensorrtllm.cc src/orchestrator.cc)
+set(BACKEND_SRCS src/libtensorrtllm.cc)

 add_library(triton-tensorrt-llm-backend SHARED ${BACKEND_SRCS})

-set(WORKER_SRCS src/worker.cc)
-
-add_executable(triton-tensorrt-llm-worker ${WORKER_SRCS})
-
 enable_language(CUDA)

 find_package(CUDA ${CUDA_REQUIRED_VERSION} REQUIRED)

@@ -165,15 +159,15 @@ find_library_create_target(${TRT_LIB} nvinfer SHARED ${TRT_LIB_DIR})
 file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" VERSION_STRINGS
      REGEX "#define NV_TENSORRT_.*")
 foreach(TYPE MAJOR MINOR PATCH BUILD)
-  string(REGEX MATCH "NV_TENSORRT_${TYPE} [0-9]" TRT_TYPE_STRING
+  string(REGEX MATCH "NV_TENSORRT_${TYPE} [0-9]+" TRT_TYPE_STRING
          ${VERSION_STRINGS})
-  string(REGEX MATCH "[0-9]" TRT_${TYPE} ${TRT_TYPE_STRING})
+  string(REGEX MATCH "[0-9]+" TRT_${TYPE} ${TRT_TYPE_STRING})
 endforeach(TYPE)

 foreach(TYPE MAJOR MINOR PATCH)
-  string(REGEX MATCH "NV_TENSORRT_SONAME_${TYPE} [0-9]" TRT_TYPE_STRING
+  string(REGEX MATCH "NV_TENSORRT_${TYPE} [0-9]+" TRT_TYPE_STRING
          ${VERSION_STRINGS})
-  string(REGEX MATCH "[0-9]" TRT_SO_${TYPE} ${TRT_TYPE_STRING})
+  string(REGEX MATCH "[0-9]+" TRT_SO_${TYPE} ${TRT_TYPE_STRING})
 endforeach(TYPE)

 set(TRT_VERSION

@@ -187,6 +181,12 @@ message(
   "Building for TensorRT version: ${TRT_VERSION}, library version: ${TRT_SOVERSION}"
 )

+if(${TRT_MAJOR} GREATER_EQUAL 10)
+  add_definitions("-DTRT_LLM_USE_DIM64")
+  message(
+    STATUS "TensorRT version ${TRT_MAJOR} >= 10, int64 dimension is enabled")
+endif()
+
 list(APPEND COMMON_HEADER_DIRS ${TORCH_INCLUDE_DIRS} ${TRT_INCLUDE_DIR})
 include_directories(${COMMON_HEADER_DIRS})

@@ -204,7 +204,6 @@ target_include_directories(
 target_compile_features(triton-tensorrt-llm-common PRIVATE cxx_std_17)
 target_compile_features(triton-tensorrt-llm-backend PRIVATE cxx_std_17)
-target_compile_features(triton-tensorrt-llm-worker PRIVATE cxx_std_17)

 set(COMPILE_OPTIONS
     $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:

@@ -219,7 +218,6 @@ set(COMPILE_OPTIONS
 target_compile_options(triton-tensorrt-llm-common PRIVATE ${COMPILE_OPTIONS})
 target_compile_options(triton-tensorrt-llm-backend PRIVATE ${COMPILE_OPTIONS})
-target_compile_options(triton-tensorrt-llm-worker PRIVATE ${COMPILE_OPTIONS})

 add_library(tensorrt_llm SHARED IMPORTED)
 set_property(

@@ -308,8 +306,6 @@ target_link_libraries(
 target_link_libraries(triton-tensorrt-llm-backend
                       PRIVATE triton-tensorrt-llm-common)
-target_link_libraries(triton-tensorrt-llm-worker
-                      PRIVATE triton-tensorrt-llm-common)

 FetchContent_Declare(
   json

@@ -325,9 +321,6 @@ if(WIN32)
   set_target_properties(
     triton-tensorrt-llm-backend PROPERTIES POSITION_INDEPENDENT_CODE ON
                                            OUTPUT_NAME triton_tensorrtllm)
-  set_target_properties(
-    triton-tensorrt-llm-worker PROPERTIES POSITION_INDEPENDENT_CODE ON
-                                          OUTPUT_NAME triton_tensorrtllm_worker)
   set_target_properties(
     triton-tensorrt-llm-common PROPERTIES POSITION_INDEPENDENT_CODE ON
                                           OUTPUT_NAME triton_tensorrtllm_common)

@@ -341,8 +334,6 @@ else()
     LINK_FLAGS
     "-Wl,--version-script libtriton_tensorrtllm.ldscript -Wl,-rpath,'$ORIGIN' -Wl,--no-undefined"
   )
-  set_target_properties(triton-tensorrt-llm-worker
-                        PROPERTIES OUTPUT_NAME triton_tensorrtllm_worker)
   set_target_properties(
     triton-tensorrt-llm-common PROPERTIES POSITION_INDEPENDENT_CODE ON
                                           OUTPUT_NAME triton_tensorrtllm_common)

@@ -352,3 +343,12 @@ if(BUILD_TESTS)
   enable_testing()
   add_subdirectory(tests)
 endif()
+
+add_custom_command(
+  TARGET triton-tensorrt-llm-backend
+  POST_BUILD
+  COMMAND
+    ${CMAKE_COMMAND} -E copy
+    ${TRTLLM_DIR}/cpp/build/tensorrt_llm/executor_worker/executorWorker
+    ${CMAKE_CURRENT_BINARY_DIR}/trtllmExecutorWorker
+  COMMENT "Copying executorWorker to the build directory")
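
With the worker target removed, the executor worker binary now arrives via the POST_BUILD copy above and the Dockerfile COPY lines earlier in this commit. A quick sanity check inside the built image, assuming the triton_trt_llm tag from the build sketch above:

# Sketch: confirm the backend artifacts landed where the
# executor_worker_path parameter expects them.
docker run --rm triton_trt_llm ls /opt/tritonserver/backends/tensorrtllm
# expected: libtriton_tensorrtllm.so  libtriton_tensorrtllm_common.so  trtllmExecutorWorker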
