
Commit 39ba55a

Authored by kaiyux, DefTruth, buvnswrn, and sunjiabin17
Update TensorRT-LLM backend (triton-inference-server#491)
* Update TensorRT-LLM backend

---------

Co-authored-by: DefTruth <[email protected]>
Co-authored-by: Bhuvaneshwaran I <[email protected]>
Co-authored-by: Sun,Jiabin <[email protected]>
1 parent 09dc9e0 commit 39ba55a

File tree: 6 files changed, +78 −81 lines

README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -188,7 +188,7 @@ prompts(string) to input_ids(list of ints).
 
 This model is a wrapper of your TensorRT-LLM model and is used
 for inferencing.
-Input specification can be found [here](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/inference_request.md)
+Input specification can be found [here](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/inference-request.md)
 
 #### postprocessing
 
```
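For readers following the corrected link, here is a minimal client sketch of the request shape that specification describes. It is an illustration, not part of the commit: it assumes a Triton server on localhost:8000 and the commonly documented tensor names (input_ids, input_lengths, request_output_len, output_ids), which should be verified against your model's config.pbtxt.

```python
# Minimal sketch of an inference request against the `tensorrt_llm` model.
# Assumptions (not from this commit): tritonclient[http] is installed, the
# server runs on localhost:8000, and the model uses the default tensor names.
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient("localhost:8000")

input_ids = np.array([[1, 7, 42, 99]], dtype=np.int32)  # pre-tokenized prompt
inputs = [
    httpclient.InferInput("input_ids", input_ids.shape, "INT32"),
    httpclient.InferInput("input_lengths", [1, 1], "INT32"),
    httpclient.InferInput("request_output_len", [1, 1], "INT32"),
]
inputs[0].set_data_from_numpy(input_ids)
inputs[1].set_data_from_numpy(np.array([[input_ids.shape[1]]], dtype=np.int32))
inputs[2].set_data_from_numpy(np.array([[16]], dtype=np.int32))

result = client.infer("tensorrt_llm", inputs)
print(result.as_numpy("output_ids"))
```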

dockerfile/Dockerfile.triton.trt_llm_backend

Lines changed: 44 additions & 14 deletions
```diff
@@ -1,11 +1,16 @@
-ARG BASE_IMAGE
+ARG BASE_IMAGE # Use NGC PyTorch image as base image
 
-FROM ${BASE_IMAGE} as base
-
-RUN apt-get update -q=2 && apt-get install -y --no-install-recommends python3-pip ccache git-lfs
+FROM ${BASE_IMAGE} as install_dependencies
 
+RUN apt-get update -q=2 \
+    && apt-get install -y --no-install-recommends \
+    python3-pip \
+    ccache \
+    git-lfs \
+    && rm -rf /var/lib/apt/lists/*
 # Remove previous TRT installation
-RUN apt-get remove --purge -y tensorrt* libnvinfer*
+# We didn't remove libnvinfer* here because tritonserver depends on the pre-installed libraries.
+RUN apt-get remove -y tensorrt*
 RUN pip3 uninstall -y tensorrt
 
 ARG TRT_VER
@@ -24,7 +29,18 @@ RUN [ "$(uname -m)" != "x86_64" ] && arch="sbsa" || arch="x86_64" \
     && curl -o /tmp/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$arch/cuda-keyring_1.0-1_all.deb \
     && apt install /tmp/cuda-keyring.deb \
     && rm /tmp/cuda-keyring.deb \
-    && apt-get update -q=2
+    && apt-get update -q=2 \
+    && rm -rf /var/lib/apt/lists/*
+
+ARG NVRTC_VER="12.4.99-1"
+ENV NVRTC_VER="${NVRTC_VER}"
+
+RUN apt-get remove --purge -y --allow-change-held-packages cuda-nvrtc-dev*
+RUN CUDA_VER_SHORT=$(echo $CUDA_VER | awk -F. '{print $1"."$2}') \
+    && NVRTC_CUDA_VERSION=$(echo $CUDA_VER_SHORT | sed 's/\./-/g') \
+    && apt update -qq \
+    && apt-get install -y --no-install-recommends cuda-nvrtc-dev-${NVRTC_CUDA_VERSION}=${NVRTC_VER} \
+    && rm -rf /var/lib/apt/lists/*
 
 ARG RELEASE_URL_TRT_x86
 ARG RELEASE_URL_TRT_ARM
@@ -40,11 +56,25 @@ RUN pip3 install /usr/local/tensorrt/python/tensorrt-*-cp$( python3 -c "import s
 ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:${LD_LIBRARY_PATH}
 ENV TRT_ROOT=/usr/local/tensorrt
 
-# Align with the pre-installed CUDA / NVCC / NVRTC versions from
-# https://docs.nvidia.com/cuda/archive/12.4.0/cuda-toolkit-release-notes/index.html
-# NVRTC static library doesn't exist in NGC PyTorch container.
-ENV NVRTC_VER="12.4.99-1"
-RUN apt-get remove --purge -y --allow-change-held-packages cuda-nvrtc-dev*
-RUN CUDA_VER_SHORT=$(echo $CUDA_VER | awk -F. '{print $1"."$2}') \
-    && NVRTC_CUDA_VERSION=$(echo $CUDA_VER_SHORT | sed 's/\./-/g') \
-    && apt-get install -y --no-install-recommends cuda-nvrtc-dev-${NVRTC_CUDA_VERSION}=${NVRTC_VER}
+FROM install_dependencies as tensorrt_llm_build
+
+ARG TENSORRT_LLM_REPO=https://github.com/NVIDIA/TensorRT-LLM.git
+ARG TENSORRT_LLM_REPO_TAG=main
+
+RUN pip3 install --no-cache-dir \
+    cmake \
+    polygraphy==0.49.0 \
+    mpi4py==3.1.5
+
+WORKDIR /workspace/
+RUN git clone --recurse-submodules --branch ${TENSORRT_LLM_REPO_TAG} ${TENSORRT_LLM_REPO} tensorrt_llm
+
+WORKDIR /workspace/tensorrt_llm
+RUN python3 scripts/build_wheel.py --trt_root /usr/local/tensorrt
+
+FROM install_dependencies as base
+
+WORKDIR /tmp
+COPY --from=tensorrt_llm_build /workspace/tensorrt_llm/build/tensorrt_llm*whl .
+
+RUN pip3 install --no-cache-dir --extra-index-url https://pypi.nvidia.com tensorrt_llm*.whl
```
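The restructured Dockerfile now builds the TensorRT-LLM wheel in an intermediate tensorrt_llm_build stage and copies only the wheel into the final base stage. A quick way to confirm the final image is wired correctly is a small import check run inside the container; this sketch is not part of the commit, and the image tag used to run it is up to you.

```python
# Sketch of a post-build sanity check for the final image. Run inside the
# container, e.g. `docker run --rm <your-image-tag> python3 check_install.py`.
import importlib.metadata

# The wheel built in the tensorrt_llm_build stage should be pip-installed in
# the final stage; this raises PackageNotFoundError if it is missing.
print("tensorrt_llm", importlib.metadata.version("tensorrt_llm"))

# TensorRT itself is installed from the release tarball under /usr/local/tensorrt.
import tensorrt
print("tensorrt", tensorrt.__version__)
```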

inflight_batcher_llm/CMakeLists.txt

Lines changed: 31 additions & 51 deletions
```diff
@@ -24,12 +24,10 @@
 cmake_minimum_required(VERSION 3.17)
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/set_ifndef.cmake)
 
-set(TRITON_BUILD
-    OFF
-    CACHE STRING "Using Triton build process")
-
 set_ifndef(TRTLLM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../tensorrt_llm)
 
+include_directories(${TRTLLM_DIR} ${TRTLLM_DIR}/cpp/include)
+
 include(${TRTLLM_DIR}/cpp/cmake/modules/find_library_create_target.cmake)
 
 project(tritontensorrtllmbackend LANGUAGES C CXX)
@@ -122,6 +120,32 @@ add_library(triton-tensorrt-llm-backend SHARED ${BACKEND_SRCS})
 enable_language(CUDA)
 
 find_package(CUDA ${CUDA_REQUIRED_VERSION} REQUIRED)
+find_package(Python3 COMPONENTS Interpreter Development)
+
+find_library(
+  tensorrt_llm libtensorrt_llm.so REQUIRED
+  PATHS ${Python3_SITEARCH}/tensorrt_llm/libs
+        ${TRTLLM_DIR}/cpp/build/tensorrt_llm
+        ${CMAKE_CURRENT_SOURCE_DIR}/../tensorrt_llm/cpp/build/tensorrt_llm)
+
+find_library(
+  nvinfer_plugin_tensorrt_llm libnvinfer_plugin_tensorrt_llm.so REQUIRED
+  PATHS
+  ${Python3_SITEARCH}/tensorrt_llm/libs
+  ${TRTLLM_DIR}/cpp/build/tensorrt_llm/plugins
+  ${CMAKE_CURRENT_SOURCE_DIR}/../tensorrt_llm/cpp/build/tensorrt_llm/plugins)
+
+find_program(
+  TRTLLM_EXECUTOR_WORKER executorWorker REQUIRED
+  PATHS
+  ${Python3_SITEARCH}/tensorrt_llm/bin
+  ${TRTLLM_DIR}/cpp/build/tensorrt_llm/executor_worker
+  ${CMAKE_CURRENT_SOURCE_DIR}/../tensorrt_llm/cpp/build/tensorrt_llm/executor_worker
+)
+install(
+  PROGRAMS ${TRTLLM_EXECUTOR_WORKER}
+  DESTINATION ${CMAKE_BINARY_DIR}
+  RENAME trtllmExecutorWorker)
 
 find_library(
   CUDNN_LIB cudnn
@@ -232,20 +256,6 @@ set(COMPILE_OPTIONS
 target_compile_options(triton-tensorrt-llm-common PRIVATE ${COMPILE_OPTIONS})
 target_compile_options(triton-tensorrt-llm-backend PRIVATE ${COMPILE_OPTIONS})
 
-add_library(tensorrt_llm SHARED IMPORTED)
-set_property(
-  TARGET tensorrt_llm
-  PROPERTY IMPORTED_LOCATION
-           "${TRTLLM_DIR}/cpp/build/tensorrt_llm/libtensorrt_llm.so")
-
-add_library(nvinfer_plugin_tensorrt_llm SHARED IMPORTED)
-set_property(
-  TARGET nvinfer_plugin_tensorrt_llm
-  PROPERTY
-    IMPORTED_LOCATION
-    "${TRTLLM_DIR}/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so"
-)
-
 if(TRITON_ENABLE_METRICS)
   list(APPEND REPORTER_SRCS
        src/custom_metrics_reporter/custom_metrics_reporter.cc)
@@ -276,46 +286,25 @@ if(TRITON_ENABLE_METRICS)
     triton-core-serverapi # from repo-core
     triton-core-serverstub # from repo-core
     triton-backend-utils # from repo-backend
-    tensorrt_llm)
+    ${tensorrt_llm})
 
   target_compile_definitions(triton-tensorrt-llm-common
                              PRIVATE TRITON_ENABLE_METRICS=1)
   target_link_libraries(triton-tensorrt-llm-common
                         PRIVATE triton-custom-metrics-reporter-library)
 endif()
 
-if(TRITON_BUILD)
-
-  if(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64")
-    execute_process(
-      WORKING_DIRECTORY ${TRTLLM_DIR}
-      COMMAND bash -x docker/common/install_pytorch.sh pypi COMMAND_ECHO STDOUT
-      COMMAND_ERROR_IS_FATAL ANY)
-  else()
-    execute_process(
-      WORKING_DIRECTORY ${TRTLLM_DIR}
-      COMMAND bash -x docker/common/install_pytorch.sh src_non_cxx11_abi
-      COMMAND_ECHO STDOUT COMMAND_ERROR_IS_FATAL ANY)
-  endif() # CMAKE_HOST_SYSTEM_PROCESSOR
-
-  execute_process(
-    WORKING_DIRECTORY ${TRTLLM_DIR}
-    COMMAND python3 scripts/build_wheel.py --trt_root /usr/local/tensorrt
-    COMMAND_ECHO STDOUT COMMAND_ERROR_IS_FATAL ANY)
-
-endif() # TRITON_BUILD
-
 target_link_libraries(
   triton-tensorrt-llm-common
-  PUBLIC tensorrt_llm
+  PUBLIC ${tensorrt_llm}
         triton-core-serverapi # from repo-core
         triton-core-backendapi # from repo-core
        triton-core-serverstub # from repo-core
         triton-backend-utils # from repo-backend
         ${MPI_LIBRARIES}
         ${CUDA_LIBRARIES}
         nvinfer
-        nvinfer_plugin_tensorrt_llm)
+        ${nvinfer_plugin_tensorrt_llm})
 
 target_link_libraries(triton-tensorrt-llm-backend
                       PRIVATE triton-tensorrt-llm-common)
@@ -356,12 +345,3 @@ if(BUILD_TESTS)
   enable_testing()
   add_subdirectory(tests)
 endif()
-
-add_custom_command(
-  TARGET triton-tensorrt-llm-backend
-  POST_BUILD
-  COMMAND
-    ${CMAKE_COMMAND} -E copy
-    ${TRTLLM_DIR}/cpp/build/tensorrt_llm/executor_worker/executorWorker
-    ${CMAKE_CURRENT_BINARY_DIR}/trtllmExecutorWorker
-  COMMENT "Copying executorWorker to the build directory")
```

tensorrt_llm

Submodule tensorrt_llm updated 368 files

tools/utils/utils.py

Lines changed: 0 additions & 13 deletions
```diff
@@ -267,19 +267,6 @@ def get_norm_dist_tokens(mean, stdev, num_reqs):
     return [max(1, math.ceil(x)) for x in numbers_list]
 
 
-def gen_random_start_ids(ip_lens):
-    input_start_ids = []
-    for ip_len in ip_lens:
-        start_ids = list(
-            np.random.randint(low=0,
-                              high=np.iinfo(np.int32).max,
-                              size=ip_len,
-                              dtype=np.int32))
-        input_start_ids.append(np.array([start_ids]))
-
-    return input_start_ids
-
-
 def get_inflight_reqs_profile(start_times, end_times, requests_per_sec):
     """
     Receives start and end times of all requests,
```
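gen_random_start_ids is removed without a replacement in this commit. A script that still depends on it can inline an equivalent; the sketch below is hypothetical (the name random_start_ids and the seed parameter are not in the repo) but reproduces the removed helper's behavior with numpy's Generator API:

```python
import numpy as np

def random_start_ids(ip_lens, seed=None):
    """Hypothetical inline replacement for the removed gen_random_start_ids():
    one random int32 token-ID array of shape (1, ip_len) per requested length."""
    rng = np.random.default_rng(seed)
    return [
        rng.integers(0, np.iinfo(np.int32).max, size=(1, n), dtype=np.int32)
        for n in ip_lens
    ]

# Example: three requests with prompt lengths 4, 8, and 16 tokens.
ids = random_start_ids([4, 8, 16], seed=0)
print([a.shape for a in ids])  # [(1, 4), (1, 8), (1, 16)]
```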

tools/version.txt

Lines changed: 1 addition & 1 deletion
```diff
@@ -1 +1 @@
-8473d8980f1b645398fbe760c339e38b388d652b
+225fd4fc55948de398989c334464d4478064b4f7
```
