
Commit 39ba55a

Authored by kaiyux, DefTruth, buvnswrn, and sunjiabin17
Update TensorRT-LLM backend (triton-inference-server#491)
* Update TensorRT-LLM backend

---------

Co-authored-by: DefTruth <[email protected]>
Co-authored-by: Bhuvaneshwaran I <[email protected]>
Co-authored-by: Sun,Jiabin <[email protected]>
1 parent 09dc9e0 commit 39ba55a

File tree: 6 files changed, +78 −81 lines

README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -188,7 +188,7 @@ prompts(string) to input_ids(list of ints).
 
 This model is a wrapper of your TensorRT-LLM model and is used
 for inferencing.
-Input specification can be found [here](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/inference_request.md)
+Input specification can be found [here](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/inference-request.md)
 
 #### postprocessing
 
```
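For readers following the corrected link, here is a minimal client sketch of the request shape that specification describes. It is an illustration, not part of the commit: it assumes a Triton server on localhost:8000 and the commonly documented tensor names (input_ids, input_lengths, request_output_len, output_ids), which should be verified against your model's config.pbtxt.

```python
# Minimal sketch of an inference request against the `tensorrt_llm` model.
# Assumptions (not from this commit): tritonclient[http] is installed, the
# server runs on localhost:8000, and the model uses the default tensor names.
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient("localhost:8000")

input_ids = np.array([[1, 7, 42, 99]], dtype=np.int32)  # pre-tokenized prompt
inputs = [
    httpclient.InferInput("input_ids", input_ids.shape, "INT32"),
    httpclient.InferInput("input_lengths", [1, 1], "INT32"),
    httpclient.InferInput("request_output_len", [1, 1], "INT32"),
]
inputs[0].set_data_from_numpy(input_ids)
inputs[1].set_data_from_numpy(np.array([[input_ids.shape[1]]], dtype=np.int32))
inputs[2].set_data_from_numpy(np.array([[16]], dtype=np.int32))

result = client.infer("tensorrt_llm", inputs)
print(result.as_numpy("output_ids"))
```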

dockerfile/Dockerfile.triton.trt_llm_backend

Lines changed: 44 additions & 14 deletions
```diff
@@ -1,11 +1,16 @@
-ARG BASE_IMAGE
+ARG BASE_IMAGE # Use NGC PyTorch image as base image
 
-FROM ${BASE_IMAGE} as base
-
-RUN apt-get update -q=2 && apt-get install -y --no-install-recommends python3-pip ccache git-lfs
+FROM ${BASE_IMAGE} as install_dependencies
 
+RUN apt-get update -q=2 \
+    && apt-get install -y --no-install-recommends \
+    python3-pip \
+    ccache \
+    git-lfs \
+    && rm -rf /var/lib/apt/lists/*
 # Remove previous TRT installation
-RUN apt-get remove --purge -y tensorrt* libnvinfer*
+# We didn't remove libnvinfer* here because tritonserver depends on the pre-installed libraries.
+RUN apt-get remove -y tensorrt*
 RUN pip3 uninstall -y tensorrt
 
 ARG TRT_VER
@@ -24,7 +29,18 @@ RUN [ "$(uname -m)" != "x86_64" ] && arch="sbsa" || arch="x86_64" \
     && curl -o /tmp/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$arch/cuda-keyring_1.0-1_all.deb \
     && apt install /tmp/cuda-keyring.deb \
     && rm /tmp/cuda-keyring.deb \
-    && apt-get update -q=2
+    && apt-get update -q=2 \
+    && rm -rf /var/lib/apt/lists/*
+
+ARG NVRTC_VER="12.4.99-1"
+ENV NVRTC_VER="${NVRTC_VER}"
+
+RUN apt-get remove --purge -y --allow-change-held-packages cuda-nvrtc-dev*
+RUN CUDA_VER_SHORT=$(echo $CUDA_VER | awk -F. '{print $1"."$2}') \
+    && NVRTC_CUDA_VERSION=$(echo $CUDA_VER_SHORT | sed 's/\./-/g') \
+    && apt update -qq \
+    && apt-get install -y --no-install-recommends cuda-nvrtc-dev-${NVRTC_CUDA_VERSION}=${NVRTC_VER} \
+    && rm -rf /var/lib/apt/lists/*
 
 ARG RELEASE_URL_TRT_x86
 ARG RELEASE_URL_TRT_ARM
@@ -40,11 +56,25 @@ RUN pip3 install /usr/local/tensorrt/python/tensorrt-*-cp$( python3 -c "import s
 ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:${LD_LIBRARY_PATH}
 ENV TRT_ROOT=/usr/local/tensorrt
 
-# Align with the pre-installed CUDA / NVCC / NVRTC versions from
-# https://docs.nvidia.com/cuda/archive/12.4.0/cuda-toolkit-release-notes/index.html
-# NVRTC static library doesn't exist in NGC PyTorch container.
-ENV NVRTC_VER="12.4.99-1"
-RUN apt-get remove --purge -y --allow-change-held-packages cuda-nvrtc-dev*
-RUN CUDA_VER_SHORT=$(echo $CUDA_VER | awk -F. '{print $1"."$2}') \
-    && NVRTC_CUDA_VERSION=$(echo $CUDA_VER_SHORT | sed 's/\./-/g') \
-    && apt-get install -y --no-install-recommends cuda-nvrtc-dev-${NVRTC_CUDA_VERSION}=${NVRTC_VER}
+FROM install_dependencies as tensorrt_llm_build
+
+ARG TENSORRT_LLM_REPO=https://github.com/NVIDIA/TensorRT-LLM.git
+ARG TENSORRT_LLM_REPO_TAG=main
+
+RUN pip3 install --no-cache-dir \
+    cmake \
+    polygraphy==0.49.0 \
+    mpi4py==3.1.5
+
+WORKDIR /workspace/
+RUN git clone --recurse-submodules --branch ${TENSORRT_LLM_REPO_TAG} ${TENSORRT_LLM_REPO} tensorrt_llm
+
+WORKDIR /workspace/tensorrt_llm
+RUN python3 scripts/build_wheel.py --trt_root /usr/local/tensorrt
+
+FROM install_dependencies as base
+
+WORKDIR /tmp
+COPY --from=tensorrt_llm_build /workspace/tensorrt_llm/build/tensorrt_llm*whl .
+
+RUN pip3 install --no-cache-dir --extra-index-url https://pypi.nvidia.com tensorrt_llm*.whl
```
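The restructured Dockerfile now builds the TensorRT-LLM wheel in an intermediate tensorrt_llm_build stage and copies only the wheel into the final base stage. A quick way to confirm the final image is wired correctly is a small import check run inside the container; this sketch is not part of the commit, and the image tag used to run it is up to you.

```python
# Sketch of a post-build sanity check for the final image. Run inside the
# container, e.g. `docker run --rm <your-image-tag> python3 check_install.py`.
import importlib.metadata

# The wheel built in the tensorrt_llm_build stage should be pip-installed in
# the final stage; this raises PackageNotFoundError if it is missing.
print("tensorrt_llm", importlib.metadata.version("tensorrt_llm"))

# TensorRT itself is installed from the release tarball under /usr/local/tensorrt.
import tensorrt
print("tensorrt", tensorrt.__version__)
```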

inflight_batcher_llm/CMakeLists.txt

Lines changed: 31 additions & 51 deletions
```diff
@@ -24,12 +24,10 @@
 cmake_minimum_required(VERSION 3.17)
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/set_ifndef.cmake)
 
-set(TRITON_BUILD
-    OFF
-    CACHE STRING "Using Triton build process")
-
 set_ifndef(TRTLLM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../tensorrt_llm)
 
+include_directories(${TRTLLM_DIR} ${TRTLLM_DIR}/cpp/include)
+
 include(${TRTLLM_DIR}/cpp/cmake/modules/find_library_create_target.cmake)
 
 project(tritontensorrtllmbackend LANGUAGES C CXX)
@@ -122,6 +120,32 @@ add_library(triton-tensorrt-llm-backend SHARED ${BACKEND_SRCS})
 enable_language(CUDA)
 
 find_package(CUDA ${CUDA_REQUIRED_VERSION} REQUIRED)
+find_package(Python3 COMPONENTS Interpreter Development)
+
+find_library(
+  tensorrt_llm libtensorrt_llm.so REQUIRED
+  PATHS ${Python3_SITEARCH}/tensorrt_llm/libs
+        ${TRTLLM_DIR}/cpp/build/tensorrt_llm
+        ${CMAKE_CURRENT_SOURCE_DIR}/../tensorrt_llm/cpp/build/tensorrt_llm)
+
+find_library(
+  nvinfer_plugin_tensorrt_llm libnvinfer_plugin_tensorrt_llm.so REQUIRED
+  PATHS
+  ${Python3_SITEARCH}/tensorrt_llm/libs
+  ${TRTLLM_DIR}/cpp/build/tensorrt_llm/plugins
+  ${CMAKE_CURRENT_SOURCE_DIR}/../tensorrt_llm/cpp/build/tensorrt_llm/plugins)
+
+find_program(
+  TRTLLM_EXECUTOR_WORKER executorWorker REQUIRED
+  PATHS
+  ${Python3_SITEARCH}/tensorrt_llm/bin
+  ${TRTLLM_DIR}/cpp/build/tensorrt_llm/executor_worker
+  ${CMAKE_CURRENT_SOURCE_DIR}/../tensorrt_llm/cpp/build/tensorrt_llm/executor_worker
+)
+install(
+  PROGRAMS ${TRTLLM_EXECUTOR_WORKER}
+  DESTINATION ${CMAKE_BINARY_DIR}
+  RENAME trtllmExecutorWorker)
 
 find_library(
   CUDNN_LIB cudnn
@@ -232,20 +256,6 @@ set(COMPILE_OPTIONS
 target_compile_options(triton-tensorrt-llm-common PRIVATE ${COMPILE_OPTIONS})
 target_compile_options(triton-tensorrt-llm-backend PRIVATE ${COMPILE_OPTIONS})
 
-add_library(tensorrt_llm SHARED IMPORTED)
-set_property(
-  TARGET tensorrt_llm
-  PROPERTY IMPORTED_LOCATION
-           "${TRTLLM_DIR}/cpp/build/tensorrt_llm/libtensorrt_llm.so")
-
-add_library(nvinfer_plugin_tensorrt_llm SHARED IMPORTED)
-set_property(
-  TARGET nvinfer_plugin_tensorrt_llm
-  PROPERTY
-    IMPORTED_LOCATION
-    "${TRTLLM_DIR}/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so"
-)
-
 if(TRITON_ENABLE_METRICS)
   list(APPEND REPORTER_SRCS
        src/custom_metrics_reporter/custom_metrics_reporter.cc)
@@ -276,46 +286,25 @@ if(TRITON_ENABLE_METRICS)
     triton-core-serverapi # from repo-core
     triton-core-serverstub # from repo-core
     triton-backend-utils # from repo-backend
-    tensorrt_llm)
+    ${tensorrt_llm})
 
   target_compile_definitions(triton-tensorrt-llm-common
                              PRIVATE TRITON_ENABLE_METRICS=1)
   target_link_libraries(triton-tensorrt-llm-common
                         PRIVATE triton-custom-metrics-reporter-library)
 endif()
 
-if(TRITON_BUILD)
-
-  if(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64")
-    execute_process(
-      WORKING_DIRECTORY ${TRTLLM_DIR}
-      COMMAND bash -x docker/common/install_pytorch.sh pypi COMMAND_ECHO STDOUT
-      COMMAND_ERROR_IS_FATAL ANY)
-  else()
-    execute_process(
-      WORKING_DIRECTORY ${TRTLLM_DIR}
-      COMMAND bash -x docker/common/install_pytorch.sh src_non_cxx11_abi
-      COMMAND_ECHO STDOUT COMMAND_ERROR_IS_FATAL ANY)
-  endif() # CMAKE_HOST_SYSTEM_PROCESSOR
-
-  execute_process(
-    WORKING_DIRECTORY ${TRTLLM_DIR}
-    COMMAND python3 scripts/build_wheel.py --trt_root /usr/local/tensorrt
-    COMMAND_ECHO STDOUT COMMAND_ERROR_IS_FATAL ANY)
-
-endif() # TRITON_BUILD
-
 target_link_libraries(
   triton-tensorrt-llm-common
-  PUBLIC tensorrt_llm
+  PUBLIC ${tensorrt_llm}
         triton-core-serverapi # from repo-core
         triton-core-backendapi # from repo-core
        triton-core-serverstub # from repo-core
         triton-backend-utils # from repo-backend
         ${MPI_LIBRARIES}
         ${CUDA_LIBRARIES}
         nvinfer
-        nvinfer_plugin_tensorrt_llm)
+        ${nvinfer_plugin_tensorrt_llm})
 
 target_link_libraries(triton-tensorrt-llm-backend
                       PRIVATE triton-tensorrt-llm-common)
@@ -356,12 +345,3 @@ if(BUILD_TESTS)
   enable_testing()
   add_subdirectory(tests)
 endif()
-
-add_custom_command(
-  TARGET triton-tensorrt-llm-backend
-  POST_BUILD
-  COMMAND
-    ${CMAKE_COMMAND} -E copy
-    ${TRTLLM_DIR}/cpp/build/tensorrt_llm/executor_worker/executorWorker
-    ${CMAKE_CURRENT_BINARY_DIR}/trtllmExecutorWorker
-  COMMENT "Copying executorWorker to the build directory")
```

tensorrt_llm

Submodule tensorrt_llm updated 368 files

tools/utils/utils.py

Lines changed: 0 additions & 13 deletions
```diff
@@ -267,19 +267,6 @@ def get_norm_dist_tokens(mean, stdev, num_reqs):
     return [max(1, math.ceil(x)) for x in numbers_list]
 
 
-def gen_random_start_ids(ip_lens):
-    input_start_ids = []
-    for ip_len in ip_lens:
-        start_ids = list(
-            np.random.randint(low=0,
-                              high=np.iinfo(np.int32).max,
-                              size=ip_len,
-                              dtype=np.int32))
-        input_start_ids.append(np.array([start_ids]))
-
-    return input_start_ids
-
-
 def get_inflight_reqs_profile(start_times, end_times, requests_per_sec):
     """
     Receives start and end times of all requests,
```
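gen_random_start_ids is removed without a replacement in this commit. A script that still depends on it can inline an equivalent; the sketch below is hypothetical (the name random_start_ids and the seed parameter are not in the repo) but reproduces the removed helper's behavior with numpy's Generator API:

```python
import numpy as np

def random_start_ids(ip_lens, seed=None):
    """Hypothetical inline replacement for the removed gen_random_start_ids():
    one random int32 token-ID array of shape (1, ip_len) per requested length."""
    rng = np.random.default_rng(seed)
    return [
        rng.integers(0, np.iinfo(np.int32).max, size=(1, n), dtype=np.int32)
        for n in ip_lens
    ]

# Example: three requests with prompt lengths 4, 8, and 16 tokens.
ids = random_start_ids([4, 8, 16], seed=0)
print([a.shape for a in ids])  # [(1, 4), (1, 8), (1, 16)]
```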

tools/version.txt

Lines changed: 1 addition & 1 deletion
```diff
@@ -1 +1 @@
-8473d8980f1b645398fbe760c339e38b388d652b
+225fd4fc55948de398989c334464d4478064b4f7
```
