Update TensorRT-LLM backend (triton-inference-server#444)
kaiyux authored May 7, 2024
1 parent f87ad6b commit e239adc
Showing 23 changed files with 1,291 additions and 2,281 deletions.
15 changes: 9 additions & 6 deletions README.md
@@ -70,10 +70,10 @@ The below commands will build the same Triton TRT-LLM container as the one on th
# Prepare the TRT-LLM base image using the dockerfile from tensorrtllm_backend.
cd tensorrtllm_backend
# Specify the build args for the dockerfile.
BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.01-py3-min
TRT_VERSION=9.2.0.5
TRT_URL_x86=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/9.2.0/tensorrt-9.2.0.5.linux.x86_64-gnu.cuda-12.2.tar.gz
TRT_URL_ARM=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/9.2.0/tensorrt-9.2.0.5.Ubuntu-22.04.aarch64-gnu.cuda-12.2.tar.gz
BASE_IMAGE=nvcr.io/nvidia/pytorch:24.02-py3
TRT_VERSION=9.3.0.1
TRT_URL_x86=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/9.3.0/tensorrt-9.3.0.1.linux.x86_64-gnu.cuda-12.2.tar.gz
TRT_URL_ARM=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/9.3.0/tensorrt-9.3.0.1.ubuntu-22.04.aarch64-gnu.cuda-12.2.tar.gz

docker build -t trtllm_base \
--build-arg BASE_IMAGE="${BASE_IMAGE}" \
@@ -86,8 +86,8 @@ docker build -t trtllm_base \
# endpoints can be removed if not needed. Please refer to the support matrix to
# see the aligned versions: https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
TRTLLM_BASE_IMAGE=trtllm_base
TENSORRTLLM_BACKEND_REPO_TAG=v0.7.2
PYTHON_BACKEND_REPO_TAG=r24.01
TENSORRTLLM_BACKEND_REPO_TAG=rel
PYTHON_BACKEND_REPO_TAG=r24.04

cd server
./build.py -v --no-container-interactive --enable-logging --enable-stats --enable-tracing \
@@ -299,6 +299,9 @@ The following table shows the fields that may need to be modified before deployment:
| `kv_cache_free_gpu_mem_fraction` | Optional (default=0.9). Set to a number between 0 and 1 to indicate the maximum fraction of GPU memory (after loading the model) that may be used for KV cache.|
| `enable_trt_overlap` | Optional (default=`false`). Set to `true` to partition available requests into 2 'microbatches' that can be run concurrently to hide exposed CPU runtime |
| `exclude_input_in_output` | Optional (default=`false`). Set to `true` to only return completion tokens in a response. Set to `false` to return the prompt tokens concatenated with the generated tokens |
| `cancellation_check_period_ms` | Optional (default=100). The time, in milliseconds, that the cancellation check thread sleeps before the next check. The thread checks whether any of the currently active requests have been cancelled through Triton and, if so, prevents their further execution. |
| `iter_stats_max_iterations` | Optional (default=executor::kDefaultIterStatsMaxIterations). The number of iterations for which iteration statistics are kept. |
| `request_stats_max_iterations` | Optional (default=executor::kDefaultRequestStatsMaxIterations). The number of iterations for which request statistics are kept. |
| `normalize_log_probs` | Optional (default=`true`). Set to `false` to skip normalization of `output_log_probs` |
| `enable_chunked_context` | Optional (default=`false`). Set to `true` to enable context chunking. |
| `gpu_device_ids` | Optional (default=unspecified). Comma-separated list of GPU IDs to use for this model. If not provided, the model will use all visible GPUs. |
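The `${...}` placeholders behind these fields are normally filled in before the model repository is loaded. A minimal sketch, assuming the repo's `tools/fill_template.py` helper and a copy of `all_models/inflight_batcher_llm` at `triton_model_repo/`; the engine path and every value below are illustrative assumptions, not recommendations:

```bash
# Substitute comma-separated key:value pairs into the config in place (-i).
python3 tools/fill_template.py -i triton_model_repo/tensorrt_llm/config.pbtxt \
    "triton_max_batch_size:64,decoupled_mode:true,engine_dir:/engines/llama_7b,batching_strategy:inflight_fused_batching,kv_cache_free_gpu_mem_fraction:0.9,exclude_input_in_output:true,cancellation_check_period_ms:100,iter_stats_max_iterations:1000,request_stats_max_iterations:0,max_beam_width:1"
```

Optional parameters that are left unfilled keep their `${...}` placeholder and should fall back to the defaults listed in the table above.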
70 changes: 68 additions & 2 deletions all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt
Original file line number Diff line number Diff line change
@@ -69,6 +69,13 @@ input [
optional: true
allow_ragged_batch: true
},
{
name: "draft_acceptance_threshold"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "end_id"
data_type: TYPE_INT32
@@ -132,13 +139,41 @@ input [
reshape: { shape: [ ] }
optional: true
},
{
name: "runtime_top_p_min"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "runtime_top_p_decay"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "runtime_top_p_reset_ids"
data_type: TYPE_INT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "len_penalty"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "early_stopping"
data_type: TYPE_BOOL
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "repetition_penalty"
data_type: TYPE_FP32
@@ -153,6 +188,13 @@ input [
reshape: { shape: [ ] }
optional: true
},
{
name: "beam_search_diversity_rate"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "presence_penalty"
data_type: TYPE_FP32
@@ -338,6 +380,12 @@ parameters: {
string_value: "${max_attention_window_size}"
}
}
parameters: {
key: "sink_token_length"
value: {
string_value: "${sink_token_length}"
}
}
parameters: {
key: "batch_scheduler_policy"
value: {
@@ -374,6 +422,24 @@ parameters: {
string_value: "${exclude_input_in_output}"
}
}
parameters: {
key: "cancellation_check_period_ms"
value: {
string_value: "${cancellation_check_period_ms}"
}
}
parameters: {
key: "iter_stats_max_iterations"
value: {
string_value: "${iter_stats_max_iterations}"
}
}
parameters: {
key: "request_stats_max_iterations"
value: {
string_value: "${request_stats_max_iterations}"
}
}
parameters: {
key: "enable_kv_cache_reuse"
value: {
@@ -429,9 +495,9 @@ parameters: {
}
}
parameters: {
key: "worker_path"
key: "executor_worker_path"
value: {
string_value: "/opt/tritonserver/backends/tensorrtllm/triton_tensorrtllm_worker"
string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker"
}
}
parameters: {
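The `executor_worker_path` parameter above now points at the renamed `trtllmExecutorWorker` executable that the backend spawns as separate worker processes (for example for multi-GPU serving). A quick sanity check, assuming an image tagged `triton_trt_llm` built from the Dockerfile in the next file's diff (the tag is an assumption):

```bash
# Confirm the worker executable referenced by executor_worker_path exists in the image.
docker run --rm triton_trt_llm \
    ls -l /opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker
```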
8 changes: 5 additions & 3 deletions dockerfile/Dockerfile.trt_llm_backend
@@ -1,5 +1,5 @@
ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver
ARG BASE_TAG=24.02-py3
ARG BASE_TAG=24.03-py3

FROM ${BASE_IMAGE}:${BASE_TAG} as base

@@ -37,7 +37,9 @@ RUN bash /tmp/install_mpi4py.sh && rm /tmp/install_mpi4py.sh
# Use "pypi" (default) for x86_64 arch and "src_non_cxx11_abi" for aarch64 arch
ARG TORCH_INSTALL_TYPE="pypi"
COPY tensorrt_llm/docker/common/install_pytorch.sh install_pytorch.sh
RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh
# Apply a PyTorch patch to support compiling from source with CUDA 12.4.
COPY tensorrt_llm/docker/common/pytorch_pr_116072.patch /tmp/pytorch_pr_116072.patch
RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh /tmp/pytorch_pr_116072.patch

FROM dev as trt_llm_builder

@@ -64,4 +66,4 @@ RUN mkdir /opt/tritonserver/backends/tensorrtllm
ENV LD_LIBRARY_PATH=/opt/tritonserver/backends/tensorrtllm:${LD_LIBRARY_PATH}
COPY --from=trt_llm_backend_builder /app/inflight_batcher_llm/build/libtriton_tensorrtllm.so /opt/tritonserver/backends/tensorrtllm
COPY --from=trt_llm_backend_builder /app/inflight_batcher_llm/build/libtriton_tensorrtllm_common.so /opt/tritonserver/backends/tensorrtllm
COPY --from=trt_llm_backend_builder /app/inflight_batcher_llm/build/triton_tensorrtllm_worker /opt/tritonserver/backends/tensorrtllm
COPY --from=trt_llm_backend_builder /app/inflight_batcher_llm/build/trtllmExecutorWorker /opt/tritonserver/backends/tensorrtllm
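For reference, one way to build this image from the repository root is sketched below; the image tag is arbitrary and the submodule/LFS steps are assumptions about the checkout, not part of the diff:

```bash
# The Dockerfile copies scripts and patches from the tensorrt_llm submodule,
# so make sure it is checked out first.
git lfs install
git submodule update --init --recursive

# Build the Triton TRT-LLM backend image with this Dockerfile.
DOCKER_BUILDKIT=1 docker build -t triton_trt_llm \
    -f dockerfile/Dockerfile.trt_llm_backend .
```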
42 changes: 21 additions & 21 deletions inflight_batcher_llm/CMakeLists.txt
@@ -102,20 +102,14 @@ FetchContent_MakeAvailable(repo-common repo-core repo-backend)
configure_file(src/libtriton_tensorrtllm.ldscript
libtriton_tensorrtllm.ldscript COPYONLY)

set(COMMON_SRCS
src/work_item.cc src/work_items_queue.cc src/model_instance_state.cc
src/model_state.cc src/utils.cc src/inference_answer.cc)
set(COMMON_SRCS src/model_instance_state.cc src/model_state.cc src/utils.cc)

add_library(triton-tensorrt-llm-common SHARED ${COMMON_SRCS})

set(BACKEND_SRCS src/libtensorrtllm.cc src/orchestrator.cc)
set(BACKEND_SRCS src/libtensorrtllm.cc)

add_library(triton-tensorrt-llm-backend SHARED ${BACKEND_SRCS})

set(WORKER_SRCS src/worker.cc)

add_executable(triton-tensorrt-llm-worker ${WORKER_SRCS})

enable_language(CUDA)

find_package(CUDA ${CUDA_REQUIRED_VERSION} REQUIRED)
@@ -165,15 +159,15 @@ find_library_create_target(${TRT_LIB} nvinfer SHARED ${TRT_LIB_DIR})
file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" VERSION_STRINGS
REGEX "#define NV_TENSORRT_.*")
foreach(TYPE MAJOR MINOR PATCH BUILD)
string(REGEX MATCH "NV_TENSORRT_${TYPE} [0-9]" TRT_TYPE_STRING
string(REGEX MATCH "NV_TENSORRT_${TYPE} [0-9]+" TRT_TYPE_STRING
${VERSION_STRINGS})
string(REGEX MATCH "[0-9]" TRT_${TYPE} ${TRT_TYPE_STRING})
string(REGEX MATCH "[0-9]+" TRT_${TYPE} ${TRT_TYPE_STRING})
endforeach(TYPE)

foreach(TYPE MAJOR MINOR PATCH)
string(REGEX MATCH "NV_TENSORRT_SONAME_${TYPE} [0-9]" TRT_TYPE_STRING
string(REGEX MATCH "NV_TENSORRT_${TYPE} [0-9]+" TRT_TYPE_STRING
${VERSION_STRINGS})
string(REGEX MATCH "[0-9]" TRT_SO_${TYPE} ${TRT_TYPE_STRING})
string(REGEX MATCH "[0-9]+" TRT_SO_${TYPE} ${TRT_TYPE_STRING})
endforeach(TYPE)

set(TRT_VERSION
@@ -187,6 +181,12 @@ message(
"Building for TensorRT version: ${TRT_VERSION}, library version: ${TRT_SOVERSION}"
)

if(${TRT_MAJOR} GREATER_EQUAL 10)
add_definitions("-DTRT_LLM_USE_DIM64")
message(
STATUS "TensorRT version ${TRT_MAJOR} >= 10, int64 dimension is enabled")
endif()

list(APPEND COMMON_HEADER_DIRS ${TORCH_INCLUDE_DIRS} ${TRT_INCLUDE_DIR})
include_directories(${COMMON_HEADER_DIRS})

@@ -204,7 +204,6 @@ target_include_directories(

target_compile_features(triton-tensorrt-llm-common PRIVATE cxx_std_17)
target_compile_features(triton-tensorrt-llm-backend PRIVATE cxx_std_17)
target_compile_features(triton-tensorrt-llm-worker PRIVATE cxx_std_17)

set(COMPILE_OPTIONS
$<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
@@ -219,7 +218,6 @@ set(COMPILE_OPTIONS

target_compile_options(triton-tensorrt-llm-common PRIVATE ${COMPILE_OPTIONS})
target_compile_options(triton-tensorrt-llm-backend PRIVATE ${COMPILE_OPTIONS})
target_compile_options(triton-tensorrt-llm-worker PRIVATE ${COMPILE_OPTIONS})

add_library(tensorrt_llm SHARED IMPORTED)
set_property(
@@ -308,8 +306,6 @@ target_link_libraries(

target_link_libraries(triton-tensorrt-llm-backend
PRIVATE triton-tensorrt-llm-common)
target_link_libraries(triton-tensorrt-llm-worker
PRIVATE triton-tensorrt-llm-common)

FetchContent_Declare(
json
@@ -325,9 +321,6 @@ if(WIN32)
set_target_properties(
triton-tensorrt-llm-backend PROPERTIES POSITION_INDEPENDENT_CODE ON
OUTPUT_NAME triton_tensorrtllm)
set_target_properties(
triton-tensorrt-llm-worker PROPERTIES POSITION_INDEPENDENT_CODE ON
OUTPUT_NAME triton_tensorrtllm_worker)
set_target_properties(
triton-tensorrt-llm-common PROPERTIES POSITION_INDEPENDENT_CODE ON
OUTPUT_NAME triton_tensorrtllm_common)
@@ -341,8 +334,6 @@ else()
LINK_FLAGS
"-Wl,--version-script libtriton_tensorrtllm.ldscript -Wl,-rpath,'$ORIGIN' -Wl,--no-undefined"
)
set_target_properties(triton-tensorrt-llm-worker
PROPERTIES OUTPUT_NAME triton_tensorrtllm_worker)
set_target_properties(
triton-tensorrt-llm-common PROPERTIES POSITION_INDEPENDENT_CODE ON
OUTPUT_NAME triton_tensorrtllm_common)
Expand All @@ -352,3 +343,12 @@ if(BUILD_TESTS)
enable_testing()
add_subdirectory(tests)
endif()

add_custom_command(
TARGET triton-tensorrt-llm-backend
POST_BUILD
COMMAND
${CMAKE_COMMAND} -E copy
${TRTLLM_DIR}/cpp/build/tensorrt_llm/executor_worker/executorWorker
${CMAKE_CURRENT_BINARY_DIR}/trtllmExecutorWorker
COMMENT "Copying executorWorker to the build directory")
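With the separate worker target removed, the backend build now produces only the two shared libraries and copies the prebuilt `executorWorker` from the TensorRT-LLM C++ build via the `POST_BUILD` step above. An out-of-tree build might look like the following sketch; passing `TRTLLM_DIR` on the command line and the relative path to the submodule are assumptions, and the TensorRT-LLM C++ build (`cpp/build`) must already exist for the copy step to succeed:

```bash
cd inflight_batcher_llm
mkdir -p build && cd build
# TRTLLM_DIR must contain cpp/build/tensorrt_llm/executor_worker/executorWorker.
cmake .. -DCMAKE_BUILD_TYPE=Release \
    -DTRTLLM_DIR="$(realpath ../../tensorrt_llm)"
make -j"$(nproc)"
```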