
Commit c6ac3d8

Update TensorRT-LLM backend (triton-inference-server#431)
* Update TensorRT-LLM backend
1 parent bf5e900 commit c6ac3d8


8 files changed: +75, -62 lines changed


README.md

Lines changed: 1 addition & 1 deletion
@@ -299,7 +299,7 @@ The following table shows the fields that may need to be modified before deployment:
| `normalize_log_probs` | Optional (default=`true`). Set to `false` to skip normalization of `output_log_probs` |
| `enable_chunked_context` | Optional (default=`false`). Set to `true` to enable context chunking. |
| `gpu_device_ids` | Optional (default=unspecified). Comma-separated list of GPU IDs to use for this model. If not provided, the model will use all visible GPUs. |
-| `decoding_mode` | Optional. Set to one of the following: `{top_k, top_p, top_k_top_p, beam_search}` to select the decoding mode. The `top_k` mode exclusively uses the Top-K algorithm for sampling; the `top_p` mode exclusively uses the Top-P algorithm. The `top_k_top_p` mode employs both Top-K and Top-P algorithms, depending on the runtime sampling params of the request. Note that the `top_k_top_p` option requires more memory and has a longer runtime than `top_k` or `top_p` individually; therefore, it should be used only when necessary. `beam_search` uses the beam search algorithm. If not specified, the default is `top_k_top_p` if `max_beam_width == 1`; otherwise, `beam_search` is used. |
+| `decoding_mode` | Optional. Set to one of the following: `{top_k, top_p, top_k_top_p, beam_search, medusa}` to select the decoding mode. The `top_k` mode exclusively uses the Top-K algorithm for sampling; the `top_p` mode exclusively uses the Top-P algorithm. The `top_k_top_p` mode employs both Top-K and Top-P algorithms, depending on the runtime sampling params of the request. Note that the `top_k_top_p` option requires more memory and has a longer runtime than `top_k` or `top_p` individually; therefore, it should be used only when necessary. `beam_search` uses the beam search algorithm. If not specified, the default is `top_k_top_p` if `max_beam_width == 1`; otherwise, `beam_search` is used. When a Medusa model is used, the `medusa` decoding mode should be set. However, TensorRT-LLM detects a loaded Medusa model and overwrites the decoding mode to `medusa` with a warning. |
| `medusa_choices` | Optional. Specifies the Medusa choices tree, e.g. in the format "{0, 0, 0}, {0, 1}". By default, the mc_sim_7b_63 choices are used. |
| `lora_cache_optimal_adapter_size` | Optional (default=8). Optimal adapter size used to size cache pages. Typically, optimally sized adapters will fit exactly into 1 cache page. |
| `lora_cache_max_adapter_size` | Optional (default=64). Used to set the minimum size of a cache page. Pages must be at least large enough to fit a single module, single layer adapter_size `maxAdapterSize` row of weights. |
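
The table rows above describe fields in the tensorrt_llm model's config.pbtxt. For illustration only, the sketch below shows how `decoding_mode` and `medusa_choices` could be supplied using the same parameters-block structure that appears in the config.pbtxt diff further down; the values are placeholders for a hypothetical Medusa deployment, not defaults.

# Illustrative sketch only: placeholder values for a hypothetical Medusa deployment.
parameters: {
  key: "decoding_mode"
  value: {
    string_value: "medusa"
  }
}
parameters: {
  key: "medusa_choices"
  value: {
    string_value: "{0, 0, 0}, {0, 1}"
  }
}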

all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt

Lines changed: 12 additions & 0 deletions
@@ -350,6 +350,18 @@ parameters: {
    string_value: "${kv_cache_free_gpu_mem_fraction}"
  }
}
+parameters: {
+  key: "kv_cache_host_memory_bytes"
+  value: {
+    string_value: "${kv_cache_host_memory_bytes}"
+  }
+}
+parameters: {
+  key: "kv_cache_onboard_blocks"
+  value: {
+    string_value: "${kv_cache_onboard_blocks}"
+  }
+}
parameters: {
  key: "enable_trt_overlap"
  value: {
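
The two entries added above are template placeholders substituted at deployment time. As a hedged example (the size is illustrative, not a recommendation), a filled-in configuration that gives the KV cache a host-memory budget of about 45 GB and keeps block onboarding enabled could look like this:

# Example filled values; the size below is a placeholder chosen for illustration.
parameters: {
  key: "kv_cache_host_memory_bytes"
  value: {
    string_value: "45000000000"
  }
}
parameters: {
  key: "kv_cache_onboard_blocks"
  value: {
    string_value: "true"
  }
}

As the model_instance_state.cc change below shows, omitting either parameter only logs a warning: the backend falls back to no host cache memory (0 bytes) and to onboarding blocks (true).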

ci/L0_backend_trtllm/custom_metrics_verification_tests.py

Lines changed: 2 additions & 3 deletions
@@ -46,8 +46,7 @@
    "inflight_batcher_specific_metric=micro_batch_id": "MicroBatch ID",
    "inflight_batcher_specific_metric=generation_requests":
        "Generation Requests",
-   "inflight_batcher_specific_metric=terminated_requests":
-       "Terminated Requests",
+   "inflight_batcher_specific_metric=paused_requests": "Paused Requests",
    "v1_specific_metric=total_context_tokens": "Total Context Tokens",
    "v1_specific_metric=total_generation_tokens": "Total Generation Tokens",
    "v1_specific_metric=empty_generation_slots": "Empty Generation Slots",
@@ -109,7 +108,7 @@ def _base_test(self, stats_file, metrics_file, is_v1):
                    int(metrics[metric_key]))
            else:
                dt_log = datetime.strptime(stats[metric_key],
-                                          "%m-%d-%Y %H:%M:%S")
+                                          '%m-%d-%Y %H:%M:%S.%f')
                dt_curl = datetime.utcfromtimestamp(
                    int(metrics[metric_key]) // 1000000)
                difference = dt_log - dt_curl

ci/L0_backend_trtllm/generate_engines.sh

Lines changed: 0 additions & 25 deletions
@@ -55,26 +55,6 @@ function build_tensorrt_engine_inflight_batcher {
    cd ${BASE_DIR}
}

-function install_trt_llm {
-    # Install CMake
-    bash /opt/tritonserver/tensorrtllm_backend/tensorrt_llm/docker/common/install_cmake.sh
-    export PATH="/usr/local/cmake/bin:${PATH}"
-
-    # PyTorch needs to be built from source for aarch64
-    ARCH="$(uname -i)"
-    if [ "${ARCH}" = "aarch64" ]; then TORCH_INSTALL_TYPE="src_non_cxx11_abi"; \
-    else TORCH_INSTALL_TYPE="pypi"; fi && \
-    (cd /opt/tritonserver/tensorrtllm_backend/tensorrt_llm &&
-        bash docker/common/install_pytorch.sh $TORCH_INSTALL_TYPE &&
-        python3 ./scripts/build_wheel.py --trt_root="${TRT_ROOT}" &&
-        pip3 install ./build/tensorrt_llm*.whl)
-}
-
-# Install TRT LLM
-install_trt_llm
-
-# Install dependencies
-pip3 install -r ${TRTLLM_DIR}/requirements-dev.txt --extra-index-url https://pypi.ngc.nvidia.com
# Downgrade to legacy version to accommodate Triton CI runners
pip install pynvml==11.4.0

@@ -97,8 +77,3 @@ mv ${GPT_DIR}/inflight_*_gpu/ engines/
# Move the tokenizer into the CI directory
mkdir tokenizer
mv ${GPT_DIR}/gpt2/* tokenizer/
-
-# Now that the engines are generated, we should remove the
-# tensorrt_llm module to ensure the C++ backend tests are
-# not using it
-pip3 uninstall -y torch tensorrt_llm

ci/L0_backend_trtllm/test.sh

Lines changed: 33 additions & 31 deletions
@@ -306,6 +306,39 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do
    kill_server
    wait_for_server_terminated ${SERVER_PID[@]}

+    # World size must be 1 when using multi-model
+    if [ "${NUM_GPU}" == "0" ]; then
+        # Multi-model
+        SERVER_LOG="./${NUM_GPU}gpu_multi_model.log"
+        run_server "${SERVER_ARGS} --multi-model"
+        wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]}
+        if [ "$WAIT_RET" != "0" ]; then
+            # Cleanup
+            kill $SERVER_PID > /dev/null 2>&1 || true
+            echo -e "\n***\n*** Failed to start $SERVER\n***"
+            cat $SERVER_LOG
+            exit 1
+        fi
+        set -e
+
+        python3 ${TOOLS_DIR}/inflight_batcher_llm/end_to_end_test.py \
+            --max-input-len=500 \
+            --dataset=${DATASET}
+
+        if [ $? -ne 0 ]; then
+            cat $SERVER_LOG
+            echo -e "\n***\n*** Error executing inflight batching end-to-end test with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
+            kill_server
+            wait_for_server_terminated ${SERVER_PID[@]}
+            RET=1
+        fi
+        set +e
+
+        curl localhost:8002/metrics -o ${NUM_GPU}gpu_multi_model_metrics.out
+        kill_server
+        wait_for_server_terminated ${SERVER_PID[@]}
+    fi
+
    # inflight batching ON
    # streaming ON
    SERVER_LOG="./${NUM_GPU}gpu_IFB_streaming_server.log"

@@ -338,37 +371,6 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do
    kill_server
    wait_for_server_terminated ${SERVER_PID[@]}

-    # Multi-model
-    SERVER_LOG="./${NUM_GPU}gpu_multi_model.log"
-    run_server "${SERVER_ARGS} --multi-model"
-    wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]}
-    if [ "$WAIT_RET" != "0" ]; then
-        # Cleanup
-        kill $SERVER_PID > /dev/null 2>&1 || true
-        echo -e "\n***\n*** Failed to start $SERVER\n***"
-        cat $SERVER_LOG
-        exit 1
-    fi
-    set -e
-
-    set -e
-    python3 ${TOOLS_DIR}/inflight_batcher_llm/end_to_end_test.py \
-        --max-input-len=500 \
-        --dataset=${DATASET}
-
-    if [ $? -ne 0 ]; then
-        cat $SERVER_LOG
-        echo -e "\n***\n*** Error executing inflight batching end-to-end test with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
-        kill_server
-        wait_for_server_terminated ${SERVER_PID[@]}
-        RET=1
-    fi
-    set +e
-
-    curl localhost:8002/metrics -o ${NUM_GPU}gpu_IFB_no_stream_metrics.out
-    kill_server
-    wait_for_server_terminated ${SERVER_PID[@]}
-
done

# Verify TRT LLM statistics are being properly reported as custom metrics

inflight_batcher_llm/src/model_instance_state.cc

Lines changed: 25 additions & 0 deletions
@@ -210,6 +210,25 @@ ModelInstanceState::ModelInstanceState(
            "kv_cache_free_gpu_mem_fraction is not specified, will use default value of 0.9 or "
            "max_tokens_in_paged_kv_cache");
    }
+    std::optional<size_t> kvCacheHostCacheSize = std::nullopt;
+    try
+    {
+        kvCacheHostCacheSize = model_state_->GetParameter<size_t>("kv_cache_host_memory_bytes");
+    }
+    catch (std::exception const& e)
+    {
+        TLLM_LOG_WARNING("kv_cache_host_memory_bytes not set, defaulting to 0");
+    }
+    bool kvCacheOnboardBlocks = true;
+    try
+    {
+        kvCacheOnboardBlocks = model_state_->GetParameter<bool>("kv_cache_onboard_blocks");
+    }
+    catch (std::exception const& e)
+    {
+        // If parameter is not specified, just ignore
+        TLLM_LOG_WARNING("kv_cache_onboard_blocks not set, defaulting to true");
+    }

    bool enableTrtOverlap = false;
    try

@@ -288,6 +307,10 @@ ModelInstanceState::ModelInstanceState(
        {
            decodingMode = DecodingMode::BeamSearch();
        }
+        else if (decodingModeStr == "medusa")
+        {
+            decodingMode = DecodingMode::Medusa();
+        }
        else
        {
            throw std::runtime_error("");

@@ -368,6 +391,8 @@ ModelInstanceState::ModelInstanceState(
    optionalParams.kvCacheConfig.freeGpuMemoryFraction = kvCacheFreeGpuMemFraction;
    optionalParams.kvCacheConfig.maxAttentionWindow = maxAttentionWindow;
    optionalParams.kvCacheConfig.enableBlockReuse = enableKVCacheReuse;
+    optionalParams.kvCacheConfig.hostCacheSize = kvCacheHostCacheSize;
+    optionalParams.kvCacheConfig.onboardBlocks = kvCacheOnboardBlocks;
    optionalParams.enableTrtOverlap = enableTrtOverlap;
    optionalParams.normalizeLogProbs = normalizeLogProbs;
    optionalParams.enableChunkedContext = enableChunkedContext;

tensorrt_llm

Submodule tensorrt_llm updated 319 files

tools/version.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-d0c1a4fdcc47207417b7841e343b3c89ee512770
+b95f14b34c5b22194c87be2f30b07269ebb2f328
