
Commit

Update TensorRT-LLM backend (triton-inference-server#301)
* Update TensorRT-LLM backend
kaiyux authored Jan 16, 2024
1 parent a653f76 commit cad2233
Showing 7 changed files with 13 additions and 13 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -220,7 +220,7 @@ The following table shows the fields that may need to be modified before deployment:
| `max_attention_window_size` | Optional (default=max_sequence_length). When using techniques like sliding window attention, the maximum number of tokens that are attended to generate one token. The default attends to all tokens in the sequence. |
| `kv_cache_free_gpu_mem_fraction` | Optional (default=0.9). Set to a number between 0 and 1 to indicate the maximum fraction of GPU memory (after loading the model) that may be used for KV cache.|
| `max_num_sequences` | Optional (default=`max_batch_size` if `enable_trt_overlap` is `false` and to `2 * max_batch_size` if `enable_trt_overlap` is `true`, where `max_batch_size` is the TRT engine maximum batch size). Maximum number of sequences that the in-flight batching scheme can maintain state for.
-| `enable_trt_overlap` | Optional (default=`true`). Set to `true` to partition available requests into 2 'microbatches' that can be run concurrently to hide exposed CPU runtime |
+| `enable_trt_overlap` | Optional (default=`false`). Set to `true` to partition available requests into 2 'microbatches' that can be run concurrently to hide exposed CPU runtime |
| `exclude_input_in_output` | Optional (default=`false`). Set to `true` to only return completion tokens in a response. Set to `false` to return the prompt tokens concatenated with the generated tokens |
| `normalize_log_probs` | Optional (default=`true`). Set to `false` to skip normalization of `output_log_probs` |
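
For reference, each of these fields is supplied as a string-valued entry in the `tensorrt_llm` model's `config.pbtxt`. A minimal sketch for one of them, `kv_cache_free_gpu_mem_fraction`, following the `parameters` block style used elsewhere in this backend's docs (the `0.9` shown simply restates the documented default, not a value from this commit):

```
parameters: {
  key: "kv_cache_free_gpu_mem_fraction"
  value: {
    string_value: "0.9"
  }
}
```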

2 changes: 1 addition & 1 deletion dockerfile/Dockerfile.trt_llm_backend
@@ -1,5 +1,5 @@
ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver
-ARG BASE_TAG=23.10-py3
+ARG BASE_TAG=23.12-py3

FROM ${BASE_IMAGE}:${BASE_TAG} as base
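
With the base tag bumped to the 23.12 Triton container, the backend image builds the same way as before; a hedged sketch (the `-t triton_trt_llm` image name is illustrative, not taken from this commit):

```bash
# Build with the default base image and tag baked into the Dockerfile (now 23.12-py3)
docker build -f dockerfile/Dockerfile.trt_llm_backend -t triton_trt_llm .

# Or override the Triton base tag explicitly through the existing build argument
docker build -f dockerfile/Dockerfile.trt_llm_backend --build-arg BASE_TAG=23.12-py3 -t triton_trt_llm .
```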

12 changes: 6 additions & 6 deletions inflight_batcher_llm/README.md
@@ -88,25 +88,25 @@ parameters: {
}
```

-By default, in-flight batching will try to overlap the execution of batches of
+In-flight batching is able to overlap the execution of batches of
requests. It may have a negative impact on performance when the number of
-requests is too small. To disable that feature, set the `enable_trt_overlap`
-parameter to `False` in the `config.pbtxt` file:
+requests is too small. To enable that feature, set the `enable_trt_overlap`
+parameter to `True` in the `config.pbtxt` file:

```
parameters: {
key: "enable_trt_overlap"
value: {
string_value: "False"
string_value: "True"
}
}
```

-Or, equivalently, add `enable_trt_overlap:False` to the invocation of the
+Or, equivalently, add `enable_trt_overlap:True` to the invocation of the
`fill_template.py` tool:

```bash
-python3 tools/fill_template.py -i all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt "enable_trt_overlap:False"
+python3 tools/fill_template.py -i all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt "enable_trt_overlap:True"
```

To reuse previously computed KV cache values (e.g. for system prompt), set `enable_kv_cache_reuse`
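
As a side note on the `fill_template.py` invocation above: the tool also accepts several `key:value` substitutions in one comma-separated argument, so related flags can be filled in a single pass. A sketch under that assumption, reusing parameter names from the README table updated in this commit:

```bash
python3 tools/fill_template.py -i all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt \
    "enable_trt_overlap:True,exclude_input_in_output:False,normalize_log_probs:True"
```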
4 changes: 2 additions & 2 deletions inflight_batcher_llm/src/model_instance_state.cc
@@ -174,15 +174,15 @@ ModelInstanceState::ModelInstanceState(ModelState* model_state, TRITONBACKEND_Mo
TLLM_LOG_WARNING("max_num_sequences is not specified, will be set to the TRT engine max_batch_size");
}

bool enableTrtOverlap = true;
bool enableTrtOverlap = false;
try
{
enableTrtOverlap = model_state_->GetParameter<bool>("enable_trt_overlap");
}
catch (const std::exception& e)
{
// If parameter is not specified, just ignore
TLLM_LOG_WARNING("enable_trt_overlap is not specified, will be set to true");
TLLM_LOG_WARNING("enable_trt_overlap is not specified, will be set to false");
}

bool normalizeLogProbs = true;
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,6 +1,6 @@
regex
fire
tritonclient[all]
-transformers==4.31.0
+transformers==4.36.1
pandas
tabulate
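
To pick up the new `transformers` pin locally, the usual install flow applies (the virtual-environment step is optional and purely illustrative):

```bash
python3 -m venv venv && source venv/bin/activate  # optional isolation, illustrative
pip install -r requirements.txt                   # installs transformers==4.36.1 along with the rest
```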
2 changes: 1 addition & 1 deletion tensorrt_llm
Submodule tensorrt_llm updated 255 files
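
Because this commit also moves the `tensorrt_llm` submodule pointer, existing checkouts need a submodule sync after pulling it; this is plain git usage rather than anything specific to this backend:

```bash
git pull
git submodule update --init --recursive  # brings tensorrt_llm to the newly pinned revision
```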
2 changes: 1 addition & 1 deletion tools/version.txt
@@ -1 +1 @@
-77a564a261cdb68c9091ac04c87d5c704da48da5
+ad7d4adac6bebead80be01388b94d1f57a50245a
