Update TensorRT-LLM backend (triton-inference-server#60)
* Update src

* Update .gitmodules

* Update .pre-commit-config.yaml

* Update submodule
kaiyux authored Oct 30, 2023
1 parent 06f63fe commit 329937a
Showing 12 changed files with 40 additions and 20 deletions.
2 changes: 1 addition & 1 deletion .gitmodules
@@ -1,3 +1,3 @@
[submodule "tensorrt_llm"]
path = tensorrt_llm
url = git@github.com:NVIDIA/TensorRT-LLM.git
url = https://github.com/NVIDIA/TensorRT-LLM.git
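For an existing checkout, the switch to the HTTPS URL only takes effect after re-syncing the submodule; a minimal sketch, assuming the `tensorrt_llm` submodule is already initialized:

```bash
# Pick up the new URL from .gitmodules, then refresh the checkout
git submodule sync tensorrt_llm
git submodule update --init --recursive tensorrt_llm
```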
6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
@@ -40,3 +1,9 @@ repos:
rev: v0.6.10
hooks:
- id: cmake-format
- repo: https://github.com/codespell-project/codespell
rev: v2.2.4
hooks:
- id: codespell
args:
- --skip=".git,tensorrt_llm"
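To try the new codespell hook locally, something along these lines should work (assuming pre-commit is not yet installed in the environment):

```bash
pip install pre-commit
pre-commit run codespell --all-files   # run just the new hook across the repo
```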
2 changes: 1 addition & 1 deletion README.md
@@ -363,7 +363,7 @@ You might have to contact your cluster's administrator to help you customize the
### Kill the Triton server

```bash
pgrep tritonserver | xargs kill -9
pkill tritonserver
```

## Testing the TensorRT-LLM Backend
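Unlike the old `kill -9` pipeline, `pkill` matches processes by name and sends SIGTERM by default, giving the server a chance to shut down cleanly; if a hard kill is still needed, a forced variant (not part of this change) would be:

```bash
pkill -9 tritonserver   # SIGKILL, equivalent to the previous pgrep | xargs kill -9
```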
2 changes: 1 addition & 1 deletion all_models/gpt/tensorrt_llm/1/model.py
@@ -242,7 +242,7 @@ def execute(self, requests):
# response:
#
# pb_utils.InferenceResponse(
# output_tensors=..., TritonError("An error occured"))
# output_tensors=..., TritonError("An error occurred"))

inference_response = pb_utils.InferenceResponse(output_tensors)
else:
2 changes: 1 addition & 1 deletion dockerfile/Dockerfile.trt_llm_backend
@@ -8,7 +8,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends rapidjson-dev p
COPY requirements.txt /tmp/
RUN pip3 install -r /tmp/requirements.txt --extra-index-url https://pypi.ngc.nvidia.com

# Remove prevous TRT installation
# Remove previous TRT installation
# We didn't remove libnvinfer* here because tritonserver depends on the pre-installed libraries.
RUN apt-get remove --purge -y tensorrt*
RUN pip uninstall -y tensorrt
2 changes: 1 addition & 1 deletion inflight_batcher_llm/CMakeLists.txt
@@ -30,7 +30,7 @@ set(TRITON_BUILD

if(TRITON_BUILD)
set_ifndef(TRTLLM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tensorrt_llm)
# Install build time dependencies. This section is excuted during cmake
# Install build time dependencies. This section is executed during cmake
# configure time.
execute_process(
COMMAND bash -x ./tools/environment_setup.sh
10 changes: 6 additions & 4 deletions inflight_batcher_llm/src/libtensorrtllm.cc
@@ -858,7 +858,6 @@ class ModelInstanceState
packed.insert(
packed.end(), std::move_iterator(vpacked.begin()), std::move_iterator(vpacked.end()));
}
int64_t nWords1 = static_cast<int64_t>(packed.size());
bcast(packed, 0, COMM_WORLD);
}
}
@@ -1128,7 +1127,7 @@ class ModelInstanceState
TLLM_LOG_WARNING("max_num_sequences is not specified, will be set to the TRT engine max_batch_size");
}

std::optional<bool> enableTrtOverlap = std::nullopt;
bool enableTrtOverlap = true;
try
{
enableTrtOverlap = model_state_->GetParameter<bool>("enable_trt_overlap");
@@ -1139,8 +1138,11 @@
TLLM_LOG_WARNING("enable_trt_overlap is not specified, will be set to true");
}

TrtGptModelOptionalParams optionalParams(
maxNumSequences, maxTokensInPagedKvCache, kvCacheFreeGpuMemFraction, enableTrtOverlap);
TrtGptModelOptionalParams optionalParams;
optionalParams.maxNumSequences = maxNumSequences;
optionalParams.kvCacheConfig.maxTokens = maxTokensInPagedKvCache;
optionalParams.kvCacheConfig.freeGpuMemoryFraction = kvCacheFreeGpuMemFraction;
optionalParams.enableTrtOverlap = enableTrtOverlap;

mBatchManager = std::make_shared<GptManager>(
mModelPath, mTrtGptModelType, maxBeamWidth, schedulerPolicy,
26 changes: 19 additions & 7 deletions scripts/launch_triton_server.py
@@ -1,5 +1,6 @@
import argparse
import subprocess
import sys
from pathlib import Path


@@ -9,9 +10,18 @@ def parse_arguments():
type=int,
default=1,
help='world size, only support tensor parallelism now')
parser.add_argument('--tritonserver',
type=str,
default='/opt/tritonserver/bin/tritonserver')
parser.add_argument(
'--tritonserver',
type=str,
help='path to the tritonserver exe',
default='/opt/tritonserver/bin/tritonserver',
)
parser.add_argument(
'--force',
'-f',
action='store_true',
help='launch tritonserver regardless of other instances running')

path = str(Path(__file__).parent.absolute()) + '/../all_models/gpt'
parser.add_argument('--model_repo', type=str, default=path)
return parser.parse_args()
@@ -30,13 +40,15 @@ def get_cmd(world_size, tritonserver, model_repo):

if __name__ == '__main__':
args = parse_arguments()
res = subprocess.run(['pgrep', 'tritonserver'],
res = subprocess.run(['pgrep', '-r', 'R', 'tritonserver'],
capture_output=True,
encoding='utf-8')
if res.stdout:
pids = res.stdout.replace('\n', ' ').rstrip()
raise RuntimeError(
f'tritonserver process(es) already found with PID(s): {pids}.\n\tUse `kill {pids}` to stop them.'
)
msg = f'tritonserver process(es) already found with PID(s): {pids}.\n\tUse `kill {pids}` to stop them.'
if args.force:
print(msg, file=sys.stderr)
else:
raise RuntimeError(msg + ' Or use --force.')
cmd = get_cmd(int(args.world_size), args.tritonserver, args.model_repo)
subprocess.Popen(cmd)
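With the new `--force` flag, the launcher can be invoked roughly as follows (world size and model repository path are illustrative assumptions):

```bash
# Launch even if other tritonserver instances are already running
python3 scripts/launch_triton_server.py --world_size 2 \
    --model_repo all_models/gpt --force
```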
2 changes: 1 addition & 1 deletion tensorrt_llm
Submodule tensorrt_llm updated 43 files
+36 −38 benchmarks/cpp/gptManagerBenchmark.cpp
+46 −20 benchmarks/cpp/gptSessionBenchmark.cpp
+45 −0 cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h
+9 −3 cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
+11 −39 cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h
+5 −5 cpp/include/tensorrt_llm/runtime/gptDecoderBatch.h
+61 −47 cpp/include/tensorrt_llm/runtime/gptSession.h
+1 −1 cpp/include/tensorrt_llm/runtime/iGptDecoderBatch.h
+7 −5 cpp/include/tensorrt_llm/runtime/iStatefulGptDecoder.h
+6 −0 cpp/include/tensorrt_llm/runtime/worldConfig.h
+2 −2 cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a
+2 −2 cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a
+3 −3 cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt
+2 −2 cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a
+2 −2 cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a
+16 −9 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h
+11 −1 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttentionUtils.h
+38 −43 cpp/tensorrt_llm/runtime/gptDecoder.cpp
+2 −5 cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp
+246 −301 cpp/tensorrt_llm/runtime/gptSession.cpp
+18 −18 cpp/tensorrt_llm/runtime/runtimeBuffers.cpp
+10 −4 cpp/tensorrt_llm/runtime/runtimeBuffers.h
+1 −8 cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp
+1 −1 cpp/tensorrt_llm/runtime/statefulGptDecoder.h
+183 −92 cpp/tests/resources/scripts/test_cpp.py
+14 −11 cpp/tests/runtime/gptSessionTest.cpp
+3 −0 docker/Makefile
+9 −1 docker/common/install_base.sh
+16 −4 docs/source/batch_manager.md
+0 −186 examples/chatglm6b/convert.py
+0 −25 examples/chatglm6b/exportLM.py
+0 −212 examples/chatglm6b/hf_chatglm6b_convert.py
+0 −1,552 examples/chatglm6b/modeling_chatglm.py
+5 −1 examples/falcon/build.py
+6 −1 examples/gpt/build.py
+3 −0 examples/gpt/run.py
+3 −0 examples/gpt/summarize.py
+5 −1 examples/gptj/build.py
+5 −1 examples/llama/build.py
+1 −1 examples/llama/summarize.py
+5 −1 examples/mpt/build.py
+2 −4 tensorrt_llm/runtime/session.py
+69 −56 tests/attention/test_gpt_attention.py
2 changes: 1 addition & 1 deletion tools/environment_setup.sh
@@ -33,7 +33,7 @@ git lfs install

pip3 install -r requirements.txt --extra-index-url https://pypi.ngc.nvidia.com

# Remove prevous TRT installation
# Remove previous TRT installation
apt-get remove --purge -y tensorrt* libnvinfer*
pip uninstall -y tensorrt

2 changes: 1 addition & 1 deletion tools/fill_template.py
@@ -27,7 +27,7 @@ def main(file_path, substitutions, in_place):
parser.add_argument(
"substitutions",
help=
"substitions to perform, in the format variable_name_1:value_1,variable_name_2:value_2..."
"substitutions to perform, in the format variable_name_1:value_1,variable_name_2:value_2..."
)
parser.add_argument("--in_place",
"-i",
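For context, the corrected help text describes invocations of roughly this shape (the file name and variables below are hypothetical):

```bash
python3 tools/fill_template.py config.pbtxt batch_size:8,engine_dir:/engines/gpt --in_place
```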
2 changes: 1 addition & 1 deletion tools/gen_trtllm_dockerfile.py
@@ -33,7 +33,7 @@

def install_new_version_of_TRT(clone_repo=False, trtllm_be_repo_tag="main"):
df = """
# Remove prevous TRT installation
# Remove previous TRT installation
RUN apt-get remove --purge -y tensorrt* libnvinfer*
RUN pip uninstall -y tensorrt
