diff --git a/.gitmodules b/.gitmodules
index 0e5eaa77..70ad46f3 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
 [submodule "tensorrt_llm"]
 	path = tensorrt_llm
-	url = git@github.com:NVIDIA/TensorRT-LLM.git
+	url = https://github.com/NVIDIA/TensorRT-LLM.git
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9ee1c078..4fffede7 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -40,3 +40,9 @@ repos:
   rev: v0.6.10
   hooks:
   - id: cmake-format
+- repo: https://github.com/codespell-project/codespell
+  rev: v2.2.4
+  hooks:
+  - id: codespell
+    args:
+      - --skip=".git,tensorrt_llm"
diff --git a/README.md b/README.md
index a899fd77..53583bf7 100644
--- a/README.md
+++ b/README.md
@@ -363,7 +363,7 @@ You might have to contact your cluster's administrator to help you customize the
 ### Kill the Triton server
 
 ```bash
-pgrep tritonserver | xargs kill -9
+pkill tritonserver
 ```
 
 ## Testing the TensorRT-LLM Backend
diff --git a/all_models/gpt/tensorrt_llm/1/model.py b/all_models/gpt/tensorrt_llm/1/model.py
index 3d036efd..55cab37b 100644
--- a/all_models/gpt/tensorrt_llm/1/model.py
+++ b/all_models/gpt/tensorrt_llm/1/model.py
@@ -242,7 +242,7 @@ def execute(self, requests):
             # response:
             #
             # pb_utils.InferenceResponse(
-            #    output_tensors=..., TritonError("An error occured"))
+            #    output_tensors=..., TritonError("An error occurred"))
 
             inference_response = pb_utils.InferenceResponse(output_tensors)
         else:
diff --git a/dockerfile/Dockerfile.trt_llm_backend b/dockerfile/Dockerfile.trt_llm_backend
index 7826a746..8fc565fc 100644
--- a/dockerfile/Dockerfile.trt_llm_backend
+++ b/dockerfile/Dockerfile.trt_llm_backend
@@ -8,7 +8,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends rapidjson-dev p
 COPY requirements.txt /tmp/
 RUN pip3 install -r /tmp/requirements.txt --extra-index-url https://pypi.ngc.nvidia.com
 
-# Remove prevous TRT installation
+# Remove previous TRT installation
 # We didn't remove libnvinfer* here because tritonserver depends on the pre-installed libraries.
 RUN apt-get remove --purge -y tensorrt*
 RUN pip uninstall -y tensorrt
diff --git a/inflight_batcher_llm/CMakeLists.txt b/inflight_batcher_llm/CMakeLists.txt
index d8b2c74e..f08e8015 100644
--- a/inflight_batcher_llm/CMakeLists.txt
+++ b/inflight_batcher_llm/CMakeLists.txt
@@ -30,7 +30,7 @@ set(TRITON_BUILD
 if(TRITON_BUILD)
   set_ifndef(TRTLLM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tensorrt_llm)
 
-  # Install build time dependencies. This section is excuted during cmake
+  # Install build time dependencies. This section is executed during cmake
   # configure time.
   execute_process(
     COMMAND bash -x ./tools/environment_setup.sh
diff --git a/inflight_batcher_llm/src/libtensorrtllm.cc b/inflight_batcher_llm/src/libtensorrtllm.cc
index 39dc133b..c68b940d 100644
--- a/inflight_batcher_llm/src/libtensorrtllm.cc
+++ b/inflight_batcher_llm/src/libtensorrtllm.cc
@@ -858,7 +858,6 @@ class ModelInstanceState
             packed.insert(
                 packed.end(), std::move_iterator(vpacked.begin()), std::move_iterator(vpacked.end()));
         }
-        int64_t nWords1 = static_cast<int64_t>(packed.size());
         bcast(packed, 0, COMM_WORLD);
     }
 }
@@ -1128,7 +1127,7 @@ class ModelInstanceState
         TLLM_LOG_WARNING("max_num_sequences is not specified, will be set to the TRT engine max_batch_size");
     }
 
-    std::optional<bool> enableTrtOverlap = std::nullopt;
+    bool enableTrtOverlap = true;
     try
     {
         enableTrtOverlap = model_state_->GetParameter<bool>("enable_trt_overlap");
@@ -1139,8 +1138,11 @@ class ModelInstanceState
         TLLM_LOG_WARNING("enable_trt_overlap is not specified, will be set to true");
     }
 
-    TrtGptModelOptionalParams optionalParams(
-        maxNumSequences, maxTokensInPagedKvCache, kvCacheFreeGpuMemFraction, enableTrtOverlap);
+    TrtGptModelOptionalParams optionalParams;
+    optionalParams.maxNumSequences = maxNumSequences;
+    optionalParams.kvCacheConfig.maxTokens = maxTokensInPagedKvCache;
+    optionalParams.kvCacheConfig.freeGpuMemoryFraction = kvCacheFreeGpuMemFraction;
+    optionalParams.enableTrtOverlap = enableTrtOverlap;
 
     mBatchManager = std::make_shared<GptManager>(
         mModelPath, mTrtGptModelType, maxBeamWidth, schedulerPolicy,
diff --git a/scripts/launch_triton_server.py b/scripts/launch_triton_server.py
index 2efec7e8..a2cc2ac7 100644
--- a/scripts/launch_triton_server.py
+++ b/scripts/launch_triton_server.py
@@ -1,5 +1,6 @@
 import argparse
 import subprocess
+import sys
 from pathlib import Path
 
 
@@ -9,9 +10,18 @@ def parse_arguments():
                         type=int,
                         default=1,
                         help='world size, only support tensor parallelism now')
-    parser.add_argument('--tritonserver',
-                        type=str,
-                        default='/opt/tritonserver/bin/tritonserver')
+    parser.add_argument(
+        '--tritonserver',
+        type=str,
+        help='path to the tritonserver exe',
+        default='/opt/tritonserver/bin/tritonserver',
+    )
+    parser.add_argument(
+        '--force',
+        '-f',
+        action='store_true',
+        help='launch tritonserver regardless of other instances running')
+
     path = str(Path(__file__).parent.absolute()) + '/../all_models/gpt'
     parser.add_argument('--model_repo', type=str, default=path)
     return parser.parse_args()
@@ -30,13 +40,15 @@
 
 if __name__ == '__main__':
     args = parse_arguments()
-    res = subprocess.run(['pgrep', 'tritonserver'],
+    res = subprocess.run(['pgrep', '-r', 'R', 'tritonserver'],
                          capture_output=True,
                          encoding='utf-8')
     if res.stdout:
         pids = res.stdout.replace('\n', ' ').rstrip()
-        raise RuntimeError(
-            f'tritonserver process(es) already found with PID(s): {pids}.\n\tUse `kill {pids}` to stop them.'
-        )
+        msg = f'tritonserver process(es) already found with PID(s): {pids}.\n\tUse `kill {pids}` to stop them.'
+        if args.force:
+            print(msg, file=sys.stderr)
+        else:
+            raise RuntimeError(msg + ' Or use --force.')
     cmd = get_cmd(int(args.world_size), args.tritonserver, args.model_repo)
     subprocess.Popen(cmd)
diff --git a/tensorrt_llm b/tensorrt_llm
index d8b408e6..4de32a86 160000
--- a/tensorrt_llm
+++ b/tensorrt_llm
@@ -1 +1 @@
-Subproject commit d8b408e6dcc1d45982a8b94399cd74b78f80befa
+Subproject commit 4de32a86ae92bc49a7ec17c00ec2f2d03663c198
diff --git a/tools/environment_setup.sh b/tools/environment_setup.sh
index 4367dbe1..d799f950 100644
--- a/tools/environment_setup.sh
+++ b/tools/environment_setup.sh
@@ -33,7 +33,7 @@
 git lfs install
 
 pip3 install -r requirements.txt --extra-index-url https://pypi.ngc.nvidia.com
 
-# Remove prevous TRT installation
+# Remove previous TRT installation
 apt-get remove --purge -y tensorrt* libnvinfer*
 pip uninstall -y tensorrt
diff --git a/tools/fill_template.py b/tools/fill_template.py
index cb298b31..0524f9ef 100644
--- a/tools/fill_template.py
+++ b/tools/fill_template.py
@@ -27,7 +27,7 @@ def main(file_path, substitutions, in_place):
     parser.add_argument(
         "substitutions",
         help=
-        "substitions to perform, in the format variable_name_1:value_1,variable_name_2:value_2..."
+        "substitutions to perform, in the format variable_name_1:value_1,variable_name_2:value_2..."
    )
    parser.add_argument("--in_place",
                        "-i",
diff --git a/tools/gen_trtllm_dockerfile.py b/tools/gen_trtllm_dockerfile.py
index 922fe11e..9cc9032d 100644
--- a/tools/gen_trtllm_dockerfile.py
+++ b/tools/gen_trtllm_dockerfile.py
@@ -33,7 +33,7 @@ def install_new_version_of_TRT(clone_repo=False, trtllm_be_repo_tag="main"):
 
     df = """
-# Remove prevous TRT installation
+# Remove previous TRT installation
 RUN apt-get remove --purge -y tensorrt* libnvinfer*
 RUN pip uninstall -y tensorrt
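
As a usage sketch (not part of the patch): the `--force` flag added to `scripts/launch_triton_server.py` above turns the pre-launch PID check from a hard error into a stderr warning. The invocations below use only flags and defaults visible in the diff.

```bash
# Default behavior: raise a RuntimeError and refuse to launch if pgrep
# finds a tritonserver process already in the running state.
python3 scripts/launch_triton_server.py --world_size 1

# With --force (or -f): print the PID warning to stderr and launch anyway.
python3 scripts/launch_triton_server.py --world_size 1 --force
```

The README change follows the same spirit on shutdown: `pkill tritonserver` sends SIGTERM by default, giving the server a chance to exit cleanly, whereas the old `pgrep tritonserver | xargs kill -9` delivered an uncatchable SIGKILL.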