Update TensorRT-LLM backend (triton-inference-server#60)
* Update src

* Update .gitmodules

* Update .pre-commit-config.yaml

* Update submodule
kaiyux authored Oct 30, 2023
1 parent 06f63fe commit 329937a
Showing 12 changed files with 40 additions and 20 deletions.
2 changes: 1 addition & 1 deletion .gitmodules
@@ -1,3 +1,3 @@
[submodule "tensorrt_llm"]
path = tensorrt_llm
url = git@github.com:NVIDIA/TensorRT-LLM.git
url = https://github.com/NVIDIA/TensorRT-LLM.git
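For an existing checkout, the switch to the HTTPS URL only takes effect after re-syncing the submodule; a minimal sketch, assuming the `tensorrt_llm` submodule is already initialized:

```bash
# Pick up the new URL from .gitmodules, then refresh the checkout
git submodule sync tensorrt_llm
git submodule update --init --recursive tensorrt_llm
```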
6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
@@ -40,3 +1,9 @@ repos:
rev: v0.6.10
hooks:
- id: cmake-format
- repo: https://github.com/codespell-project/codespell
rev: v2.2.4
hooks:
- id: codespell
args:
- --skip=".git,tensorrt_llm"
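To try the new codespell hook locally, something along these lines should work (assuming pre-commit is not yet installed in the environment):

```bash
pip install pre-commit
pre-commit run codespell --all-files   # run just the new hook across the repo
```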
2 changes: 1 addition & 1 deletion README.md
@@ -363,7 +363,7 @@ You might have to contact your cluster's administrator to help you customize the
### Kill the Triton server

```bash
pgrep tritonserver | xargs kill -9
pkill tritonserver
```

## Testing the TensorRT-LLM Backend
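Unlike the old `kill -9` pipeline, `pkill` matches processes by name and sends SIGTERM by default, giving the server a chance to shut down cleanly; if a hard kill is still needed, a forced variant (not part of this change) would be:

```bash
pkill -9 tritonserver   # SIGKILL, equivalent to the previous pgrep | xargs kill -9
```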
2 changes: 1 addition & 1 deletion all_models/gpt/tensorrt_llm/1/model.py
@@ -242,7 +242,7 @@ def execute(self, requests):
# response:
#
# pb_utils.InferenceResponse(
# output_tensors=..., TritonError("An error occured"))
# output_tensors=..., TritonError("An error occurred"))

inference_response = pb_utils.InferenceResponse(output_tensors)
else:
2 changes: 1 addition & 1 deletion dockerfile/Dockerfile.trt_llm_backend
@@ -8,7 +8,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends rapidjson-dev p
COPY requirements.txt /tmp/
RUN pip3 install -r /tmp/requirements.txt --extra-index-url https://pypi.ngc.nvidia.com

# Remove prevous TRT installation
# Remove previous TRT installation
# We didn't remove libnvinfer* here because tritonserver depends on the pre-installed libraries.
RUN apt-get remove --purge -y tensorrt*
RUN pip uninstall -y tensorrt
2 changes: 1 addition & 1 deletion inflight_batcher_llm/CMakeLists.txt
@@ -30,7 +30,7 @@ set(TRITON_BUILD

if(TRITON_BUILD)
set_ifndef(TRTLLM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tensorrt_llm)
# Install build time dependencies. This section is excuted during cmake
# Install build time dependencies. This section is executed during cmake
# configure time.
execute_process(
COMMAND bash -x ./tools/environment_setup.sh
10 changes: 6 additions & 4 deletions inflight_batcher_llm/src/libtensorrtllm.cc
@@ -858,7 +858,6 @@ class ModelInstanceState
packed.insert(
packed.end(), std::move_iterator(vpacked.begin()), std::move_iterator(vpacked.end()));
}
int64_t nWords1 = static_cast<int64_t>(packed.size());
bcast(packed, 0, COMM_WORLD);
}
}
@@ -1128,7 +1127,7 @@ class ModelInstanceState
TLLM_LOG_WARNING("max_num_sequences is not specified, will be set to the TRT engine max_batch_size");
}

std::optional<bool> enableTrtOverlap = std::nullopt;
bool enableTrtOverlap = true;
try
{
enableTrtOverlap = model_state_->GetParameter<bool>("enable_trt_overlap");
@@ -1139,8 +1138,11 @@
TLLM_LOG_WARNING("enable_trt_overlap is not specified, will be set to true");
}

TrtGptModelOptionalParams optionalParams(
maxNumSequences, maxTokensInPagedKvCache, kvCacheFreeGpuMemFraction, enableTrtOverlap);
TrtGptModelOptionalParams optionalParams;
optionalParams.maxNumSequences = maxNumSequences;
optionalParams.kvCacheConfig.maxTokens = maxTokensInPagedKvCache;
optionalParams.kvCacheConfig.freeGpuMemoryFraction = kvCacheFreeGpuMemFraction;
optionalParams.enableTrtOverlap = enableTrtOverlap;

mBatchManager = std::make_shared<GptManager>(
mModelPath, mTrtGptModelType, maxBeamWidth, schedulerPolicy,
26 changes: 19 additions & 7 deletions scripts/launch_triton_server.py
@@ -1,5 +1,6 @@
import argparse
import subprocess
import sys
from pathlib import Path


@@ -9,9 +10,18 @@ def parse_arguments():
type=int,
default=1,
help='world size, only support tensor parallelism now')
parser.add_argument('--tritonserver',
type=str,
default='/opt/tritonserver/bin/tritonserver')
parser.add_argument(
'--tritonserver',
type=str,
help='path to the tritonserver exe',
default='/opt/tritonserver/bin/tritonserver',
)
parser.add_argument(
'--force',
'-f',
action='store_true',
help='launch tritonserver regardless of other instances running')

path = str(Path(__file__).parent.absolute()) + '/../all_models/gpt'
parser.add_argument('--model_repo', type=str, default=path)
return parser.parse_args()
@@ -30,13 +40,15 @@ def get_cmd(world_size, tritonserver, model_repo):

if __name__ == '__main__':
args = parse_arguments()
res = subprocess.run(['pgrep', 'tritonserver'],
res = subprocess.run(['pgrep', '-r', 'R', 'tritonserver'],
capture_output=True,
encoding='utf-8')
if res.stdout:
pids = res.stdout.replace('\n', ' ').rstrip()
raise RuntimeError(
f'tritonserver process(es) already found with PID(s): {pids}.\n\tUse `kill {pids}` to stop them.'
)
msg = f'tritonserver process(es) already found with PID(s): {pids}.\n\tUse `kill {pids}` to stop them.'
if args.force:
print(msg, file=sys.stderr)
else:
raise RuntimeError(msg + ' Or use --force.')
cmd = get_cmd(int(args.world_size), args.tritonserver, args.model_repo)
subprocess.Popen(cmd)
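With the new `--force` flag, the launcher can be invoked roughly as follows (world size and model repository path are illustrative assumptions):

```bash
# Launch even if other tritonserver instances are already running
python3 scripts/launch_triton_server.py --world_size 2 \
    --model_repo all_models/gpt --force
```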
2 changes: 1 addition & 1 deletion tensorrt_llm
Submodule tensorrt_llm updated 43 files
+36 −38 benchmarks/cpp/gptManagerBenchmark.cpp
+46 −20 benchmarks/cpp/gptSessionBenchmark.cpp
+45 −0 cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h
+9 −3 cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
+11 −39 cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h
+5 −5 cpp/include/tensorrt_llm/runtime/gptDecoderBatch.h
+61 −47 cpp/include/tensorrt_llm/runtime/gptSession.h
+1 −1 cpp/include/tensorrt_llm/runtime/iGptDecoderBatch.h
+7 −5 cpp/include/tensorrt_llm/runtime/iStatefulGptDecoder.h
+6 −0 cpp/include/tensorrt_llm/runtime/worldConfig.h
+2 −2 cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a
+2 −2 cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a
+3 −3 cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt
+2 −2 cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a
+2 −2 cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a
+16 −9 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h
+11 −1 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttentionUtils.h
+38 −43 cpp/tensorrt_llm/runtime/gptDecoder.cpp
+2 −5 cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp
+246 −301 cpp/tensorrt_llm/runtime/gptSession.cpp
+18 −18 cpp/tensorrt_llm/runtime/runtimeBuffers.cpp
+10 −4 cpp/tensorrt_llm/runtime/runtimeBuffers.h
+1 −8 cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp
+1 −1 cpp/tensorrt_llm/runtime/statefulGptDecoder.h
+183 −92 cpp/tests/resources/scripts/test_cpp.py
+14 −11 cpp/tests/runtime/gptSessionTest.cpp
+3 −0 docker/Makefile
+9 −1 docker/common/install_base.sh
+16 −4 docs/source/batch_manager.md
+0 −186 examples/chatglm6b/convert.py
+0 −25 examples/chatglm6b/exportLM.py
+0 −212 examples/chatglm6b/hf_chatglm6b_convert.py
+0 −1,552 examples/chatglm6b/modeling_chatglm.py
+5 −1 examples/falcon/build.py
+6 −1 examples/gpt/build.py
+3 −0 examples/gpt/run.py
+3 −0 examples/gpt/summarize.py
+5 −1 examples/gptj/build.py
+5 −1 examples/llama/build.py
+1 −1 examples/llama/summarize.py
+5 −1 examples/mpt/build.py
+2 −4 tensorrt_llm/runtime/session.py
+69 −56 tests/attention/test_gpt_attention.py
2 changes: 1 addition & 1 deletion tools/environment_setup.sh
@@ -33,7 +33,7 @@ git lfs install

pip3 install -r requirements.txt --extra-index-url https://pypi.ngc.nvidia.com

# Remove prevous TRT installation
# Remove previous TRT installation
apt-get remove --purge -y tensorrt* libnvinfer*
pip uninstall -y tensorrt

2 changes: 1 addition & 1 deletion tools/fill_template.py
@@ -27,7 +27,7 @@ def main(file_path, substitutions, in_place):
parser.add_argument(
"substitutions",
help=
"substitions to perform, in the format variable_name_1:value_1,variable_name_2:value_2..."
"substitutions to perform, in the format variable_name_1:value_1,variable_name_2:value_2..."
)
parser.add_argument("--in_place",
"-i",
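For context, the corrected help text describes invocations of roughly this shape (the file name and variables below are hypothetical):

```bash
python3 tools/fill_template.py config.pbtxt batch_size:8,engine_dir:/engines/gpt --in_place
```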
2 changes: 1 addition & 1 deletion tools/gen_trtllm_dockerfile.py
@@ -33,7 +33,7 @@

def install_new_version_of_TRT(clone_repo=False, trtllm_be_repo_tag="main"):
df = """
# Remove prevous TRT installation
# Remove previous TRT installation
RUN apt-get remove --purge -y tensorrt* libnvinfer*
RUN pip uninstall -y tensorrt
