diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
new file mode 100644
index 00000000..21518a54
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -0,0 +1,117 @@
+name: "Bug Report"
+description: Submit a bug report to help us improve the TensorRT-LLM backend
+labels: [ "bug" ]
+body:
+  - type: textarea
+    id: system-info
+    attributes:
+      label: System Info
+      description: Please share your system info with us.
+      placeholder: |
+        - CPU architecture (e.g., x86_64, aarch64)
+        - CPU/Host memory size (if known)
+        - GPU properties
+          - GPU name (e.g., NVIDIA H100, NVIDIA A100, NVIDIA L40S)
+          - GPU memory size (if known)
+          - Clock frequencies used (if applicable)
+        - Libraries
+          - TensorRT-LLM branch or tag (e.g., main, v0.7.1)
+          - TensorRT-LLM commit (if known)
+          - Versions of TensorRT, AMMO, CUDA, cuBLAS, etc. used
+          - Container used (if running TensorRT-LLM in a container)
+          - NVIDIA driver version
+        - OS (Ubuntu 22.04, CentOS 7, Windows 10)
+        - Docker image version
+        - Any other information that may be useful in reproducing the bug
+    validations:
+      required: true
+
+  - type: textarea
+    id: who-can-help
+    attributes:
+      label: Who can help?
+      description: |
+        To expedite the response to your issue, it would be helpful if you could identify the appropriate person
+        to tag using the **@** symbol. Here is a general guideline on **whom to tag**.
+
+        Rest assured that all issues are reviewed by the core maintainers. If you are unsure about whom to tag,
+        you can leave it blank, and a core maintainer will make sure to involve the appropriate person.
+
+        Please tag fewer than 3 people.
+
+        Quantization: @Tracin
+
+        Documentation: @juney-nvidia
+
+        Feature request: @ncomly-nvidia
+
+        Performance: @kaiyux
+
+        Others: @byshiue @schetlur-nv
+
+      placeholder: "@Username ..."
+
+  - type: checkboxes
+    id: information-scripts-examples
+    attributes:
+      label: Information
+      description: 'The problem arises when using:'
+      options:
+        - label: "The official example scripts"
+        - label: "My own modified scripts"
+
+  - type: checkboxes
+    id: information-tasks
+    attributes:
+      label: Tasks
+      description: "The tasks I am working on are:"
+      options:
+        - label: "An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)"
+        - label: "My own task or dataset (give details below)"
+
+  - type: textarea
+    id: reproduction
+    validations:
+      required: true
+    attributes:
+      label: Reproduction
+      description: |
+        Kindly share a code example that demonstrates the issue you encountered. It is recommended to provide a code snippet directly.
+        Additionally, if you have any error messages or stack traces related to the problem, please include them here.
+
+        Remember to use code tags to properly format your code. You can refer to the
+        link https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting for guidance on code formatting.
+
+        Please refrain from using screenshots, as they can be difficult to read and prevent others from copying and pasting your code.
+        It would be most helpful if we could reproduce your issue by simply copying and pasting your scripts and code.
+
+      placeholder: |
+        Steps to reproduce the behavior:
+
+        1.
+        2.
+        3.
+
+  - type: textarea
+    id: expected-behavior
+    validations:
+      required: true
+    attributes:
+      label: Expected behavior
+      description: "Provide a brief summary of the expected behavior of the software. Provide output files or examples if possible."
+
+  - type: textarea
+    id: actual-behavior
+    validations:
+      required: true
+    attributes:
+      label: Actual behavior
+      description: "Describe the actual behavior of the software and how it deviates from the expected behavior. Provide output files or examples if possible."
+
+  - type: textarea
+    id: additional-notes
+    validations:
+      required: true
+    attributes:
+      label: Additional notes
+      description: "Provide any additional context here you think might be useful for the TensorRT-LLM team to help debug this issue (such as experiments done, potential things to investigate)."
diff --git a/README.md b/README.md
index 799d60cb..74154082 100644
--- a/README.md
+++ b/README.md
@@ -219,7 +219,6 @@ The following table shows the fields that may to be modified before deployment:
 | `max_tokens_in_paged_kv_cache` | Optional (default=unspecified). The maximum size of the KV cache in number of tokens. If unspecified, value is interpreted as 'infinite'. KV cache allocation is the min of max_tokens_in_paged_kv_cache and value derived from kv_cache_free_gpu_mem_fraction below. |
 | `max_attention_window_size` | Optional (default=max_sequence_length). When using techniques like sliding window attention, the maximum number of tokens that are attended to generate one token. Defaults attends to all tokens in sequence. |
 | `kv_cache_free_gpu_mem_fraction` | Optional (default=0.9). Set to a number between 0 and 1 to indicate the maximum fraction of GPU memory (after loading the model) that may be used for KV cache.|
-| `max_num_sequences` | Optional (default=`max_batch_size` if `enable_trt_overlap` is `false` and to `2 * max_batch_size` if `enable_trt_overlap` is `true`, where `max_batch_size` is the TRT engine maximum batch size). Maximum number of sequences that the in-flight batching scheme can maintain state for. |
 | `enable_trt_overlap` | Optional (default=`false`). Set to `true` to partition available requests into 2 'microbatches' that can be run concurrently to hide exposed CPU runtime |
 | `exclude_input_in_output` | Optional (default=`false`). Set to `true` to only return completion tokens in a response. Set to `false` to return the prompt tokens concatenated with the generated tokens |
 | `normalize_log_probs` | Optional (default=`true`).
Set to `false` to skip normalization of `output_log_probs` | diff --git a/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt b/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt index fb615c3c..efd15018 100644 --- a/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt +++ b/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt @@ -332,12 +332,6 @@ parameters: { string_value: "${kv_cache_free_gpu_mem_fraction}" } } -parameters: { - key: "max_num_sequences" - value: { - string_value: "${max_num_sequences}" - } -} parameters: { key: "enable_trt_overlap" value: { diff --git a/ci/L0_backend_trtllm/custom_metrics_verification_tests.py b/ci/L0_backend_trtllm/custom_metrics_verification_tests.py index c10fe126..18c9ca98 100644 --- a/ci/L0_backend_trtllm/custom_metrics_verification_tests.py +++ b/ci/L0_backend_trtllm/custom_metrics_verification_tests.py @@ -46,6 +46,8 @@ "inflight_batcher_specific_metric=micro_batch_id": "MicroBatch ID", "inflight_batcher_specific_metric=generation_requests": "Generation Requests", + "inflight_batcher_specific_metric=terminated_requests": + "Terminated Requests", "v1_specific_metric=total_context_tokens": "Total Context Tokens", "v1_specific_metric=total_generation_tokens": "Total Generation Tokens", "v1_specific_metric=empty_generation_slots": "Empty Generation Slots", diff --git a/ci/L0_backend_trtllm/test.sh b/ci/L0_backend_trtllm/test.sh index ed01f532..810be232 100644 --- a/ci/L0_backend_trtllm/test.sh +++ b/ci/L0_backend_trtllm/test.sh @@ -144,185 +144,7 @@ python3 -m pip install --upgrade pip && \ RET=0 -reset_model_repo - -### 1-GPU TRT engine -SERVER_ARGS="--model_repo=${MODEL_DIR}" - -# inflight batching OFF (V1) -# streaming OFF -SERVER_LOG="./1gpu_v1_no_streaming_server.log" -cp -r /opt/tritonserver/tensorrtllm_backend/all_models/inflight_batcher_llm/* ${MODEL_DIR} -rm -rf ${MODEL_DIR}/tensorrt_llm_bls -replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/ensemble/config.pbtxt" -replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/preprocessing/config.pbtxt" -replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_DIR}/preprocessing/config.pbtxt" -replace_config_tags '${tokenizer_type}' 'auto' "${MODEL_DIR}/preprocessing/config.pbtxt" -replace_config_tags '${preprocessing_instance_count}' '1' "${MODEL_DIR}/preprocessing/config.pbtxt" -replace_config_tags '${decoupled_mode}' 'False' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" -replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/tensorrt_llm/config.pbtxt" -replace_config_tags '${batching_strategy}' 'V1' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" -replace_config_tags '${engine_dir}' "${MODEL_DIR}/tensorrt_llm/1/inflight_1_gpu/" "${MODEL_DIR}/tensorrt_llm/config.pbtxt" -replace_config_tags '${max_queue_delay_microseconds}' "1000000" "${MODEL_DIR}/tensorrt_llm/config.pbtxt" -replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/postprocessing/config.pbtxt" -replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_DIR}/postprocessing/config.pbtxt" -replace_config_tags '${tokenizer_type}' 'auto' "${MODEL_DIR}/postprocessing/config.pbtxt" -replace_config_tags '${postprocessing_instance_count}' '1' "${MODEL_DIR}/postprocessing/config.pbtxt" -# Copy the engine and place it into the model folder -cp -r ${BASE_DIR}/engines/inflight_1_gpu/ triton_model_repo/tensorrt_llm/1 - -run_server "${SERVER_ARGS}" -wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]} -if [ "$WAIT_RET" != "0" ]; then - # Cleanup - kill 
$SERVER_PID > /dev/null 2>&1 || true - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 -fi - -set -e -python3 ${TOOLS_DIR}/inflight_batcher_llm/benchmark_core_model.py \ - --max-input-len=500 \ - dataset --dataset=${DATASET} \ - --tokenizer-dir=${TOKENIZER_DIR} - -if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** Error executing inflight batching benchmark_core_model: line ${LINENO}\n***" - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - RET=1 -fi -set +e - -set -e -python3 ${TOOLS_DIR}/inflight_batcher_llm/end_to_end_test.py \ - --max-input-len=500 \ - --dataset=${DATASET} - -if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** Error executing v1 end-to-end test: line ${LINENO}\n***" - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - RET=1 -fi -set +e - -curl localhost:8002/metrics -o 1gpu_v1_no_stream_metrics.out - -kill_server -wait_for_server_terminated ${SERVER_PID[@]} - -# inflight batching ON -# streaming OFF -SERVER_LOG="./1gpu_IFB_no_streaming_server.log" -replace_config_tags 'V1' 'inflight_fused_batching' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - -run_server "${SERVER_ARGS}" -wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]} -if [ "$WAIT_RET" != "0" ]; then - # Cleanup - kill $SERVER_PID > /dev/null 2>&1 || true - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 -fi - -set -e -python3 ${TOOLS_DIR}/inflight_batcher_llm/benchmark_core_model.py \ - --max-input-len=500 \ - dataset --dataset=${DATASET} \ - --tokenizer-dir=${TOKENIZER_DIR} - -if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** Error executing inflight batching benchmark_core_model: line ${LINENO}\n***" - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - RET=1 -fi -set +e - -set -e -python3 ${TOOLS_DIR}/inflight_batcher_llm/end_to_end_test.py \ - --max-input-len=500 \ - --dataset=${DATASET} - -if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** Error executing inflight batching end-to-end test: line ${LINENO}\n***" - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - RET=1 -fi -set +e - -curl localhost:8002/metrics -o 1gpu_IFB_no_stream_metrics.out - -kill_server -wait_for_server_terminated ${SERVER_PID[@]} - -# Start a clean server to verify base metrics are being -# reported correctly -SERVER_LOG="./1gpu_IFB_no_streaming_base_metrics.log" -run_server "${SERVER_ARGS}" -wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]} -if [ "$WAIT_RET" != "0" ]; then - # Cleanup - kill $SERVER_PID > /dev/null 2>&1 || true - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 -fi -set -e - -python3 ${BASE_METRICS_VERIFICATION_TEST} >> ${BASE_METRICS_VERIFICATION_LOG} 2>&1 -if [ $? -ne 0 ]; then - cat ${BASE_METRICS_VERIFICATION_LOG} - RET=1 -fi -set +e - -kill_server -wait_for_server_terminated ${SERVER_PID[@]} - -# inflight batching ON -# streaming ON -SERVER_LOG="./1gpu_IFB_streaming_server.log" -replace_config_tags 'decoupled: False' 'decoupled: True' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - -run_server "${SERVER_ARGS}" -wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]} -if [ "$WAIT_RET" != "0" ]; then - # Cleanup - kill $SERVER_PID > /dev/null 2>&1 || true - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 -fi - -set -e -python3 ${STREAM_DIR}/end_to_end_grpc_client.py \ - --prompt="My name is" - -if [ $? 
-ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** Error executing inflight batching end-to-end streaming test: line ${LINENO}\n***" - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - RET=1 -fi -set +e - -curl localhost:8002/metrics -o 1gpu_IFB_stream_metrics.out - -kill_server -wait_for_server_terminated ${SERVER_PID[@]} - -### Multi GPU TRT engine -NUM_GPUS_TO_TEST=("2" "4") +NUM_GPUS_TO_TEST=("1" "2" "4") for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do AVAILABLE_GPUS=$(nvidia-smi -L | wc -l) if [ "$AVAILABLE_GPUS" -lt "$NUM_GPU" ]; then @@ -331,10 +153,6 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do SERVER_ARGS="--world_size=${NUM_GPU} --model_repo=${MODEL_DIR}" - # inflight batching OFF (V1) - # streaming OFF - SERVER_LOG="./${NUM_GPU}gpu_v1_no_streaming_server.log" - reset_model_repo cp -r /opt/tritonserver/tensorrtllm_backend/all_models/inflight_batcher_llm/* ${MODEL_DIR} @@ -346,9 +164,9 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do replace_config_tags '${preprocessing_instance_count}' '1' "${MODEL_DIR}/preprocessing/config.pbtxt" replace_config_tags '${decoupled_mode}' 'False' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - replace_config_tags '${batching_strategy}' 'V1' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" + replace_config_tags '${batching_strategy}' 'INVALID' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" replace_config_tags '${engine_dir}' "${MODEL_DIR}/tensorrt_llm/1/inflight_${NUM_GPU}_gpu/" "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - replace_config_tags '${max_queue_delay_microseconds}' "0" "${MODEL_DIR}/tensorrt_llm/config.pbtxt" + replace_config_tags '${max_queue_delay_microseconds}' "50000" "${MODEL_DIR}/tensorrt_llm/config.pbtxt" replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/postprocessing/config.pbtxt" replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_DIR}/postprocessing/config.pbtxt" replace_config_tags '${tokenizer_type}' 'auto' "${MODEL_DIR}/postprocessing/config.pbtxt" @@ -357,6 +175,24 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do # Copy the engine and place it into the model folder cp -r ${BASE_DIR}/engines/inflight_${NUM_GPU}_gpu/ triton_model_repo/tensorrt_llm/1 + # Invalid GPT model Type + SERVER_LOG="./${NUM_GPU}gpu_invalid_batch_strat.log" + + run_server "${SERVER_ARGS}" + wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]} + + # Expect invalid GPT model type error to be gracefully handled + if [ `grep -c "Invalid gpt_model_type" $SERVER_LOG` == "0" ]; then + echo -e "\n***\n*** GPT model type error not handled gracefully: line ${LINENO}\n***" + cat $SERVER_LOG + exit 1 + fi + + # inflight batching OFF (V1) + # streaming OFF + SERVER_LOG="./${NUM_GPU}gpu_v1_no_streaming_server.log" + replace_config_tags 'INVALID' 'V1' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" + run_server "${SERVER_ARGS}" wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]} if [ "$WAIT_RET" != "0" ]; then @@ -375,7 +211,7 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do if [ $? -ne 0 ]; then cat $SERVER_LOG - echo -e "\n***\n*** Error executing v1 benchmark_core_model test with ${NUM_GPU}GPUs: line ${LINENO}\n***" + echo -e "\n***\n*** Error executing v1 benchmark_core_model test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" kill_server wait_for_server_terminated ${SERVER_PID[@]} RET=1 @@ -389,7 +225,7 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do if [ $? 
-ne 0 ]; then cat $SERVER_LOG - echo -e "\n***\n*** Error executing v1 end-to-end test with ${NUM_GPU}GPUs: line ${LINENO}\n***" + echo -e "\n***\n*** Error executing v1 end-to-end test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" kill_server wait_for_server_terminated ${SERVER_PID[@]} RET=1 @@ -423,7 +259,7 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do if [ $? -ne 0 ]; then cat $SERVER_LOG - echo -e "\n***\n*** Error executing inflight batching benchmark_core_model test with ${NUM_GPU}GPUs: line ${LINENO}\n***" + echo -e "\n***\n*** Error executing inflight batching benchmark_core_model test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" kill_server wait_for_server_terminated ${SERVER_PID[@]} RET=1 @@ -437,7 +273,7 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do if [ $? -ne 0 ]; then cat $SERVER_LOG - echo -e "\n***\n*** Error executing inflight batching end-to-end test with ${NUM_GPU}GPUs: line ${LINENO}\n***" + echo -e "\n***\n*** Error executing inflight batching end-to-end test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" kill_server wait_for_server_terminated ${SERVER_PID[@]} RET=1 @@ -493,7 +329,7 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do if [ $? -ne 0 ]; then cat $SERVER_LOG - echo -e "\n***\n*** Error executing inflight batching end-to-end streaming test with ${NUM_GPU}GPUs: line ${LINENO}\n***" + echo -e "\n***\n*** Error executing inflight batching end-to-end streaming test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" kill_server wait_for_server_terminated ${SERVER_PID[@]} RET=1 diff --git a/dockerfile/Dockerfile.triton.trt_llm_backend b/dockerfile/Dockerfile.triton.trt_llm_backend index 208ee2c5..5aff5021 100644 --- a/dockerfile/Dockerfile.triton.trt_llm_backend +++ b/dockerfile/Dockerfile.triton.trt_llm_backend @@ -2,7 +2,7 @@ ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:23.11-py3-min FROM ${BASE_IMAGE} as base -RUN apt-get update -q=2 && apt-get install -y --no-install-recommends python3-pip +RUN apt-get update -q=2 && apt-get install -y --no-install-recommends python3-pip ccache git-lfs # Remove previous TRT installation # We didn't remove libnvinfer* here because tritonserver depends on the pre-installed libraries. 
 RUN apt-get remove -y tensorrt*
diff --git a/dockerfile/Dockerfile.trt_llm_backend b/dockerfile/Dockerfile.trt_llm_backend
index 0fb2d027..c06e7968 100644
--- a/dockerfile/Dockerfile.trt_llm_backend
+++ b/dockerfile/Dockerfile.trt_llm_backend
@@ -3,7 +3,7 @@ ARG BASE_TAG=23.12-py3
 
 FROM ${BASE_IMAGE}:${BASE_TAG} as base
 
-RUN apt-get update && apt-get install -y --no-install-recommends rapidjson-dev python-is-python3
+RUN apt-get update && apt-get install -y --no-install-recommends rapidjson-dev python-is-python3 ccache git-lfs
 
 COPY requirements.txt /tmp/
 RUN pip3 install -r /tmp/requirements.txt --extra-index-url https://pypi.ngc.nvidia.com
diff --git a/inflight_batcher_llm/CMakeLists.txt b/inflight_batcher_llm/CMakeLists.txt
index f7f08613..25564502 100644
--- a/inflight_batcher_llm/CMakeLists.txt
+++ b/inflight_batcher_llm/CMakeLists.txt
@@ -260,7 +260,6 @@ set_property(
 )
 
 if(TRITON_ENABLE_METRICS)
-  message("Compiling statistics reporter...")
   list(APPEND REPORTER_SRCS
        src/custom_metrics_reporter/custom_metrics_reporter.cc)
   list(APPEND REPORTER_HDRS
diff --git a/inflight_batcher_llm/README.md b/inflight_batcher_llm/README.md
index fe040d7a..0c418093 100644
--- a/inflight_batcher_llm/README.md
+++ b/inflight_batcher_llm/README.md
@@ -151,6 +151,111 @@ You will find that the generation process is stopped early and therefore the num
 You can have a look at the client code to see how early stopping is achieved.
 
+## Running LoRA inference with inflight batching
+
+Build a model with LoRA enabled
+
+```
+BASE_MODEL=llama-7b-hf
+
+python3 tensorrt_llm/examples/llama/build.py --model_dir ${BASE_MODEL} \
+                --dtype float16 \
+                --remove_input_padding \
+                --use_gpt_attention_plugin float16 \
+                --enable_context_fmha \
+                --use_gemm_plugin float16 \
+                --output_dir "/tmp/llama_7b_with_lora_qkv/trt_engines/fp16/1-gpu/" \
+                --max_batch_size 128 \
+                --max_input_len 512 \
+                --max_output_len 50 \
+                --use_lora_plugin float16 \
+                --lora_target_modules "attn_q" "attn_k" "attn_v" \
+                --use_inflight_batching \
+                --paged_kv_cache \
+                --max_lora_rank 8 \
+                --world_size 1 --tp_size 1
+```
+
+Create a Triton model repository and launch the Triton server as described above.
+
+Now generate LoRA tensors that will be passed in with each request to Triton.
+
+```
+git-lfs clone https://huggingface.co/qychen/luotuo-lora-7b-0.1
+git-lfs clone https://huggingface.co/kunishou/Japanese-Alpaca-LoRA-7b-v0
+
+python3 tensorrt_llm/examples/hf_lora_convert.py -i Japanese-Alpaca-LoRA-7b-v0 -o Japanese-Alpaca-LoRA-7b-v0-weights --storage-type float16
+python3 tensorrt_llm/examples/hf_lora_convert.py -i luotuo-lora-7b-0.1 -o luotuo-lora-7b-0.1-weights --storage-type float16
+```
+
+Launch tritonserver as described above.
+
+Run the Multi-LoRA example by issuing multiple concurrent requests.
+The inflight batcher will execute mixed batches with multiple LoRAs in the same batch.
+
+```
+INPUT_TEXT=("美国的首都在哪里? \n答案:" "美国的首都在哪里? \n答案:" "美国的首都在哪里? \n答案:" "アメリカ合衆国の首都はどこですか? \n答え:" "アメリカ合衆国の首都はどこですか? \n答え:" "アメリカ合衆国の首都はどこですか?
\n答え:") +LORA_PATHS=("" "luotuo-lora-7b-0.1-weights" "Japanese-Alpaca-LoRA-7b-v0-weights" "" "luotuo-lora-7b-0.1-weights" "Japanese-Alpaca-LoRA-7b-v0-weights") + +for index in ${!INPUT_TEXT[@]}; do + text=${INPUT_TEXT[$index]} + lora_path=${LORA_PATHS[$index]} + lora_arg="" + if [ "${lora_path}" != "" ]; then + lora_arg="--lora-path ${lora_path}" + fi + + python3 inflight_batcher_llm/client/inflight_batcher_llm_client.py \ + --top-k 0 \ + --top-p 0.5 \ + --request-output-len 10 \ + --text "${text}" \ + --tokenizer-dir /home/scratch.trt_llm_data/llm-models/llama-models/llama-7b-hf \ + ${lora_arg} & +done + +wait +``` + +Example Output: +``` +Input sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901] +Input sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901] +Input sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901] +Input sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901] +Input sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901] +Input sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901] +Got completed request +Input: アメリカ合衆国の首都はどこですか? \n答え: +Output beam 0: ワシントン D.C. +Output sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901, 29871, 31028, 30373, 30203, 30279, 30203, 360, 29889, 29907, 29889] +Got completed request +Input: 美国的首都在哪里? \n答案: +Output beam 0: Washington, D.C. +What is the +Output sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901, 7660, 29892, 360, 29889, 29907, 29889, 13, 5618, 338, 278] +Got completed request +Input: 美国的首都在哪里? \n答案: +Output beam 0: Washington D.C. +Washington D. +Output sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901, 7660, 360, 29889, 29907, 29889, 13, 29956, 7321, 360, 29889] +Got completed request +Input: アメリカ合衆国の首都はどこですか? \n答え: +Output beam 0: Washington, D.C. +Which of +Output sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901, 7660, 29892, 360, 29889, 29907, 29889, 13, 8809, 436, 310] +Got completed request +Input: アメリカ合衆国の首都はどこですか? \n答え: +Output beam 0: Washington D.C. +1. ア +Output sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901, 7660, 360, 29889, 29907, 29889, 13, 29896, 29889, 29871, 30310] +Got completed request +Input: 美国的首都在哪里? 
\n答案: +Output beam 0: 华盛顿 +W +Output sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901, 29871, 31266, 234, 158, 158, 236, 164, 194, 13, 29956] +``` + ## Run the e2e/benchmark_core_model to benchmark ### End to end test diff --git a/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc b/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc index 851f5851..764c1ec7 100644 --- a/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc +++ b/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc @@ -52,9 +52,9 @@ const std::vector CustomMetricsReporter::v1_specific_labels_{ "total_context_tokens", "total_generation_tokens", "empty_generation_slots"}; const std::vector CustomMetricsReporter::IFB_specific_keys_{ - "Total Context Tokens", "Generation Requests", "MicroBatch ID"}; + "Total Context Tokens", "Generation Requests", "MicroBatch ID", "Terminated Requests"}; const std::vector CustomMetricsReporter::IFB_specific_labels_{ - "total_context_tokens", "generation_requests", "micro_batch_id"}; + "total_context_tokens", "generation_requests", "micro_batch_id", "terminated_requests"}; const std::vector CustomMetricsReporter::general_metric_keys_{"Timestamp", "Iteration Counter"}; const std::vector CustomMetricsReporter::general_metric_labels_{"timestamp", "iteration_counter"}; @@ -125,7 +125,7 @@ const std::vector& TritonMetricGroup::JsonKeys() const return json_keys_; } -TRITONSERVER_Error* CustomMetricsReporter::InitReporter( +TRITONSERVER_Error* CustomMetricsReporter::InitializeReporter( const std::string& model_name, const uint64_t version, const bool is_v1_model) { /* REQUEST METRIC GROUP */ @@ -194,7 +194,11 @@ TRITONSERVER_Error* CustomMetricsReporter::UpdateCustomMetrics(const std::string { triton::common::TritonJson::Value value_json; uint64_t value; - metrics.Find(key.c_str(), &value_json); + if (!metrics.Find(key.c_str(), &value_json)) + { + std::string errStr = std::string("Failed to find " + key + " in metrics."); + return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, errStr.c_str()); + } if (key == "Timestamp") { std::string timestamp; diff --git a/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.h b/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.h index cef12b5d..032e14ce 100644 --- a/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.h +++ b/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.h @@ -140,7 +140,7 @@ class CustomMetricsReporter /// \param is_v1_model Whether the model type is v1 or an inflight /// batching model. /// \return a TRITONSERVER_Error indicating success or failure. - TRITONSERVER_Error* InitReporter(const std::string& model, const uint64_t version, const bool is_v1_model); + TRITONSERVER_Error* InitializeReporter(const std::string& model, const uint64_t version, const bool is_v1_model); /// Updates the vector of TritonMetricGroup objects with a /// JSON-formatted statistics string. diff --git a/inflight_batcher_llm/src/libtensorrtllm.cc b/inflight_batcher_llm/src/libtensorrtllm.cc index e723cf75..c4233d87 100644 --- a/inflight_batcher_llm/src/libtensorrtllm.cc +++ b/inflight_batcher_llm/src/libtensorrtllm.cc @@ -65,22 +65,17 @@ extern "C" // TRITONBACKEND_Model. 
If anything goes wrong with initialization // of the model state then an error is returned and Triton will fail // to load the model. - ModelState* model_state; - RETURN_IF_ERROR(ModelState::Create(model, &model_state)); - RETURN_IF_ERROR(TRITONBACKEND_ModelSetState(model, reinterpret_cast(model_state))); - -#ifdef TRITON_ENABLE_METRICS const char* cname; RETURN_IF_ERROR(TRITONBACKEND_ModelName(model, &cname)); - std::string name(cname); + const std::string name(cname); uint64_t version; RETURN_IF_ERROR(TRITONBACKEND_ModelVersion(model, &version)); - bool is_v1_model = ((model_state->GetParameter("gpt_model_type") == "V1") - || (model_state->GetParameter("gpt_model_type") == "v1")); - LOG_IF_ERROR(model_state->InitCustomMetricsReporter(name, version, is_v1_model), "Failed initializing metrics"); -#endif // TRITON_ENABLE_METRICS + ModelState* model_state; + RETURN_IF_ERROR(ModelState::Create(model, name, version, &model_state)); + RETURN_IF_ERROR(TRITONBACKEND_ModelSetState(model, reinterpret_cast(model_state))); + return nullptr; // success } diff --git a/inflight_batcher_llm/src/model_instance_state.cc b/inflight_batcher_llm/src/model_instance_state.cc index 0aa4b420..0f2ab084 100644 --- a/inflight_batcher_llm/src/model_instance_state.cc +++ b/inflight_batcher_llm/src/model_instance_state.cc @@ -77,6 +77,12 @@ ModelInstanceState::ModelInstanceState(ModelState* model_state, TRITONBACKEND_Mo "v1/inflight_batching/inflight_fused_batching."); } +#ifdef TRITON_ENABLE_METRICS + custom_metrics_reporter_ = std::make_unique(); + custom_metrics_reporter_->InitializeReporter( + model_state->GetModelName(), model_state->GetModelVersion(), (mTrtGptModelType == TrtGptModelType::V1)); +#endif + // Check if model is in decoupled mode: triton::common::TritonJson::Value transaction_policy; model_state_->GetModelConfig().MemberAsObject("model_transaction_policy", &transaction_policy); @@ -163,17 +169,6 @@ ModelInstanceState::ModelInstanceState(ModelState* model_state, TRITONBACKEND_Mo "max_tokens_in_paged_kv_cache"); } - std::optional maxNumSequences = std::nullopt; - try - { - maxNumSequences = model_state_->GetParameter("max_num_sequences"); - } - catch (const std::exception& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("max_num_sequences is not specified, will be set to the TRT engine max_batch_size"); - } - bool enableTrtOverlap = false; try { @@ -232,7 +227,6 @@ ModelInstanceState::ModelInstanceState(ModelState* model_state, TRITONBACKEND_Mo } TrtGptModelOptionalParams optionalParams; - optionalParams.maxNumSequences = maxNumSequences; optionalParams.kvCacheConfig.maxTokens = maxTokensInPagedKvCache; optionalParams.kvCacheConfig.freeGpuMemoryFraction = kvCacheFreeGpuMemFraction; optionalParams.kvCacheConfig.maxAttentionWindow = maxAttentionWindow; @@ -479,7 +473,7 @@ void ModelInstanceState::logStats(const std::string& s) { LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, s.c_str()); #ifdef TRITON_ENABLE_METRICS - LOG_IF_ERROR(model_state_->UpdateCustomMetrics(s), "Failed updating TRT LLM statistics"); + LOG_IF_ERROR(custom_metrics_reporter_->UpdateCustomMetrics(s), "Failed updating TRT LLM statistics"); #endif } diff --git a/inflight_batcher_llm/src/model_instance_state.h b/inflight_batcher_llm/src/model_instance_state.h index 42495829..d23cf75f 100644 --- a/inflight_batcher_llm/src/model_instance_state.h +++ b/inflight_batcher_llm/src/model_instance_state.h @@ -36,16 +36,20 @@ #include "tensorrt_llm/batch_manager/BatchManager.h" #include "tensorrt_llm/batch_manager/GptManager.h" 
-#include "tensorrt_llm/batch_manager/batchScheduler.h" #include "tensorrt_llm/batch_manager/callbacks.h" #include "tensorrt_llm/batch_manager/kvCacheConfig.h" #include "tensorrt_llm/batch_manager/namedTensor.h" +#include "tensorrt_llm/batch_manager/schedulerPolicy.h" #include "tensorrt_llm/batch_manager/trtGptModelOptionalParams.h" #include "model_state.h" #include "work_item.h" #include "work_items_queue.h" +#ifdef TRITON_ENABLE_METRICS +#include "custom_metrics_reporter/custom_metrics_reporter.h" +#endif + using namespace tensorrt_llm::batch_manager; using namespace tensorrt_llm::batch_manager::batch_scheduler; @@ -132,6 +136,9 @@ class ModelInstanceState std::unique_ptr mWorkItemsQueue; std::unordered_map mRequestIdStrMap; +#ifdef TRITON_ENABLE_METRICS + std::unique_ptr custom_metrics_reporter_; +#endif }; } // namespace triton::backend::inflight_batcher_llm diff --git a/inflight_batcher_llm/src/model_state.cc b/inflight_batcher_llm/src/model_state.cc index 0f02eaa8..2eadbde1 100644 --- a/inflight_batcher_llm/src/model_state.cc +++ b/inflight_batcher_llm/src/model_state.cc @@ -29,11 +29,11 @@ namespace triton::backend::inflight_batcher_llm { -TRITONSERVER_Error* ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) +TRITONSERVER_Error* ModelState::Create( + TRITONBACKEND_Model* triton_model, const std::string& name, const uint64_t version, ModelState** state) { TRITONSERVER_Message* config_message; RETURN_IF_ERROR(TRITONBACKEND_ModelConfig(triton_model, 1 /* config_version */, &config_message)); - // We can get the model configuration as a json string from // config_message, parse it with our favorite json parser to create // DOM that we can access when we need to example the @@ -52,7 +52,7 @@ TRITONSERVER_Error* ModelState::Create(TRITONBACKEND_Model* triton_model, ModelS try { - *state = new ModelState(triton_model, std::move(model_config)); + *state = new ModelState(triton_model, name, version, std::move(model_config)); } catch (const std::exception& ex) { @@ -68,6 +68,16 @@ common::TritonJson::Value& ModelState::GetModelConfig() return model_config_; } +const std::string& ModelState::GetModelName() const +{ + return model_name_; +} + +uint64_t ModelState::GetModelVersion() const +{ + return model_version_; +} + template <> std::string ModelState::GetParameter(const std::string& name) { @@ -140,19 +150,4 @@ bool ModelState::GetParameter(const std::string& name) } } -#ifdef TRITON_ENABLE_METRICS -TRITONSERVER_Error* ModelState::InitCustomMetricsReporter( - const std::string& model_name, const uint64_t version, const bool is_v1_model) -{ - RETURN_IF_ERROR(custom_metrics_reporter_->InitReporter(model_name, version, is_v1_model)); - return nullptr; // success -} - -TRITONSERVER_Error* ModelState::UpdateCustomMetrics(const std::string& custom_metrics) -{ - RETURN_IF_ERROR(custom_metrics_reporter_->UpdateCustomMetrics(custom_metrics)); - return nullptr; // success -} -#endif - } // namespace triton::backend::inflight_batcher_llm diff --git a/inflight_batcher_llm/src/model_state.h b/inflight_batcher_llm/src/model_state.h index a4fd8a4a..2e1b59bf 100644 --- a/inflight_batcher_llm/src/model_state.h +++ b/inflight_batcher_llm/src/model_state.h @@ -35,10 +35,6 @@ #include "triton/core/tritonbackend.h" #include "triton/core/tritonserver.h" -#ifdef TRITON_ENABLE_METRICS -#include "custom_metrics_reporter/custom_metrics_reporter.h" -#endif - using namespace ::triton::common; // TritonJson namespace triton::backend::inflight_batcher_llm @@ -53,7 +49,8 @@ namespace 
triton::backend::inflight_batcher_llm class ModelState { public: - static TRITONSERVER_Error* Create(TRITONBACKEND_Model* triton_model, ModelState** state); + static TRITONSERVER_Error* Create( + TRITONBACKEND_Model* triton_model, const std::string& name, const uint64_t version, ModelState** state); template T GetParameter(const std::string& name) @@ -65,28 +62,24 @@ class ModelState virtual ~ModelState() = default; -#ifdef TRITON_ENABLE_METRICS - TRITONSERVER_Error* InitCustomMetricsReporter( - const std::string& model_name, const uint64_t version, const bool is_v1_model); - TRITONSERVER_Error* UpdateCustomMetrics(const std::string& statistics); -#endif common::TritonJson::Value& GetModelConfig(); + const std::string& GetModelName() const; + uint64_t GetModelVersion() const; private: -#ifdef TRITON_ENABLE_METRICS - std::unique_ptr custom_metrics_reporter_; -#endif + const std::string model_name_; + uint64_t model_version_; common::TritonJson::Value model_config_; std::shared_ptr mTrtLogger{}; - ModelState(TRITONBACKEND_Model* triton_model, TritonJson::Value&& model_config) - : model_config_(std::move(model_config)) + ModelState( + TRITONBACKEND_Model* triton_model, const std::string& name, uint64_t version, TritonJson::Value&& model_config) + : model_name_(name) + , model_version_(version) + , model_config_(std::move(model_config)) { mTrtLogger = std::make_shared(); initTrtLlmPlugins(mTrtLogger.get()); -#ifdef TRITON_ENABLE_METRICS - custom_metrics_reporter_ = std::make_unique(); -#endif } }; diff --git a/tensorrt_llm b/tensorrt_llm index c8965302..b57221b7 160000 --- a/tensorrt_llm +++ b/tensorrt_llm @@ -1 +1 @@ -Subproject commit c89653021e66ca78c55f02b366f404455bc12e8d +Subproject commit b57221b764bc579cbb2490154916a871f620e2c4 diff --git a/tools/version.txt b/tools/version.txt index 24b8df30..a5d5936f 100644 --- a/tools/version.txt +++ b/tools/version.txt @@ -1 +1 @@ -ad7d4adac6bebead80be01388b94d1f57a50245a +dcc9252db07dc9e8d4584fd92dbbf743a6c18b4e
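For the `terminated_requests` metric that this patch adds to the inflight-batcher metric group (see `custom_metrics_reporter.cc` and `custom_metrics_verification_tests.py` above), a quick manual check against the metrics endpoint exercised by the CI script can confirm the metric is exported. This is a minimal sketch, not part of the patch: it assumes a running `tritonserver` with an inflight-batching model, the metrics port 8002 used in `ci/L0_backend_trtllm/test.sh`, and that the metric line contains the `terminated_requests` label that the CI test parses.

```
# Minimal sketch (not part of the patch): check that the new
# terminated_requests metric shows up on the Triton metrics endpoint.
# Assumes tritonserver is already serving an inflight-batching model and
# exposes metrics on port 8002, as in ci/L0_backend_trtllm/test.sh.
METRICS_URL="localhost:8002/metrics"

if curl -s "${METRICS_URL}" | grep -q 'terminated_requests'; then
    echo "terminated_requests metric is being reported"
else
    echo "terminated_requests metric not found" >&2
    exit 1
fi
```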