
Commit 87100b0

Update TensorRT-LLM backend (#655)
1 parent 91c07d3 commit 87100b0

26 files changed: +229 -64 lines changed

README.md

Lines changed: 1 addition & 1 deletion
@@ -219,7 +219,7 @@ DECOUPLED_MODE=false
 
 python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/ensemble/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE}
 python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/preprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_DIR},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},preprocessing_instance_count:${INSTANCE_COUNT}
-python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},engine_dir:${ENGINE_DIR},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MS},batching_strategy:inflight_fused_batching,max_queue_size:${MAX_QUEUE_SIZE}
+python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},engine_dir:${ENGINE_DIR},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MS},batching_strategy:inflight_fused_batching,max_queue_size:${MAX_QUEUE_SIZE},encoder_input_features_data_type:TYPE_FP16
 python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/postprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_DIR},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},postprocessing_instance_count:${INSTANCE_COUNT},max_queue_size:${MAX_QUEUE_SIZE}
 python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},bls_instance_count:${INSTANCE_COUNT}
 ```
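
For context, the fill-template script substitutes each `${placeholder}` in the target config.pbtxt with the value given as comma-separated `key:value` pairs, which is how the new `encoder_input_features_data_type:TYPE_FP16` entry lands in the tensorrt_llm config. A minimal sketch of that substitution, assuming a simplified command-line format (the real `tools/fill_template.py` may parse its arguments differently):

```python
# Minimal sketch of ${placeholder} substitution in a pbtxt template.
# Assumption: the real tools/fill_template.py may differ in argument parsing.
import sys

def fill_template(pbtxt_path: str, substitutions: str) -> None:
    # substitutions look like "key1:value1,key2:value2,..."
    pairs = dict(item.split(":", 1) for item in substitutions.split(","))
    text = open(pbtxt_path).read()
    for key, value in pairs.items():
        # replace every ${key} occurrence with the supplied value
        text = text.replace("${" + key + "}", value)
    open(pbtxt_path, "w").write(text)

if __name__ == "__main__":
    # e.g. python3 fill_template_sketch.py model/config.pbtxt \
    #      "triton_max_batch_size:64,encoder_input_features_data_type:TYPE_FP16"
    fill_template(sys.argv[1], sys.argv[2])
```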

all_models/inflight_batcher_llm/postprocessing/1/model.py

Lines changed: 1 addition & 7 deletions
@@ -132,13 +132,7 @@ def execute(self, requests):
             for batch_idx, beam_tokens in enumerate(token_batch):
                 for beam_idx, tokens in enumerate(beam_tokens):
                     seq_len = sequence_lengths[idx][batch_idx][beam_idx]
-                    # Exclude fake ids in multimodal models
-                    fake_id_len = 0
-                    for i in range(seq_len):
-                        if tokens[i] < self.tokenizer.vocab_size:
-                            fake_id_len = i
-                            break
-                    list_of_tokens.append(tokens[fake_id_len:seq_len])
+                    list_of_tokens.append(tokens[:seq_len])
                 req_idx_offset += 1
 
             req_idx_offsets.append(req_idx_offset)
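
The removed block trimmed leading multimodal "fake" ids before decoding; after this change the postprocessor decodes every beam straight up to its reported sequence length. A tiny illustration of the new slicing behavior (the tokenizer name is only an example):

```python
from transformers import AutoTokenizer

# Illustration only: decode all ids up to seq_len, as the postprocessor now does.
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # any tokenizer works for the sketch
tokens = [15496, 11, 995, 0, 0]  # output ids padded past the valid length
seq_len = 3                      # valid length reported in sequence_lengths
text = tokenizer.decode(tokens[:seq_len])  # only the first seq_len ids are decoded
print(text)
```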

all_models/inflight_batcher_llm/preprocessing/1/model.py

Lines changed: 38 additions & 8 deletions
@@ -24,12 +24,16 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import base64
+import io
 import json
 import os
 from typing import List
 
 import numpy as np
+import requests
 import triton_python_backend_utils as pb_utils
+from PIL import Image
 from transformers import AutoProcessor, AutoTokenizer, T5Tokenizer
 
 

@@ -659,18 +663,28 @@ def __init__(self,
                  vision_model_processor,
                  preprocessor_model_config={}):
         # import libraries that are only relevant for multimodal models
-        import requests
         import torch
-        from PIL import Image
         from torch.utils.dlpack import from_dlpack
 
-        from tensorrt_llm._utils import str_dtype_to_torch
+        # NOTE: Due to the behavior of MPI initialization, it is recommended to avoid using import tensorrt_llm
+        # except for the specific modules tensorrt_llm and multimodal_encoders.
+        # As a result, the function str_dtype_to_torch has been copied directly from tensorrt_llm._utils.
+        _str_to_torch_dtype_dict = dict(
+            bfloat16=torch.bfloat16,
+            float16=torch.float16,
+            float32=torch.float32,
+            int64=torch.int64,
+            int32=torch.int32,
+            int8=torch.int8,
+            bool=torch.bool,
+            fp8=torch.float8_e4m3fn,
+        )
+
+        def str_dtype_to_torch(dtype):
+            ret = _str_to_torch_dtype_dict.get(dtype)
+            assert ret is not None, f'Unsupported dtype: {dtype}'
+            return ret
 
-        # create method for loading image from urls
-        self.load_images_from_urls = lambda img_urls: [
-            Image.open(requests.get(img_url.decode(), stream=True).raw)
-            for img_url in img_urls
-        ]
         self.load_images_tensor = lambda tensor: tensor if not hasattr(
             tensor, 'to_dlpack') else from_dlpack(tensor.to_dlpack())
 

@@ -695,6 +709,22 @@ def __init__(self,
         self.vision_model_processor = vision_model_processor
         self.vision_model_type = vision_model_type
 
+    def load_images_from_urls(self, img_urls):
+        images = []
+        for img_url in img_urls:
+            img_url = img_url.decode()
+            if img_url.startswith("data:image/jpeg;base64,"):
+                image_base64 = img_url.split(",")[1]
+                # Decode the base64 string
+                image_data = base64.b64decode(image_base64)
+                # Create a BytesIO object from the decoded data
+                image_buffer = io.BytesIO(image_data)
+                images.append(Image.open(image_buffer))
+            else:
+                images.append(
+                    Image.open(requests.get(img_url, stream=True).raw))
+        return images
+
     def process(self, queries, img_urls=None, image_bytes=None):
         vision_processed_tensors = {}
         if img_urls is not None or image_bytes is not None:
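
With the new load_images_from_urls method, a request can carry either an http(s) image URL or an inline `data:image/jpeg;base64,...` URI. A small client-side sketch of building such a URI (the file name is hypothetical):

```python
import base64

# Build a base64 JPEG data URI that the preprocessor above can decode.
with open("example.jpg", "rb") as f:  # hypothetical local image
    encoded = base64.b64encode(f.read()).decode("utf-8")
image_url = "data:image/jpeg;base64," + encoded

# The model receives the value as bytes (hence img_url.decode() in the diff),
# so the string would be sent through the image-URL input tensor as UTF-8 bytes.
image_url_bytes = image_url.encode("utf-8")
```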

all_models/inflight_batcher_llm/tensorrt_llm/1/model.py

Lines changed: 11 additions & 7 deletions
@@ -193,6 +193,8 @@ def get_sampling_config_from_request(request, batch_size=1, batch_index=0):
         request, 'beam_search_diversity_rate', batch_size, batch_index)
     kwargs['early_stopping'] = get_input_scalar_by_name(
         request, 'early_stopping', batch_size, batch_index)
+    kwargs['num_return_sequences'] = get_input_scalar_by_name(
+        request, 'num_return_sequences', batch_size, batch_index) or 1
     kwargs = {k: v for k, v in kwargs.items() if v is not None}
     return trtllm.SamplingConfig(**kwargs)
 

@@ -336,9 +338,6 @@ def convert_request(request, exclude_input_from_output, decoupled):
             raise pb_utils.TritonModelException(
                 "Streaming is only supported in decoupled mode.")
 
-        inputs['num_return_sequences'] = get_input_scalar_by_name(
-            request, 'num_return_sequences', batch_size, batch_index) or 1
-
         inputs['end_id'] = get_input_scalar_by_name(request, 'end_id',
                                                     batch_size, batch_index)
         inputs['pad_id'] = get_input_scalar_by_name(request, 'pad_id',

@@ -364,7 +363,7 @@ def convert_request(request, exclude_input_from_output, decoupled):
         # if request doesn't specify exclude_input_from_output, try to use the parameter
         output_config.exclude_input_from_output = (
             exclude_input_from_output
-            if exclude_input_from_output is not None else false)
+            if exclude_input_from_output is not None else False)
     else:
         output_config.exclude_input_from_output = req_exclude_input_from_output
 

@@ -642,7 +641,11 @@ def get_extended_runtime_perf_knob_config(self, model_config):
             "multi_block_mode":
             get_parameter(model_config, "multi_block_mode", bool),
             "enable_context_fmha_fp32_acc":
-            get_parameter(model_config, "enable_context_fmha_fp32_acc", bool)
+            get_parameter(model_config, "enable_context_fmha_fp32_acc", bool),
+            "cuda_graph_mode":
+            get_parameter(model_config, "cuda_graph_mode", bool),
+            "cuda_graph_cache_size":
+            get_parameter(model_config, "cuda_graph_cache_size", int),
         }
         kwargs = {k: v for k, v in kwargs.items() if v is not None}
         return trtllm.ExtendedRuntimePerfKnobConfig(**kwargs)

@@ -1000,8 +1003,9 @@ def execute(self, requests):
 
                 self.req_id_to_request_data[req_id] = RequestData(
                     triton_req_id, triton_user_id, batch_index,
-                    len(batch_indices), executor_request.num_return_sequences,
-                    0, 0, triton_request.get_response_sender())
+                    len(batch_indices),
+                    executor_request.sampling_config.num_return_sequences, 0,
+                    0, triton_request.get_response_sender())
                 self.triton_req_id_to_req_ids[triton_req_id].add(req_id)
                 input_len = len(
                     executor_request.input_token_ids
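
In this file, num_return_sequences now enters through the executor sampling config instead of the request inputs, and the bookkeeping later reads it back from `executor_request.sampling_config`. A rough sketch mirroring get_sampling_config_from_request, assuming `trtllm` refers to the TensorRT-LLM executor bindings as it does in the backend:

```python
import tensorrt_llm.bindings.executor as trtllm

# Mirror of the kwargs construction in the diff above (values hard-coded for the sketch).
kwargs = {
    "beam_width": 1,
    "num_return_sequences": 2,  # now part of SamplingConfig, no longer a request input
}
kwargs = {k: v for k, v in kwargs.items() if v is not None}
sampling_config = trtllm.SamplingConfig(**kwargs)

# Later bookkeeping reads the value back from the request's sampling config:
print(sampling_config.num_return_sequences)  # 2
```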

all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt

Lines changed: 13 additions & 1 deletion
@@ -48,7 +48,7 @@ input [
   },
   {
     name: "encoder_input_features"
-    data_type: TYPE_FP16
+    data_type: ${encoder_input_features_data_type}
     dims: [ -1, -1 ]
     allow_ragged_batch: true
     optional: true

@@ -648,6 +648,18 @@ parameters: {
     string_value: "${multi_block_mode}"
   }
 }
+parameters: {
+  key: "cuda_graph_mode"
+  value: {
+    string_value: "${cuda_graph_mode}"
+  }
+}
+parameters: {
+  key: "cuda_graph_cache_size"
+  value: {
+    string_value: "${cuda_graph_cache_size}"
+  }
+}
 parameters: {
   key: "speculative_decoding_fast_logits"
   value: {
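
The two new parameters stay as plain strings in the config; get_extended_runtime_perf_knob_config in the model.py diff above converts them to bool and int before building the perf-knob config. A minimal sketch of that conversion, with a simplified stand-in for the backend's get_parameter helper:

```python
# Minimal sketch of turning pbtxt string parameters into typed perf-knob settings.
# `get_parameter` below is a simplified stand-in for the backend helper.
def get_parameter(model_config, name, dtype=str):
    value = model_config["parameters"].get(name, {}).get("string_value", "")
    if value in ("", "${" + name + "}"):  # unset or unsubstituted placeholder
        return None
    if dtype is bool:
        return value.lower() in ("true", "1")
    return dtype(value)

model_config = {
    "parameters": {
        "cuda_graph_mode": {"string_value": "true"},
        "cuda_graph_cache_size": {"string_value": "4"},
    }
}

kwargs = {
    "cuda_graph_mode": get_parameter(model_config, "cuda_graph_mode", bool),
    "cuda_graph_cache_size": get_parameter(model_config, "cuda_graph_cache_size", int),
}
kwargs = {k: v for k, v in kwargs.items() if v is not None}
print(kwargs)  # {'cuda_graph_mode': True, 'cuda_graph_cache_size': 4}
```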

all_models/multimodal/multimodal_encoders/1/model.py

Lines changed: 1 addition & 0 deletions
@@ -49,6 +49,7 @@ def triton_string_to_torch(dtype):
         "TYPE_FP16": torch.float16,
         "TYPE_FP32": torch.float32,
         "TYPE_FP64": torch.float64,
+        "TYPE_BF16": torch.bfloat16
     }
     return type_map[dtype]
 
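
The added mapping lets a bfloat16 encoder engine be described via TYPE_BF16 (matching the ${encoder_input_features_data_type} placeholder in the config below). A quick check of the mapping, with a hypothetical feature shape:

```python
import torch

# Same mapping as triton_string_to_torch above, exercised for the new TYPE_BF16 entry.
type_map = {
    "TYPE_FP16": torch.float16,
    "TYPE_FP32": torch.float32,
    "TYPE_FP64": torch.float64,
    "TYPE_BF16": torch.bfloat16,
}
features = torch.zeros((1, 16, 32), dtype=type_map["TYPE_BF16"])  # hypothetical shape
print(features.dtype)  # torch.bfloat16
```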

all_models/multimodal/multimodal_encoders/config.pbtxt

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ output [
   # Output for visual encoders of type mllama
   {
     name: "ENCODER_INPUT_FEATURES"
-    data_type: TYPE_FP16
+    data_type: ${encoder_input_features_data_type}
     dims: [ -1, -1 ]
   },
   {

ci/L0_backend_trtllm/test.sh

Lines changed: 1 addition & 0 deletions
@@ -195,6 +195,7 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do
     replace_config_tags '${engine_dir}' "${MODEL_DIR}/tensorrt_llm/1/inflight_${NUM_GPU}_gpu/" "${MODEL_DIR}/tensorrt_llm/config.pbtxt"
     replace_config_tags '${max_queue_delay_microseconds}' "50000" "${MODEL_DIR}/tensorrt_llm/config.pbtxt"
     replace_config_tags '${triton_backend}' "tensorrtllm" "${MODEL_DIR}/tensorrt_llm/config.pbtxt"
+    replace_config_tags '${encoder_input_features_data_type}' "TYPE_FP16" "${MODEL_DIR}/tensorrt_llm/config.pbtxt"
     replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/postprocessing/config.pbtxt"
     replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_DIR}/postprocessing/config.pbtxt"
     replace_config_tags '${postprocessing_instance_count}' '1' "${MODEL_DIR}/postprocessing/config.pbtxt"

dockerfile/Dockerfile.triton.trt_llm_backend

Lines changed: 5 additions & 1 deletion
@@ -1,6 +1,6 @@
 ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.10-py3-min
 ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.10-py3
-ARG NVRTC_VER=12.6.68-1
+ARG NVRTC_VER=12.6.77-1
 ARG TRT_VER=10.6.0.26
 ARG RELEASE_URL_TRT_x86=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/tars/TensorRT-${TRT_VER}.Linux.x86_64-gnu.cuda-12.6.tar.gz
 ARG RELEASE_URL_TRT_ARM=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/tars/TensorRT-${TRT_VER}.ubuntu-24.04.aarch64-gnu.cuda-12.6.tar.gz

@@ -29,6 +29,8 @@ COPY --from=pytorch_image /usr/local/lib/python3.10/dist-packages/torch-2.5.0a0+
 COPY --from=pytorch_image /usr/local/lib/python3.10/dist-packages/torchgen /usr/local/lib/python3.10/dist-packages/torchgen
 COPY --from=pytorch_image /usr/local/lib/python3.10/dist-packages/torchvision /usr/local/lib/python3.10/dist-packages/torchvision
 COPY --from=pytorch_image /usr/local/lib/python3.10/dist-packages/torchvision-0.20.0a0.dist-info /usr/local/lib/python3.10/dist-packages/torchvision-0.20.0a0.dist-info
+COPY --from=pytorch_image /usr/local/lib/python3.10/dist-packages/setuptools /usr/local/lib/python3.10/dist-packages/setuptools
+COPY --from=pytorch_image /usr/local/lib/python3.10/dist-packages/setuptools-70.3.0.dist-info /usr/local/lib/python3.10/dist-packages/setuptools-70.3.0.dist-info
 
 # Might not need to copy cusparseLt in the future once it's included in DLFW cuda container
 COPY --from=pytorch_image /usr/local/cuda/lib64/libcusparseLt* /usr/local/cuda/lib64/

@@ -109,6 +111,8 @@ COPY --from=pytorch_image /usr/local/lib/python3.10/dist-packages/torch-2.5.0a0+
 COPY --from=pytorch_image /usr/local/lib/python3.10/dist-packages/torchgen /usr/local/lib/python3.10/dist-packages/torchgen
 COPY --from=pytorch_image /usr/local/lib/python3.10/dist-packages/torchvision /usr/local/lib/python3.10/dist-packages/torchvision
 COPY --from=pytorch_image /usr/local/lib/python3.10/dist-packages/torchvision-0.20.0a0.dist-info /usr/local/lib/python3.10/dist-packages/torchvision-0.20.0a0.dist-info
+COPY --from=pytorch_image /usr/local/lib/python3.10/dist-packages/setuptools /usr/local/lib/python3.10/dist-packages/setuptools
+COPY --from=pytorch_image /usr/local/lib/python3.10/dist-packages/setuptools-70.3.0.dist-info /usr/local/lib/python3.10/dist-packages/setuptools-70.3.0.dist-info
 
 # Might not need to copy cusparseLt in the future once it's included in DLFW cuda container
 COPY --from=pytorch_image /usr/local/cuda/lib64/libcusparseLt* /usr/local/cuda/lib64/

docs/baichuan.md

Lines changed: 2 additions & 2 deletions
@@ -44,7 +44,7 @@ python3 tools/fill_template.py -i baichuan_ifb/preprocessing/config.pbtxt tokeni
 python3 tools/fill_template.py -i baichuan_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1
 python3 tools/fill_template.py -i baichuan_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False
 python3 tools/fill_template.py -i baichuan_ifb/ensemble/config.pbtxt triton_max_batch_size:64
-python3 tools/fill_template.py -i baichuan_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:/tmp/baichuan/13B/trt_engines/fp16/1-gpu/,max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0
+python3 tools/fill_template.py -i baichuan_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:/tmp/baichuan/13B/trt_engines/fp16/1-gpu/,max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16
 ```
 
 * Launch server

@@ -178,7 +178,7 @@ python3 tools/fill_template.py -i baichuan_ifb/preprocessing/config.pbtxt tokeni
 python3 tools/fill_template.py -i baichuan_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1
 python3 tools/fill_template.py -i baichuan_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:True,bls_instance_count:1,accumulate_tokens:True
 python3 tools/fill_template.py -i baichuan_ifb/ensemble/config.pbtxt triton_max_batch_size:64
-python3 tools/fill_template.py -i baichuan_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:/tmp/baichuan/13B/trt_engines/fp16/1-gpu/,max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0
+python3 tools/fill_template.py -i baichuan_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:/tmp/baichuan/13B/trt_engines/fp16/1-gpu/,max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16
 
 pip install SentencePiece
 # please add `trust_remote_code=True` in tokenizer of preprocessing and postprocessing. Considering the security, we don't add it by default.
