Commit cdc202a

Update TensorRT-LLM backend (#638)

1 parent c104768 commit cdc202a

File tree

15 files changed: +292 −69 lines

all_models/inflight_batcher_llm/ensemble/config.pbtxt

Lines changed: 54 additions & 1 deletion
@@ -187,6 +187,47 @@ input [
     data_type: TYPE_FP32
     dims: [ -1 ]
     optional: true
+  },
+  # The unique task ID for the given LoRA.
+  # To perform inference with a specific LoRA for the first time, `lora_task_id`, `lora_weights`, and `lora_config` must all be given.
+  # The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`.
+  # If the cache is full, the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached.
+  {
+    name: "lora_task_id"
+    data_type: TYPE_UINT64
+    dims: [ 1 ]
+    optional: true
+  },
+  # Weights for a LoRA adapter, shape [ num_lora_modules_layers, D x Hi + Ho x D ],
+  # where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer.
+  # Each of the in / out tensors is first flattened and then concatenated together in the format above.
+  # D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out.
+  {
+    name: "lora_weights"
+    data_type: TYPE_FP16
+    dims: [ -1, -1 ]
+    optional: true
+    allow_ragged_batch: true
+  },
+  # Module identifier (same size as the first dimension of lora_weights).
+  # See LoraModule::ModuleType for the module ID mapping:
+  #
+  #   "attn_qkv": 0     # combined qkv adapter
+  #   "attn_q": 1       # q adapter
+  #   "attn_k": 2       # k adapter
+  #   "attn_v": 3       # v adapter
+  #   "attn_dense": 4   # adapter for the dense layer in attention
+  #   "mlp_h_to_4h": 5  # for llama2, adapter for the gated mlp layer after attention / RMSNorm: up projection
+  #   "mlp_4h_to_h": 6  # for llama2, adapter for the gated mlp layer after attention / RMSNorm: down projection
+  #   "mlp_gate": 7     # for llama2, adapter for the gated mlp layer after attention / RMSNorm: gate
+  #
+  # The last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ].
+  {
+    name: "lora_config"
+    data_type: TYPE_INT32
+    dims: [ -1, 3 ]
+    optional: true
+    allow_ragged_batch: true
   }
 ]
 output [

@@ -430,7 +471,19 @@ ensemble_scheduling {
       input_map {
         key: "prompt_table_extra_ids"
         value: "_OUT_PROMPT_TABLE_EXTRA_IDS"
-      }
+      },
+      input_map {
+        key: "lora_task_id",
+        value: "lora_task_id"
+      },
+      input_map {
+        key: "lora_weights",
+        value: "lora_weights"
+      },
+      input_map {
+        key: "lora_config",
+        value: "lora_config"
+      },
       output_map {
         key: "output_ids"
         value: "_TOKENS_BATCH"

all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/decode.py

Lines changed: 15 additions & 6 deletions
@@ -90,6 +90,9 @@ class Request:
     random_seed: Optional[np.ndarray] = None
     presence_penalty: Optional[np.ndarray] = None
     frequency_penalty: Optional[np.ndarray] = None
+    lora_task_id: Optional[np.ndarray] = None
+    lora_weights: Optional[np.ndarray] = None
+    lora_config: Optional[np.ndarray] = None

     def validate(self):
         _validate_non_empty(self.text_input, "text_input is required")

@@ -263,6 +266,8 @@ def _spec_generate(

         draft_request = None
         if num_draft_tokens > 0:
+            request.min_length = np.array([num_draft_tokens],
+                                          dtype=np.int32)
             draft_response: GenerationResponse = self._draft_generate_non_streaming(
                 cur_preproc, request, num_draft_tokens)
             seq_len: int = draft_response.sequence_length[0][0]

@@ -275,12 +280,16 @@
             draft_logits = draft_response.generation_logits[0][0]

             input_draft_tokens = draft_output_ids[len(input_ids):seq_len]
-            draft_request = DraftRequest(
-                draft_input_ids=np.expand_dims(input_draft_tokens, 0))
-            if request.use_draft_logits is not None and request.use_draft_logits[
-                    0]:
-                draft_request.draft_logits = np.expand_dims(
-                    draft_logits[-len(input_draft_tokens):], 0)
+            if len(input_draft_tokens) > 0:
+                draft_request = DraftRequest(
+                    draft_input_ids=np.expand_dims(input_draft_tokens, 0))
+                if request.use_draft_logits is not None and request.use_draft_logits[
+                        0]:
+                    draft_request.draft_logits = np.expand_dims(
+                        draft_logits[-len(input_draft_tokens):], 0)
+            else:
+                draft_request = DraftRequest()
+                request.min_length = None
         else:
             draft_request = DraftRequest()
         target_response = self._generate_non_streaming(

all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/triton_decoder.py

Lines changed: 6 additions & 0 deletions
@@ -108,6 +108,9 @@ def __init__(self,
             "embedding_bias_weights",
             "num_draft_tokens",
             "use_draft_logits",
+            "lora_task_id",
+            "lora_weights",
+            "lora_config",
         ]

         self.__undo_reshape_whitelist = {

@@ -409,6 +412,9 @@ def _get_llm_tensors_from_request(
             "stream": "streaming",
             "prompt_embedding_table": "prompt_embedding_table",
             "prompt_vocab_size": "prompt_vocab_size",
+            "lora_task_id": "lora_task_id",
+            "lora_weights": "lora_weights",
+            "lora_config": "lora_config",
         }
         batch_size = request.text_input.shape[0]
         tensors = self.create_triton_tensors(request, name_map)

all_models/inflight_batcher_llm/tensorrt_llm_bls/config.pbtxt

Lines changed: 42 additions & 0 deletions
@@ -215,6 +215,48 @@ input [
     dims: [ 1 ]
     reshape: { shape: [ ] }
     optional: true
+  },
+  # The unique task ID for the given LoRA.
+  # To perform inference with a specific LoRA for the first time, `lora_task_id`, `lora_weights`, and `lora_config` must all be given.
+  # The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`.
+  # If the cache is full, the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached.
+  {
+    name: "lora_task_id"
+    data_type: TYPE_UINT64
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  # Weights for a LoRA adapter, shape [ num_lora_modules_layers, D x Hi + Ho x D ],
+  # where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer.
+  # Each of the in / out tensors is first flattened and then concatenated together in the format above.
+  # D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out.
+  {
+    name: "lora_weights"
+    data_type: TYPE_FP16
+    dims: [ -1, -1 ]
+    optional: true
+    allow_ragged_batch: true
+  },
+  # Module identifier (same size as the first dimension of lora_weights).
+  # See LoraModule::ModuleType for the module ID mapping:
+  #
+  #   "attn_qkv": 0     # combined qkv adapter
+  #   "attn_q": 1       # q adapter
+  #   "attn_k": 2       # k adapter
+  #   "attn_v": 3       # v adapter
+  #   "attn_dense": 4   # adapter for the dense layer in attention
+  #   "mlp_h_to_4h": 5  # for llama2, adapter for the gated mlp layer after attention / RMSNorm: up projection
+  #   "mlp_4h_to_h": 6  # for llama2, adapter for the gated mlp layer after attention / RMSNorm: down projection
+  #   "mlp_gate": 7     # for llama2, adapter for the gated mlp layer after attention / RMSNorm: gate
+  #
+  # The last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ].
+  {
+    name: "lora_config"
+    data_type: TYPE_INT32
+    dims: [ -1, 3 ]
+    optional: true
+    allow_ragged_batch: true
   }
 ]
 output [
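
For reference, a short numpy sketch of how a single adapter's tensors could be packed into the layout the comments above describe. The module choice, layer index, and sizes are made-up illustrations, not values from this commit; real weights come from the trained adapter.

import numpy as np

# Illustrative sizes only.
D = 8           # adapter_size (R value)
Hi = 768        # hidden_size_in
Ho = 768        # hidden_size_out
layer_idx = 0
ATTN_Q = 1      # module id per the mapping above

lora_in = np.zeros((D, Hi), dtype=np.float16)    # "in" adapter weights for this module/layer
lora_out = np.zeros((Ho, D), dtype=np.float16)   # "out" adapter weights for this module/layer

# Flatten each tensor and concatenate: the row length is D x Hi + Ho x D.
row = np.concatenate([lora_in.flatten(), lora_out.flatten()])
lora_weights = row[np.newaxis, :]                # [ num_lora_modules_layers = 1, D*Hi + Ho*D ]

# Matching lora_config row: [ module_id, layer_idx, adapter_size ].
lora_config = np.array([[ATTN_Q, layer_idx, D]], dtype=np.int32)

Each module/layer pair contributes one row to lora_weights and one matching row to lora_config.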

ci/L0_backend_trtllm/base_metrics_verification_tests.py

Lines changed: 10 additions & 2 deletions
@@ -25,13 +25,16 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 import json
+import os
 import sys
 from collections import defaultdict

 import numpy as np
 import requests

-sys.path.append("/opt/tritonserver/tensorrtllm_backend/tools/utils")
+BACKEND_ROOT = os.environ.get('BACKEND_ROOT',
+                              "/opt/tritonserver/tensorrtllm_backend")
+sys.path.append(os.path.join(BACKEND_ROOT, "tools/utils"))
 import unittest

 import utils

@@ -75,9 +78,14 @@ def _run_infer(self, client, prompts, output_lens):
                 utils.prepare_tensor("bad_words", bad_words_list, "http"),
                 utils.prepare_tensor("stop_words", stop_words_list, "http"),
             ]
+            # Request minimal outputs
+            outputs = utils.prepare_outputs("http")

             async_requests.append(
-                client.async_infer(model_name, inputs, request_id=str(i)))
+                client.async_infer(model_name,
+                                   inputs,
+                                   outputs=outputs,
+                                   request_id=str(i)))

         try:
             utils.get_http_results(async_requests)

ci/L0_backend_trtllm/generate_engines.sh

Lines changed: 4 additions & 3 deletions
@@ -25,9 +25,10 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-BASE_DIR=/opt/tritonserver/tensorrtllm_backend/ci/L0_backend_trtllm
-GPT_DIR=/opt/tritonserver/tensorrtllm_backend/tensorrt_llm/examples/gpt
-TRTLLM_DIR=/opt/tritonserver/tensorrtllm_backend/tensorrt_llm/
+BACKEND_ROOT=${BACKEND_ROOT:='/opt/tritonserver/tensorrtllm_backend'}
+BASE_DIR=${BACKEND_ROOT}/ci/L0_backend_trtllm
+GPT_DIR=${BACKEND_ROOT}/tensorrt_llm/examples/gpt
+TRTLLM_DIR=${BACKEND_ROOT}/tensorrt_llm/

 function build_base_model {
     local NUM_GPUS=$1
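
Both CI changes replace the hard-coded /opt/tritonserver/tensorrtllm_backend path with a BACKEND_ROOT override. A hypothetical sketch of driving the CI scripts from Python with a non-default checkout; the override path and relative script paths are examples (assuming the repo root as working directory), and whether the test module runs this way depends on the surrounding CI harness.

import os
import subprocess

# Example override; the default remains /opt/tritonserver/tensorrtllm_backend.
env = dict(os.environ, BACKEND_ROOT="/workspace/tensorrtllm_backend")

# Engine generation derives BASE_DIR/GPT_DIR/TRTLLM_DIR from BACKEND_ROOT.
subprocess.run(["bash", "ci/L0_backend_trtllm/generate_engines.sh"], env=env, check=True)

# The metrics test appends ${BACKEND_ROOT}/tools/utils to sys.path before importing utils.
subprocess.run(
    ["python3", "-m", "unittest", "base_metrics_verification_tests"],
    cwd="ci/L0_backend_trtllm", env=env, check=True)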
