
Commit 0b2c6a8

Update TensorRT-LLM Backend (triton-inference-server#117)
* Update TensorRT-LLM Triton Backend
1 parent 7a92137 commit 0b2c6a8

File tree

13 files changed: +630 -253 lines changed


all_models/inflight_batcher_llm/ensemble/config.pbtxt

Lines changed: 28 additions & 8 deletions

@@ -26,7 +26,7 @@
 
 name: "ensemble"
 platform: "ensemble"
-max_batch_size: 128
+max_batch_size: ${triton_max_batch_size}
 input [
   {
     name: "text_input"
@@ -42,11 +42,13 @@ input [
     name: "bad_words"
     data_type: TYPE_STRING
     dims: [ -1 ]
+    optional: true
   },
   {
     name: "stop_words"
     data_type: TYPE_STRING
     dims: [ -1 ]
+    optional: true
   },
   {
     name: "end_id"
@@ -60,12 +62,6 @@ input [
     dims: [ 1 ]
     optional: true
   },
-  {
-    name: "embedding_bias"
-    data_type: TYPE_FP16
-    dims: [ -1 ]
-    optional: true
-  },
   {
     name: "top_k"
     data_type: TYPE_UINT32
@@ -137,6 +133,18 @@ input [
     data_type: TYPE_UINT32
     dims: [ 1 ]
     optional: true
+  },
+  {
+    name: "embedding_bias_words"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+    optional: true
+  },
+  {
+    name: "embedding_bias_weights"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+    optional: true
   }
 ]
 output [
@@ -167,6 +175,14 @@ ensemble_scheduling {
      key: "STOP_WORDS_DICT"
      value: "stop_words"
    }
+   input_map {
+     key: "EMBEDDING_BIAS_WORDS"
+     value: "embedding_bias_words"
+   }
+   input_map {
+     key: "EMBEDDING_BIAS_WEIGHTS"
+     value: "embedding_bias_weights"
+   }
    output_map {
      key: "REQUEST_INPUT_LEN"
      value: "_REQUEST_INPUT_LEN"
@@ -187,6 +203,10 @@ ensemble_scheduling {
      key: "BAD_WORDS_IDS"
      value: "_BAD_WORDS_IDS"
    }
+   output_map {
+     key: "EMBEDDING_BIAS"
+     value: "_EMBEDDING_BIAS"
+   }
  },
  {
    model_name: "tensorrt_llm"
@@ -213,7 +233,7 @@ ensemble_scheduling {
    }
    input_map {
      key: "embedding_bias"
-     value: "embedding_bias"
+     value: "_EMBEDDING_BIAS"
    }
    input_map {
      key: "runtime_top_k"

all_models/inflight_batcher_llm/postprocessing/config.pbtxt

Lines changed: 1 addition & 1 deletion

@@ -26,7 +26,7 @@
 
 name: "postprocessing"
 backend: "python"
-max_batch_size: 128
+max_batch_size: ${triton_max_batch_size}
 input [
   {
     name: "TOKENS_BATCH"

all_models/inflight_batcher_llm/preprocessing/1/model.py

Lines changed: 86 additions & 12 deletions

@@ -78,17 +78,26 @@ def initialize(self, args):
             add_special_tokens=False)[0]
 
         # Parse model output configs and convert Triton types to numpy types
-        input_names = [
+        output_names = [
             "INPUT_ID", "REQUEST_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS"
         ]
+        input_names = ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"]
         for input_name in input_names:
             setattr(
                 self,
                 input_name.lower() + "_dtype",
                 pb_utils.triton_string_to_numpy(
-                    pb_utils.get_output_config_by_name(
+                    pb_utils.get_input_config_by_name(
                         model_config, input_name)['data_type']))
 
+        for output_name in output_names:
+            setattr(
+                self,
+                output_name.lower() + "_dtype",
+                pb_utils.triton_string_to_numpy(
+                    pb_utils.get_output_config_by_name(
+                        model_config, output_name)['data_type']))
+
     def execute(self, requests):
         """`execute` must be implemented in every Python model. `execute`
         function receives a list of pb_utils.InferenceRequest as the only
@@ -113,23 +122,54 @@ def execute(self, requests):
 
         # Every Python backend must iterate over everyone of the requests
        # and create a pb_utils.InferenceResponse for each of them.
+        logger = pb_utils.Logger
         for idx, request in enumerate(requests):
             # Get input tensors
             query = pb_utils.get_input_tensor_by_name(request,
                                                       'QUERY').as_numpy()
+            batch_dim = query.shape[0]
+            if batch_dim != 1:
+
+                err_str = "Inflight batching backend expects requests with batch size of 1."
+                logger.log_error(err_str)
+                responses.append(
+                    pb_utils.InferenceResponse(
+                        output_tensors=[],
+                        error=pb_utils.TritonError(err_str)))
+                continue
+
             request_output_len = pb_utils.get_input_tensor_by_name(
                 request, 'REQUEST_OUTPUT_LEN').as_numpy()
 
             bad_words_dict = pb_utils.get_input_tensor_by_name(
-                request, 'BAD_WORDS_DICT').as_numpy()
+                request, 'BAD_WORDS_DICT')
+            if bad_words_dict is not None:
+                bad_words_dict = bad_words_dict.as_numpy()
+
             stop_words_dict = pb_utils.get_input_tensor_by_name(
-                request, 'STOP_WORDS_DICT').as_numpy()
+                request, 'STOP_WORDS_DICT')
+            if stop_words_dict is not None:
+                stop_words_dict = stop_words_dict.as_numpy()
+
+            embedding_bias_words = pb_utils.get_input_tensor_by_name(
+                request, 'EMBEDDING_BIAS_WORDS')
+            if embedding_bias_words is not None:
+                embedding_bias_words = embedding_bias_words.as_numpy()
+
+            embedding_bias_weights = pb_utils.get_input_tensor_by_name(
+                request, 'EMBEDDING_BIAS_WEIGHTS')
+            if embedding_bias_weights is not None:
+                embedding_bias_weights = embedding_bias_weights.as_numpy()
 
             # Preprocessing input data.
             input_id, request_input_len = self._create_request(query)
             bad_words = self._to_word_list_format(bad_words_dict)
             stop_words = self._to_word_list_format(stop_words_dict)
 
+            embedding_bias = self._get_embedding_bias(
+                embedding_bias_words, embedding_bias_weights,
+                self.embedding_bias_weights_dtype)
+
             # Create output tensors. You need pb_utils.Tensor
             # objects to create pb_utils.InferenceResponse.
             input_id_tensor = pb_utils.Tensor(
@@ -142,17 +182,13 @@ def execute(self, requests):
             bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words)
             stop_words_ids_tensor = pb_utils.Tensor('STOP_WORDS_IDS',
                                                     stop_words)
+            embedding_bias_tensor = pb_utils.Tensor('EMBEDDING_BIAS',
+                                                    embedding_bias)
 
-            # Create InferenceResponse. You can set an error here in case
-            # there was a problem with handling this inference request.
-            # Below is an example of how you can set errors in inference
-            # response:
-            #
-            # pb_utils.InferenceResponse(
-            #    output_tensors=..., TritonError("An error occurred"))
             inference_response = pb_utils.InferenceResponse(output_tensors=[
                 input_id_tensor, bad_words_ids_tensor, stop_words_ids_tensor,
-                request_input_len_tensor, request_output_len_tensor
+                request_input_len_tensor, request_output_len_tensor,
+                embedding_bias_tensor
             ])
             responses.append(inference_response)
 
@@ -200,6 +236,10 @@ def _to_word_list_format(self, word_dict: List[List[str]]):
         '''
         assert self.tokenizer != None, "need to set tokenizer"
 
+        if word_dict is None:
+            # Return an empty array of shape (1,2,0)
+            return np.empty([1, 2, 0], dtype="int32")
+
         flat_ids = []
         offsets = []
         for word_dict_item in word_dict:
@@ -232,3 +272,37 @@ def _to_word_list_format(self, word_dict: List[List[str]]):
 
         return np.array([flat_ids, offsets], dtype="int32").transpose(
             (1, 0, 2))
+
+    def _get_embedding_bias(self, embedding_bias_words, embedding_bias_weights,
+                            bias_dtype):
+
+        assert self.tokenizer != None, "need to set tokenizer"
+
+        if embedding_bias_words is None or embedding_bias_weights is None:
+            return np.empty([1, 0], dtype=self.embedding_bias_weights_dtype)
+
+        batch_embedding_bias = []
+        for words, weights in zip(embedding_bias_words,
+                                  embedding_bias_weights):
+
+            vocab_size = self.tokenizer.vocab_size
+            embedding_bias = [0.] * vocab_size
+
+            assert len(words) == len(
+                weights
+            ), "Embedding bias words must have same dimension as embedding bias weights"
+
+            for word, weight in zip(words, weights):
+                if isinstance(word, bytes):
+                    word = word.decode()
+                ids = self.tokenizer.encode(word)
+
+                if len(ids) == 0:
+                    continue
+
+                for id in ids:
+                    embedding_bias[id] += weight
+
+            batch_embedding_bias.append(np.array(embedding_bias))
+
+        return np.array(batch_embedding_bias, dtype=bias_dtype)
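
For context, the new _get_embedding_bias helper turns the per-request (words, weights) pairs into a dense, vocab-sized FP32 vector: every token id produced by tokenizing a bias word gets that word's weight added to it. The snippet below is a self-contained sketch of that expansion with a toy word-to-id table standing in for the real Hugging Face tokenizer; the table and the vocabulary size are illustrative assumptions.

import numpy as np

# Illustrative stand-in for the real tokenizer: a fixed word -> token-id table.
TOY_VOCAB = {"deep": [7, 12], "shallow": [3], "learning": [5]}
VOCAB_SIZE = 16

def get_embedding_bias(words, weights, bias_dtype=np.float32):
    """Expand (words, weights) pairs into a [batch, vocab_size] bias tensor,
    mirroring the preprocessor's _get_embedding_bias logic."""
    if words is None or weights is None:
        return np.empty([1, 0], dtype=bias_dtype)

    batch_bias = []
    for word_row, weight_row in zip(words, weights):
        bias = np.zeros(VOCAB_SIZE, dtype=bias_dtype)
        for word, weight in zip(word_row, weight_row):
            # Add the word's weight at every token id it maps to.
            for token_id in TOY_VOCAB.get(word, []):
                bias[token_id] += weight
        batch_bias.append(bias)
    return np.stack(batch_bias)

# One request (batch size 1): boost "deep", penalize "shallow".
bias = get_embedding_bias([["deep", "shallow"]], [[5.0, -5.0]])
print(bias.shape)              # (1, 16)
print(bias[0, 7], bias[0, 3])  # 5.0 -5.0

When either input is missing, the real helper returns an empty [1, 0] array, which is why the downstream embedding_bias input can remain optional.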

all_models/inflight_batcher_llm/preprocessing/config.pbtxt

Lines changed: 22 additions & 3 deletions

@@ -26,27 +26,41 @@
 
 name: "preprocessing"
 backend: "python"
-max_batch_size: 128
+max_batch_size: ${triton_max_batch_size}
 input [
   {
     name: "QUERY"
     data_type: TYPE_STRING
     dims: [ -1 ]
   },
+  {
+    name: "REQUEST_OUTPUT_LEN"
+    data_type: TYPE_UINT32
+    dims: [ -1 ]
+  },
   {
     name: "BAD_WORDS_DICT"
     data_type: TYPE_STRING
     dims: [ -1 ]
+    optional: true
   },
   {
     name: "STOP_WORDS_DICT"
     data_type: TYPE_STRING
     dims: [ -1 ]
+    optional: true
   },
   {
-    name: "REQUEST_OUTPUT_LEN"
-    data_type: TYPE_UINT32
+    name: "EMBEDDING_BIAS_WORDS"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+    optional: true
+  },
+  {
+    name: "EMBEDDING_BIAS_WEIGHTS"
+    data_type: TYPE_FP32
     dims: [ -1 ]
+    optional: true
   }
 ]
 output [
@@ -70,6 +84,11 @@ output [
     data_type: TYPE_INT32
     dims: [ 2, -1 ]
   },
+  {
+    name: "EMBEDDING_BIAS"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+  },
   {
     name: "REQUEST_OUTPUT_LEN"
     data_type: TYPE_UINT32

all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt

Lines changed: 19 additions & 3 deletions

@@ -26,17 +26,23 @@
 
 name: "tensorrt_llm"
 backend: "tensorrtllm"
-max_batch_size: 128
+max_batch_size: ${triton_max_batch_size}
 
 model_transaction_policy {
   decoupled: ${decoupled_mode}
 }
 
+dynamic_batching {
+    preferred_batch_size: [ ${triton_max_batch_size} ]
+    max_queue_delay_microseconds: ${max_queue_delay_microseconds}
+}
+
 input [
   {
     name: "input_ids"
     data_type: TYPE_INT32
     dims: [ -1 ]
+    allow_ragged_batch: true
   },
   {
     name: "input_lengths"
@@ -68,18 +74,21 @@ input [
     data_type: TYPE_INT32
     dims: [ 2, -1 ]
     optional: true
+    allow_ragged_batch: true
   },
   {
     name: "bad_words_list"
     data_type: TYPE_INT32
     dims: [ 2, -1 ]
     optional: true
+    allow_ragged_batch: true
   },
   {
     name: "embedding_bias"
-    data_type: TYPE_FP16
+    data_type: TYPE_FP32
     dims: [ -1 ]
     optional: true
+    allow_ragged_batch: true
   },
   {
     name: "beam_width"
@@ -161,6 +170,7 @@ input [
     data_type: TYPE_FP16
     dims: [ -1, -1 ]
     optional: true
+    allow_ragged_batch: true
   },
   {
     name: "prompt_vocab_size"
@@ -191,7 +201,7 @@ instance_group [
 parameters: {
   key: "max_beam_width"
   value: {
-    string_value: "1"
+    string_value: "${max_beam_width}"
   }
 }
 parameters: {
@@ -218,6 +228,12 @@ parameters: {
     string_value: "${max_tokens_in_paged_kv_cache}"
   }
 }
+parameters: {
+  key: "max_kv_cache_length"
+  value: {
+    string_value: "${max_kv_cache_length}"
+  }
+}
 parameters: {
   key: "batch_scheduler_policy"
   value: {
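
All of the ${...} placeholders introduced above (triton_max_batch_size, decoupled_mode, max_queue_delay_microseconds, max_beam_width, max_kv_cache_length, and so on) must be replaced with concrete values before Triton can load the model repository; the backend repository provides tools/fill_template.py for that step. The snippet below is an equivalent minimal sketch using Python's string.Template; the values are illustrative assumptions, and the full config contains additional placeholders beyond the ones touched by this diff.

from pathlib import Path
from string import Template

# Illustrative values; pick them to match the engine you built and deployed.
substitutions = {
    "triton_max_batch_size": "64",
    "decoupled_mode": "false",
    "max_queue_delay_microseconds": "100",
    "max_beam_width": "1",
    "max_tokens_in_paged_kv_cache": "8192",
    "max_kv_cache_length": "4096",
}

config_path = Path("all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt")
text = config_path.read_text()

# The ${name} placeholders match Python's string.Template syntax, so
# safe_substitute fills the keys we know about and leaves any remaining
# placeholders (for example the engine path parameters) untouched for a
# later pass.
config_path.write_text(Template(text).safe_substitute(substitutions))

Because safe_substitute leaves unknown placeholders alone, the file can be filled in several passes or by the repository's own helper instead.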
