README.md: 50 additions & 1 deletion
@@ -218,7 +218,7 @@ The following table shows the fields that may need to be modified before deployment:
|`max_beam_width`| Optional (default=1). The maximum beam width that any request may ask for when using beam search.|
|`max_tokens_in_paged_kv_cache`| Optional (default=unspecified). The maximum size of the KV cache in number of tokens. If unspecified, the value is interpreted as 'infinite'. The KV cache allocation is the minimum of `max_tokens_in_paged_kv_cache` and the value derived from `kv_cache_free_gpu_mem_fraction` below.|
|`max_attention_window_size`| Optional (default=max_sequence_length). When using techniques like sliding window attention, the maximum number of tokens that are attended to in order to generate one token. By default, all tokens in the sequence are attended to.|
-|`kv_cache_free_gpu_mem_fraction`| Optional (default=0.85). Set to a number between 0 and 1 to indicate the maximum fraction of GPU memory (after loading the model) that may be used for the KV cache.|
+|`kv_cache_free_gpu_mem_fraction`| Optional (default=0.9). Set to a number between 0 and 1 to indicate the maximum fraction of GPU memory (after loading the model) that may be used for the KV cache.|
|`max_num_sequences`| Optional (default=`max_batch_size` if `enable_trt_overlap` is `false` and `2 * max_batch_size` if `enable_trt_overlap` is `true`, where `max_batch_size` is the TRT engine maximum batch size). Maximum number of sequences that the in-flight batching scheme can maintain state for.|
|`enable_trt_overlap`| Optional (default=`true`). Set to `true` to partition available requests into 2 'microbatches' that can be run concurrently to hide exposed CPU runtime.|
|`exclude_input_in_output`| Optional (default=`false`). Set to `true` to return only completion tokens in a response. Set to `false` to return the prompt tokens concatenated with the generated tokens.|
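
For reference, these fields are typically set as string-valued `parameters` entries in the `tensorrt_llm` model's `config.pbtxt`. A minimal sketch of how two of them might look (the values below are placeholders, not recommendations):

```
# Placeholder: cap the KV cache at 4096 tokens; leave unset for 'infinite'
parameters: {
  key: "max_tokens_in_paged_kv_cache"
  value: {
    string_value: "4096"
  }
}
# Placeholder: allow up to 90% of free GPU memory (after model load) for the KV cache
parameters: {
  key: "kv_cache_free_gpu_mem_fraction"
  value: {
    string_value: "0.9"
  }
}
```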
@@ -346,6 +346,7 @@ He was a member of the French Academy of Sciences and the French Academy of Arts
Soyer was a member of the French Academy of Sciences and
```

+#### Early stopping
You can also stop the generation process early by using the `--stop-after-ms`
option to send a stop request after a few milliseconds:
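
As a hedged sketch of what such a request could look like with the in-flight batching client: only `--stop-after-ms` comes from the text above; the script path, `--request-output-len`, and the tokenizer path are assumptions for illustration and may differ between versions.

```bash
# Ask for up to 200 output tokens, but send a stop request after 200 ms
# (only --stop-after-ms is taken from the text above; the other flags are assumptions)
python3 inflight_batcher_llm/client/inflight_batcher_llm_client.py \
    --stop-after-ms 200 \
    --request-output-len 200 \
    --tokenizer-dir /path/to/tokenizer
```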
@@ -357,6 +358,54 @@ You will find that the generation process is stopped early and therefore the
number of generated tokens is lower than 200. You can have a look at the
client code to see how early stopping is achieved.
+If you want to get context logits and/or generation logits, you need to enable `--gather_context_logits` and/or `--gather_generation_logits` when building the engine (or `--gather_all_token_logits` to enable both at the same time). For more details about these two flags, please refer to [build.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/gpt/build.py) or [gpt_runtime](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/gpt_runtime.md).
+
+After launching the server, you can retrieve the logits by passing the corresponding parameters `--return-context-logits` and/or `--return-generation-logits` to the client scripts (`end_to_end_grpc_client.py` and `inflight_batcher_llm_client.py`). For example:
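
A hedged sketch of the two steps: the gather/return flags are the ones named above, while the remaining build and client options (model paths, output length) are placeholders rather than the exact commands from this README.

```bash
# 1. Build the engine with context and generation logits gathering enabled
#    (paths and the other build options are placeholders; see build.py for the full list)
python3 build.py --model_dir /path/to/hf_model \
                 --output_dir /path/to/engines \
                 --gather_context_logits \
                 --gather_generation_logits

# 2. Query the deployed model and request the logits in the response
#    (flags other than the two --return-* options are illustrative)
python3 inflight_batcher_llm/client/inflight_batcher_llm_client.py \
    --request-output-len 20 \
    --return-context-logits \
    --return-generation-logits
```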