Update TensorRT-LLM backend (triton-inference-server#454)
* Update TensorRT-LLM backend

---------

Co-authored-by: XiaobingSuper <[email protected]>
kaiyux and XiaobingSuper authored May 14, 2024
1 parent e239adc commit ae52bce
Showing 27 changed files with 1,439 additions and 858 deletions.
10 changes: 5 additions & 5 deletions README.md
@@ -70,10 +70,10 @@ The below commands will build the same Triton TRT-LLM container as the one on th
 # Prepare the TRT-LLM base image using the dockerfile from tensorrtllm_backend.
 cd tensorrtllm_backend
 # Specify the build args for the dockerfile.
-BASE_IMAGE=nvcr.io/nvidia/pytorch:24.02-py3
-TRT_VERSION=9.3.0.1
-TRT_URL_x86=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/9.3.0/tensorrt-9.3.0.1.linux.x86_64-gnu.cuda-12.2.tar.gz
-TRT_URL_ARM=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/9.3.0/tensorrt-9.3.0.1.ubuntu-22.04.aarch64-gnu.cuda-12.2.tar.gz
+BASE_IMAGE=nvcr.io/nvidia/pytorch:24.03-py3
+TRT_VERSION=10.0.1.6
+TRT_URL_x86=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-12.4.tar.gz
+TRT_URL_ARM=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-10.0.1.6.ubuntu-22.04.aarch64-gnu.cuda-12.4.tar.gz
 
 docker build -t trtllm_base \
     --build-arg BASE_IMAGE="${BASE_IMAGE}" \
@@ -297,9 +297,9 @@ The following table shows the fields that may need to be modified before deployment:
| `max_tokens_in_paged_kv_cache` | Optional (default=unspecified). The maximum size of the KV cache in number of tokens. If unspecified, the value is interpreted as 'infinite'. The KV cache allocation is the minimum of max_tokens_in_paged_kv_cache and the value derived from kv_cache_free_gpu_mem_fraction below. |
| `max_attention_window_size` | Optional (default=max_sequence_length). When using techniques like sliding window attention, the maximum number of tokens that are attended to in order to generate one token. The default attends to all tokens in the sequence. |
| `kv_cache_free_gpu_mem_fraction` | Optional (default=0.9). Set to a number between 0 and 1 to indicate the maximum fraction of GPU memory (after loading the model) that may be used for KV cache.|
| `enable_trt_overlap` | Optional (default=`false`). Set to `true` to partition available requests into 2 'microbatches' that can be run concurrently to hide exposed CPU runtime |
| `exclude_input_in_output` | Optional (default=`false`). Set to `true` to only return completion tokens in a response. Set to `false` to return the prompt tokens concatenated with the generated tokens |
| `cancellation_check_period_ms` | Optional (default=100). The time for the cancellation check thread to sleep before doing the next check. It checks whether any of the currently active requests have been cancelled through Triton and prevents further execution of them. |
| `stats_check_period_ms` | Optional (default=100). The time for the statistics reporting thread to sleep before doing the next check. |
| `iter_stats_max_iterations` | Optional (default=executor::kDefaultIterStatsMaxIterations). The number of iteration stats to keep. |
| `request_stats_max_iterations` | Optional (default=executor::kDefaultRequestStatsMaxIterations). The number of request stats to keep. |
| `normalize_log_probs` | Optional (default=`true`). Set to `false` to skip normalization of `output_log_probs` |
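Each of these fields is a plain string parameter in the tensorrt_llm model's config.pbtxt. As a minimal sketch (the value shown is simply the documented default, not a tuned recommendation), a single entry follows the same parameters block style that appears later in this diff:

parameters {
  key: "kv_cache_free_gpu_mem_fraction"
  value: {
    string_value: "0.9"
  }
}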
27 changes: 22 additions & 5 deletions all_models/inflight_batcher_llm/postprocessing/1/model.py
@@ -55,11 +55,28 @@ def initialize(self, args):
         model_config = json.loads(args['model_config'])
         tokenizer_dir = model_config['parameters']['tokenizer_dir'][
             'string_value']
-        self.skip_special_tokens = model_config['parameters'].get(
-            'skip_special_tokens',
-            {'string_value': "true"})['string_value'].lower() in [
-                'true', '1', 't', 'y', 'yes'
-            ]
+
+        skip_special_tokens = model_config['parameters'].get(
+            'skip_special_tokens')
+        if skip_special_tokens is not None:
+            skip_special_tokens_str = skip_special_tokens[
+                'string_value'].lower()
+            if skip_special_tokens_str in [
+                    'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no'
+            ]:
+                self.skip_special_tokens = skip_special_tokens_str in [
+                    'true', '1', 't', 'y', 'yes'
+                ]
+            else:
+                print(
+                    f"[TensorRT-LLM][WARNING] 'skip_special_tokens' is not set correctly (value is {skip_special_tokens['string_value']}). Defaulting to True."
+                )
+                self.skip_special_tokens = True
+        else:
+            print(
+                "[TensorRT-LLM][WARNING] 'skip_special_tokens' is not set. Defaulting to True."
+            )
+            self.skip_special_tokens = True
 
         self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
                                                        legacy=False,
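The same string-to-boolean parsing pattern is added to the preprocessing model further down. As a standalone illustration (not part of this commit; the helper name is hypothetical), the logic can be expressed as a small reusable function:

# Sketch only: mirrors the boolean-parameter parsing added above for Triton
# string parameters such as 'skip_special_tokens'. Names are illustrative.
def parse_bool_param(parameters, name, default=True):
    """Coerce a Triton 'string_value' parameter to a bool, with a fallback."""
    param = parameters.get(name)
    if param is None:
        print(f"[TensorRT-LLM][WARNING] '{name}' is not set. "
              f"Defaulting to {default}.")
        return default
    value = param['string_value'].lower()
    if value in ['true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no']:
        return value in ['true', '1', 't', 'y', 'yes']
    print(f"[TensorRT-LLM][WARNING] '{name}' is not set correctly "
          f"(value is {param['string_value']}). Defaulting to {default}.")
    return default

# Usage against the parsed model_config inside initialize():
# self.skip_special_tokens = parse_bool_param(model_config['parameters'],
#                                             'skip_special_tokens')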
@@ -101,7 +101,7 @@ parameters {
 parameters {
   key: "skip_special_tokens"
   value: {
-    string_value: "True"
+    string_value: "${skip_special_tokens}"
   }
 }
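${skip_special_tokens} is a template placeholder that is substituted with a concrete value when the config is prepared for deployment. After filling, the entry would read, for example (an illustrative value, parsed by the postprocessing model shown above):

parameters {
  key: "skip_special_tokens"
  value: {
    string_value: "true"
  }
}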

26 changes: 21 additions & 5 deletions all_models/inflight_batcher_llm/preprocessing/1/model.py
@@ -56,11 +56,27 @@ def initialize(self, args):
         model_config = json.loads(args['model_config'])
         tokenizer_dir = model_config['parameters']['tokenizer_dir'][
             'string_value']
-        self.add_special_tokens = model_config['parameters'].get(
-            'add_special_tokens',
-            {'string_value': "false"})['string_value'].lower() in [
-                'true', '1', 't', 'y', 'yes'
-            ]
+
+        add_special_tokens = model_config['parameters'].get(
+            'add_special_tokens')
+        if add_special_tokens is not None:
+            add_special_tokens_str = add_special_tokens['string_value'].lower()
+            if add_special_tokens_str in [
+                    'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no'
+            ]:
+                self.add_special_tokens = add_special_tokens_str in [
+                    'true', '1', 't', 'y', 'yes'
+                ]
+            else:
+                print(
+                    f"[TensorRT-LLM][WARNING] 'add_special_tokens' is not set correctly (value is {add_special_tokens['string_value']}). Defaulting to True."
+                )
+                self.add_special_tokens = True
+        else:
+            print(
+                "[TensorRT-LLM][WARNING] 'add_special_tokens' is not set. Defaulting to True."
+            )
+            self.add_special_tokens = True
 
         self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
                                                        legacy=False,
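For context on what this flag controls: when a prompt is encoded with a Hugging Face tokenizer, add_special_tokens decides whether the tokenizer's special tokens (BOS/EOS, [CLS]/[SEP], and so on) are added around the prompt ids. A small illustration, not part of this commit; "bert-base-uncased" is only a convenient tokenizer whose special tokens are easy to see, the deployed model would use tokenizer_dir instead:

from transformers import AutoTokenizer

# Illustrative tokenizer choice; the backend loads its tokenizer from tokenizer_dir.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

ids_with = tokenizer.encode("Hello world", add_special_tokens=True)
ids_without = tokenizer.encode("Hello world", add_special_tokens=False)

print(ids_with)      # includes the ids for [CLS] and [SEP]
print(ids_without)   # word-piece ids only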