
Commit

Update TensorRT-LLM backend (triton-inference-server#180)
kaiyux authored Dec 1, 2023
1 parent e8ae70c commit 171ed05
Showing 26 changed files with 3,115 additions and 1,320 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -7,6 +7,5 @@ build/
*.so
*.egg-info/
.coverage
-*.csv
*.onnx
tmp/
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -44,5 +44,6 @@ repos:
  rev: v2.2.4
  hooks:
  - id: codespell
+   exclude: tools/dataset/
    args:
    - --skip=".git,tensorrt_llm"
23 changes: 14 additions & 9 deletions README.md
@@ -162,16 +162,16 @@ python3 build.py --model_dir=./c-model/gpt2/4-gpu/ \

### Create the model repository

-There are four models in the [`all_models/inflight_batcher_llm`](./all_models/inflight_batcher_llm/)
+There are five models in the [`all_models/inflight_batcher_llm`](./all_models/inflight_batcher_llm/)
directory that will be used in this example:
- "preprocessing": This model is used for tokenizing, meaning the conversion from prompts(string) to input_ids(list of ints).
- "tensorrt_llm": This model is a wrapper of your TensorRT-LLM model and is used for inferencing
- "postprocessing": This model is used for de-tokenizing, meaning the conversion from output_ids(list of ints) to outputs(string).
- "ensemble": This model is used to chain the three models above together:
preprocessing -> tensorrt_llm -> postprocessing
- "ensemble": This model can be used to chain the preprocessing, tensorrt_llm and postprocessing models together.
- "tensorrt_llm_bls": This model can also be used to chain the preprocessing, tensorrt_llm and postprocessing models together. The BLS model has an optional parameter `accumulate_tokens` which can be used in streaming mode to call the preprocessing model with all accumulated tokens, instead of only one token. This might be necessary for certain tokenizers.

-To learn more about ensemble model, please see
-[here](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/architecture.md#ensemble-models).
+To learn more about ensemble and BLS models, please see the
+[Ensemble Models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/architecture.md#ensemble-models) and [Business Logic Scripting](https://github.com/triton-inference-server/python_backend#business-logic-scripting) sections of the Triton Inference Server documentation.
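
The list above describes the chaining only informally. As a rough sketch of what such BLS-style chaining looks like inside a Triton Python backend model, the snippet below calls the preprocessing, tensorrt_llm and postprocessing models in sequence with `pb_utils`. The tensor names (`QUERY`, `INPUT_ID`, `input_ids`, `output_ids`, `TOKENS_BATCH`, `OUTPUT`) and the exact input/output wiring are illustrative assumptions only; the real contract is defined by the `tensorrt_llm_bls` model in this repository.

```python
import triton_python_backend_utils as pb_utils


def chain_models(text_input_tensor):
    """Illustrative BLS-style chaining: preprocessing -> tensorrt_llm -> postprocessing.

    Tensor names used here are assumptions for the sketch, not the backend's
    actual contract; see the tensorrt_llm_bls model for the real implementation.
    """
    # 1. Tokenize the prompt with the "preprocessing" model.
    pre_response = pb_utils.InferenceRequest(
        model_name="preprocessing",
        requested_output_names=["INPUT_ID"],
        inputs=[pb_utils.Tensor("QUERY", text_input_tensor.as_numpy())]).exec()
    input_ids = pb_utils.get_output_tensor_by_name(pre_response, "INPUT_ID")

    # 2. Generate tokens with the "tensorrt_llm" model.
    llm_response = pb_utils.InferenceRequest(
        model_name="tensorrt_llm",
        requested_output_names=["output_ids"],
        inputs=[pb_utils.Tensor("input_ids", input_ids.as_numpy())]).exec()
    output_ids = pb_utils.get_output_tensor_by_name(llm_response, "output_ids")

    # 3. De-tokenize the generated ids with the "postprocessing" model.
    post_response = pb_utils.InferenceRequest(
        model_name="postprocessing",
        requested_output_names=["OUTPUT"],
        inputs=[pb_utils.Tensor("TOKENS_BATCH", output_ids.as_numpy())]).exec()
    return pb_utils.get_output_tensor_by_name(post_response, "OUTPUT")
```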

```bash
# Create the model repository that will be used by the Triton server
@@ -258,8 +258,8 @@ environment/container:
curl -X POST localhost:8000/v2/models/${MODEL_NAME}/generate -d '{"{PARAM1_KEY}": "{PARAM1_VALUE}", ... }'
```

-In the case of the models used in this example, you can replace MODEL_NAME with `ensemble`. Examining the
-ensemble model's config.pbtxt file, you can see that 4 parameters are required to generate a response
+In the case of the models used in this example, you can replace MODEL_NAME with `ensemble` or `tensorrt_llm_bls`. Examining the
+`ensemble` and `tensorrt_llm_bls` model's config.pbtxt file, you can see that 4 parameters are required to generate a response
for this model:

- "text_input": Input text to generate a response from
@@ -272,6 +272,11 @@ Therefore, we can query the server in the following way:
```bash
curl -X POST localhost:8000/v2/models/ensemble/generate -d '{"text_input": "What is machine learning?", "max_tokens": 20, "bad_words": "", "stop_words": ""}'
```
+if using the `ensemble` model or
+```
+curl -X POST localhost:8000/v2/models/tensorrt_llm_bls/generate -d '{"text_input": "What is machine learning?", "max_tokens": 20, "bad_words": "", "stop_words": ""}'
+```
+if using the `tensorrt_llm_bls` model.
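
The same request can also be issued from Python rather than curl. The snippet below is a small illustrative sketch using the `requests` package; the server address, model name and payload mirror the curl examples above, and the `text_output` field name is an assumption about the response shape.

```python
import requests

# Same payload as the curl examples above; swap "ensemble" for
# "tensorrt_llm_bls" to exercise the BLS model instead.
payload = {
    "text_input": "What is machine learning?",
    "max_tokens": 20,
    "bad_words": "",
    "stop_words": "",
}

response = requests.post(
    "http://localhost:8000/v2/models/ensemble/generate", json=payload)
response.raise_for_status()
# "text_output" is the assumed name of the generated-text field in the response.
print(response.json()["text_output"])
```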

Which should return a result similar to (formatted for readability):
```json
@@ -292,7 +297,7 @@ You can send requests to the "tensorrt_llm" model with the provided
as following:

```bash
-python3 inflight_batcher_llm/client/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer_dir /workspace/tensorrtllm_backend/tensorrt_llm/examples/gpt/gpt2
+python3 inflight_batcher_llm/client/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir /workspace/tensorrtllm_backend/tensorrt_llm/examples/gpt/gpt2
```

The result should be similar to the following:
@@ -323,7 +328,7 @@ Soyer was a member of the French Academy of Sciences and
You can also stop the generation process early by using the `--stop-after-ms` option to send a stop request after a few milliseconds:

```bash
-python inflight_batcher_llm/client/inflight_batcher_llm_client.py --stop-after-ms 200 --request-output-len 200 --tokenizer_dir /workspace/tensorrtllm_backend/tensorrt_llm/examples/gpt/gpt2
+python inflight_batcher_llm/client/inflight_batcher_llm_client.py --stop-after-ms 200 --request-output-len 200 --tokenizer-dir /workspace/tensorrtllm_backend/tensorrt_llm/examples/gpt/gpt2
```

You will find that the generation process is stopped early and therefore the number of generated tokens is lower than 200.
9 changes: 8 additions & 1 deletion all_models/inflight_batcher_llm/postprocessing/1/model.py
@@ -57,6 +57,11 @@ def initialize(self, args):
            'string_value']
        tokenizer_type = model_config['parameters']['tokenizer_type'][
            'string_value']
+        self.skip_special_tokens = model_config['parameters'].get(
+            'skip_special_tokens',
+            {'string_value': "true"})['string_value'].lower() in [
+                'true', '1', 't', 'y', 'yes'
+            ]

        if tokenizer_type == 't5':
            self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir,
@@ -168,6 +173,8 @@ def _postprocessing(self, tokens_batch, sequence_lengths):
        for batch_idx, beam_tokens in enumerate(tokens_batch):
            for beam_idx, tokens in enumerate(beam_tokens):
                seq_len = sequence_lengths[batch_idx][beam_idx]
-                output = self.tokenizer.decode(tokens[:seq_len])
+                output = self.tokenizer.decode(
+                    tokens[:seq_len],
+                    skip_special_tokens=self.skip_special_tokens)
                outputs.append(output.encode('utf8'))
        return outputs
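
To see what the new `skip_special_tokens` flag changes, the behaviour can be reproduced outside Triton with a plain Hugging Face tokenizer. This is only an illustrative sketch; GPT-2 is assumed as the tokenizer to match the GPT example in the README.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Token ids that end with GPT-2's end-of-text special token.
ids = tokenizer.encode("What is machine learning?") + [tokenizer.eos_token_id]

# skip_special_tokens=True (the default used in the code above) drops the
# special token from the decoded string; False keeps it verbatim.
print(tokenizer.decode(ids, skip_special_tokens=True))
print(tokenizer.decode(ids, skip_special_tokens=False))
```
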
9 changes: 8 additions & 1 deletion all_models/inflight_batcher_llm/postprocessing/config.pbtxt
@@ -81,9 +81,16 @@ parameters {
  }
}

+parameters {
+  key: "skip_special_tokens"
+  value: {
+    string_value: "True"
+  }
+}

instance_group [
  {
-   count: 1
+   count: ${postprocessing_instance_count}
    kind: KIND_CPU
  }
]
10 changes: 9 additions & 1 deletion all_models/inflight_batcher_llm/preprocessing/1/model.py
@@ -58,6 +58,11 @@ def initialize(self, args):
            'string_value']
        tokenizer_type = model_config['parameters']['tokenizer_type'][
            'string_value']
+        self.add_special_tokens = model_config['parameters'].get(
+            'add_special_tokens',
+            {'string_value': "false"})['string_value'].lower() in [
+                'true', '1', 't', 'y', 'yes'
+            ]

        if tokenizer_type == 't5':
            self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir,
@@ -207,7 +212,10 @@ def _create_request(self, query):
            query : batch string (2D numpy array)
        """
        start_ids = [
-            np.array(self.tokenizer.encode(s[0].decode())).astype(int)
+            np.array(
+                self.tokenizer.encode(
+                    s[0].decode(),
+                    add_special_tokens=self.add_special_tokens)).astype(int)
            for s in query
        ]
        start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int)
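
Analogously to the postprocessing change, the new `add_special_tokens` flag is simply forwarded to the standard Hugging Face `encode` argument. A minimal illustration follows; `t5-small` is only an example checkpoint, chosen because the preprocessing model supports a T5 tokenizer type (see above) and T5 actually appends a special token on encode.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")

# With add_special_tokens=True the T5 tokenizer appends its EOS token (</s>);
# with False (the default used in the code above) the prompt is tokenized verbatim.
print(tokenizer.encode("What is machine learning?", add_special_tokens=True))
print(tokenizer.encode("What is machine learning?", add_special_tokens=False))
```
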
9 changes: 8 additions & 1 deletion all_models/inflight_batcher_llm/preprocessing/config.pbtxt
@@ -110,9 +110,16 @@ parameters {
  }
}

+parameters {
+  key: "add_special_tokens"
+  value: {
+    string_value: "False"
+  }
+}

instance_group [
  {
-   count: 1
+   count: ${preprocessing_instance_count}
    kind: KIND_CPU
  }
]