Commit 37ed967

Update TensorRT-LLM backend (triton-inference-server#142)
* Update TensorRT-LLM backend
1 parent 0b2c6a8 commit 37ed967

File tree

15 files changed: 324 additions & 295 deletions


all_models/gpt/preprocessing/1/model.py

Lines changed: 13 additions & 15 deletions
@@ -1,5 +1,4 @@
 # -*- coding: utf-8 -*-
-import csv
 import json
 from typing import List
 

@@ -164,30 +163,29 @@ def _create_request(self, query):
 
         return start_ids, start_lengths
 
-    def _to_word_list_format(self, word_dict: List[List[str]]):
+    def _to_word_list_format(self, word_lists: List[List[str | bytes]]):
         '''
-        format of word_dict
-            len(word_dict) should be same to batch_size
-            word_dict[i] means the words for batch i
-            len(word_dict[i]) must be 1, which means it only contains 1 string
-            This string can contains several sentences and split by ",".
-            For example, if word_dict[2] = " I am happy, I am sad", then this function will return
-            the ids for two short sentences " I am happy" and " I am sad".
+        word_lists format:
+            len(word_lists) == batch_size
+            word_lists[i] means the words associated to batch item i. A "word" may actually be any string. Like "lorem" or "lorem ipsum".
         '''
         assert self.tokenizer != None, "need to set tokenizer"
 
+        if word_lists is None:
+            # Return an empty array of shape (1,2,0)
+            return np.empty([1, 2, 0], dtype="int32")
+
         flat_ids = []
         offsets = []
-        for word_dict_item in word_dict:
+        for word_list in word_lists:
             item_flat_ids = []
             item_offsets = []
 
-            if isinstance(word_dict_item[0], bytes):
-                word_dict_item = [word_dict_item[0].decode()]
+            for word in word_list:
+                if isinstance(word, bytes):
+                    word = word.decode()
 
-            words = list(csv.reader(word_dict_item))[0]
-            for word in words:
-                ids = self.tokenizer.encode(word)
+                ids = self.tokenizer.encode(word, add_special_tokens=False)
 
                 if len(ids) == 0:
                     continue
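For readers skimming the diff: the hunk above replaces the old CSV-splitting behavior (one comma-separated string per batch item) with direct per-word encoding. The padding and stacking at the tail of the function fall outside this hunk, so the sketch below is a hedged standalone reconstruction, not the repository's code: `to_word_list_format` and `encode` are stand-in names, and the pad values (0 for ids, -1 for offsets) and the final (batch_size, 2, max_len) shape are assumptions inferred from the (1, 2, 0) empty-array return.

```python
# Hedged sketch of the new word-list encoding flow, assuming the function tail
# pads ids with 0 and offsets with -1, then stacks to (batch_size, 2, max_len).
from typing import Callable, List, Optional, Union

import numpy as np


def to_word_list_format(
    word_lists: Optional[List[List[Union[str, bytes]]]],
    encode: Callable[[str], List[int]],
) -> np.ndarray:
    if word_lists is None:
        # Mirrors the diff: empty tensor of shape (1, 2, 0)
        return np.empty([1, 2, 0], dtype="int32")

    flat_ids, offsets = [], []
    for word_list in word_lists:
        item_flat_ids: List[int] = []
        item_offsets: List[int] = []
        for word in word_list:
            if isinstance(word, bytes):
                word = word.decode()
            # e.g. encode = lambda w: tok.encode(w, add_special_tokens=False)
            ids = encode(word)
            if len(ids) == 0:
                continue
            item_flat_ids += ids
            item_offsets.append(len(item_flat_ids))  # cumulative end offset per word
        flat_ids.append(item_flat_ids)
        offsets.append(item_offsets)

    # Pad every batch item to the longest id list so the arrays stack cleanly.
    pad_to = max([len(ids) for ids in flat_ids] + [1])
    padded_ids = [ids + [0] * (pad_to - len(ids)) for ids in flat_ids]
    padded_offs = [offs + [-1] * (pad_to - len(offs)) for offs in offsets]
    return np.array([padded_ids, padded_offs], dtype="int32").transpose(1, 0, 2)
```

Under these assumptions, `to_word_list_format([["lorem", "lorem ipsum"]], enc)` yields one batch row whose first plane holds the concatenated token ids and whose second plane marks where each word's ids end.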

all_models/inflight_batcher_llm/preprocessing/1/model.py

Lines changed: 10 additions & 17 deletions
@@ -24,7 +24,6 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import csv
 import json
 from typing import List
 

@@ -224,35 +223,29 @@ def _create_request(self, query):
 
         return start_ids, start_lengths
 
-    def _to_word_list_format(self, word_dict: List[List[str]]):
+    def _to_word_list_format(self, word_lists: List[List[str | bytes]]):
         '''
-        format of word_dict
-            len(word_dict) should be same to batch_size
-            word_dict[i] means the words for batch i
-            len(word_dict[i]) must be 1, which means it only contains 1 string
-            This string can contains several sentences and split by ",".
-            For example, if word_dict[2] = " I am happy, I am sad", then this function will return
-            the ids for two short sentences " I am happy" and " I am sad".
+        word_lists format:
+            len(word_lists) == batch_size
+            word_lists[i] means the words associated to batch item i. A "word" may actually be any string. Like "lorem" or "lorem ipsum".
         '''
         assert self.tokenizer != None, "need to set tokenizer"
 
-        if word_dict is None:
+        if word_lists is None:
             # Return an empty array of shape (1,2,0)
             return np.empty([1, 2, 0], dtype="int32")
 
         flat_ids = []
         offsets = []
-        for word_dict_item in word_dict:
+        for word_list in word_lists:
             item_flat_ids = []
             item_offsets = []
 
-            if isinstance(word_dict_item[0], bytes):
-                word_dict_item = [word_dict_item[0].decode()]
-
-            words = list(csv.reader(word_dict_item))[0]
-            for word in words:
-                ids = self.tokenizer.encode(word)
+            for word in word_list:
+                if isinstance(word, bytes):
+                    word = word.decode()
 
+                ids = self.tokenizer.encode(word, add_special_tokens=False)
                 if len(ids) == 0:
                     continue
 
ci/L0_backend_trtllm/test.sh

Lines changed: 4 additions & 4 deletions
@@ -136,15 +136,15 @@ if [ "$WAIT_RET" != "0" ]; then
 fi
 
 set -e
-python3 ${TOOLS_DIR}/inflight_batcher_llm/identity_test.py \
+python3 ${TOOLS_DIR}/inflight_batcher_llm/benchmark_core_model.py \
     --max-input-len=500 \
     dataset \
     --dataset=${DATASET} \
     --tokenizer-dir=${TOKENIZER_DIR}
 
 if [ $? -ne 0 ]; then
     cat $SERVER_LOG
-    echo -e "\n***\n*** Error executing inflight batching identity test: line ${LINENO}\n***"
+    echo -e "\n***\n*** Error executing inflight batching benchmark_core_model: line ${LINENO}\n***"
     RET=1
 fi
 set +e
@@ -180,14 +180,14 @@ if [ "$WAIT_RET" != "0" ]; then
 fi
 
 set -e
-python3 ${TOOLS_DIR}/inflight_batcher_llm/identity_test.py \
+python3 ${TOOLS_DIR}/inflight_batcher_llm/benchmark_core_model.py \
     --max-input-len=500 \
     --dataset=${DATASET} \
     --tokenizer-dir=${TOKENIZER_DIR}
 
 if [ $? -ne 0 ]; then
     cat $SERVER_LOG
-    echo -e "\n***\n*** Error executing inflight batching identity test: line ${LINENO}\n***"
+    echo -e "\n***\n*** Error executing inflight batching benchmark_core_model: line ${LINENO}\n***"
     RET=1
 fi
 set +e

ci/README.md

Lines changed: 5 additions & 5 deletions
@@ -47,7 +47,7 @@ cd /tensorrtllm_backend/ci/<test directory>
 bash -x ./test.sh
 ```
 
-## Run the e2e/identity test to benchmark
+## Run the e2e/benchmark_core_model to benchmark
 
 These two tests are ran in the [L0_backend_trtllm](./L0_backend_trtllm/)
 test. Below are the instructions to run the tests manually.
@@ -89,17 +89,17 @@ Expected outputs
 [INFO] Total Latency: 11099.243 ms
 ```
 
-### Identity test
+### benchmark_core_model
 
-[Identity test script](../tools/inflight_batcher_llm/identity_test.py)
-sends requests directly to the deployed `tensorrt_llm` model, the identity test
+[benchmark_core_model script](../tools/inflight_batcher_llm/benchmark_core_model.py)
+sends requests directly to the deployed `tensorrt_llm` model, the benchmark_core_model
 latency indicates the inference latency of TensorRT-LLM, not including the
 pre/post-processing latency which is usually handled by a third-party library
 such as HuggingFace.
 
 ```bash
 cd tools/inflight_batcher_llm
-python3 identity_test.py dataset --dataset <dataset path>
+python3 benchmark_core_model.py dataset --dataset <dataset path>
 ```
 
 Expected outputs

dockerfile/Dockerfile.trt_llm_backend

Lines changed: 10 additions & 2 deletions
@@ -15,8 +15,16 @@ RUN pip uninstall -y tensorrt
 
 FROM base as dev
 
-ENV SHINIT_FILE=${BASH_ENV}
-
+ARG TRT_VER="9.1.0.4"
+ENV TRT_VER=$TRT_VER
+ARG CUDA_VER="12.2"
+ENV CUDA_VER=$CUDA_VER
+ARG CUDNN_VER="8.9.4.25-1+cuda12.2"
+ENV CUDNN_VER=$CUDNN_VER
+ARG NCCL_VER="2.18.3-1+cuda12.2"
+ENV NCCL_VER=$NCCL_VER
+ARG CUBLAS_VER="12.2.5.6-1"
+ENV CUBLAS_VER=$CUBLAS_VER
 # Download & install internal TRT release
 COPY tensorrt_llm/docker/common/install_tensorrt.sh /tmp/
 RUN bash /tmp/install_tensorrt.sh && rm /tmp/install_tensorrt.sh
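Splitting each pinned version into an `ARG`/`ENV` pair keeps the defaults visible in the Dockerfile while allowing overrides at build time and exposing the values to `install_tensorrt.sh` through the environment. A hypothetical override, not a command taken from this repo, would look like `docker build --build-arg TRT_VER=9.1.0.4 -f dockerfile/Dockerfile.trt_llm_backend .`.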

inflight_batcher_llm/README.md

Lines changed: 6 additions & 6 deletions
@@ -132,7 +132,7 @@ You will find that the generation process is stopped early and therefore the num
 
 You can have a look at the client code to see how early stopping is achieved.
 
-## Run the e2e/identity test to benchmark
+## Run the e2e/benchmark_core_model to benchmark
 
 ### End to end test
 End to end test script sends requests to deployed ensemble model.
End to end test script sends requests to deployed ensemble model.
@@ -156,11 +156,11 @@ Expected outputs
 [INFO] Total Latency: 11099.243 ms
 ```
 
-### Identity test
+### benchmark core model
 
-Identity test script sends requests directly to deployed tensorrt_llm model, the identity test latency indicates the inference latency of TensorRT-LLM, not including the pre/post-processing latency which is usually handled by a third-party library such as HuggingFace.
+benchmark_core_model script sends requests directly to deployed tensorrt_llm model, the benchmark core model latency indicates the inference latency of TensorRT-LLM, not including the pre/post-processing latency which is usually handled by a third-party library such as HuggingFace.
 
-Identity test can generate traffic from 2 sources.
+benchmark_core_model can generate traffic from 2 sources.
 1 - dataset (json file containning prompts and optional responses)
 2 - token normal distribution (user specified input, output seqlen)
 

@@ -171,11 +171,11 @@ cd tools/inflight_batcher_llm
 ```
 Example: Run dataset with 10 req/sec requested rate with provided tokenizer.
 ```
-python3 identity_test.py -i grpc --request_rate 10 dataset --dataset <dataset path> --tokenizer_dir <> --tokenizer_type <>
+python3 benchmark_core_model.py -i grpc --request_rate 10 dataset --dataset <dataset path> --tokenizer_dir <> --tokenizer_type <> --num_requests 5000
 ```
 Example: Generate I/O seqlen tokens with input normal distribution with mean_seqlen=128, stdev=10. Output normal distribution with mean_seqlen=20, stdev=2. Set stdev=0 to get constant seqlens.
 ```
-python3 identity_test.py -i grpc --request_rate 10 token_norm_dist --input_mean 128 --input_stdev 5 --output_mean 20 --output_stdev 2 --num_requests 5000
+python3 benchmark_core_model.py -i grpc --request_rate 10 token_norm_dist --input_mean 128 --input_stdev 5 --output_mean 20 --output_stdev 2 --num_requests 5000
 ```
 Expected outputs
 ```

inflight_batcher_llm/client/end_to_end_grpc_client.py

Lines changed: 0 additions & 2 deletions
@@ -47,8 +47,6 @@ def test(triton_client, prompt, request_id, repetition_penalty,
     input0 = [[prompt]]
     input0_data = np.array(input0).astype(object)
    output0_len = np.ones_like(input0).astype(np.uint32) * FLAGS.output_len
-    bad_words_list = np.array([bad_words], dtype=object)
-    stop_words_list = np.array([stop_words], dtype=object)
     streaming = [[FLAGS.streaming]]
     streaming_data = np.array(streaming, dtype=bool)
     beam_width = [[FLAGS.beam_width]]
