diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
new file mode 100644
index 00000000..21518a54
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -0,0 +1,117 @@
+name: "Bug Report"
+description: Submit a bug report to help us improve the TensorRT-LLM backend
+labels: [ "bug" ]
+body:
+  - type: textarea
+    id: system-info
+    attributes:
+      label: System Info
+      description: Please share your system info with us.
+      placeholder: |
+        - CPU architecture (e.g., x86_64, aarch64)
+        - CPU/Host memory size (if known)
+        - GPU properties
+          - GPU name (e.g., NVIDIA H100, NVIDIA A100, NVIDIA L40S)
+          - GPU memory size (if known)
+          - Clock frequencies used (if applicable)
+        - Libraries
+          - TensorRT-LLM branch or tag (e.g., main, v0.7.1)
+          - TensorRT-LLM commit (if known)
+          - Versions of TensorRT, AMMO, CUDA, cuBLAS, etc. used
+          - Container used (if running TensorRT-LLM in a container)
+          - NVIDIA driver version
+        - OS (Ubuntu 22.04, CentOS 7, Windows 10)
+        - Docker image version
+        - Any other information that may be useful in reproducing the bug
+    validations:
+      required: true
+
+  - type: textarea
+    id: who-can-help
+    attributes:
+      label: Who can help?
+      description: |
+        To expedite the response to your issue, it would be helpful if you could identify the appropriate person
+        to tag using the **@** symbol. Here is a general guideline on **whom to tag**.
+
+        Rest assured that all issues are reviewed by the core maintainers. If you are unsure about whom to tag,
+        you can leave it blank, and a core maintainer will make sure to involve the appropriate person.
+
+        Please tag fewer than 3 people.
+
+        Quantization: @Tracin
+
+        Documentation: @juney-nvidia
+
+        Feature request: @ncomly-nvidia
+
+        Performance: @kaiyux
+
+        Others: @byshiue @schetlur-nv
+
+      placeholder: "@Username ..."
+
+  - type: checkboxes
+    id: information-scripts-examples
+    attributes:
+      label: Information
+      description: 'The problem arises when using:'
+      options:
+        - label: "The official example scripts"
+        - label: "My own modified scripts"
+
+  - type: checkboxes
+    id: information-tasks
+    attributes:
+      label: Tasks
+      description: "The tasks I am working on are:"
+      options:
+        - label: "An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)"
+        - label: "My own task or dataset (give details below)"
+
+  - type: textarea
+    id: reproduction
+    validations:
+      required: true
+    attributes:
+      label: Reproduction
+      description: |
+        Kindly share a code example that demonstrates the issue you encountered. It is recommended to provide a code snippet directly.
+        Additionally, if you have any error messages or stack traces related to the problem, please include them here.
+
+        Remember to use code tags to properly format your code. You can refer to the
+        link https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting for guidance on code formatting.
+
+        Please refrain from using screenshots, as they can be difficult to read and prevent others from copying and pasting your code.
+        It would be most helpful if we could reproduce your issue by simply copying and pasting your scripts and code.
+
+      placeholder: |
+        Steps to reproduce the behavior:
+
+        1.
+        2.
+        3.
+
+  - type: textarea
+    id: expected-behavior
+    validations:
+      required: true
+    attributes:
+      label: Expected behavior
+      description: "Provide a brief summary of the expected behavior of the software. Provide output files or examples if possible."
+
+  - type: textarea
+    id: actual-behavior
+    validations:
+      required: true
+    attributes:
+      label: Actual behavior
+      description: "Describe the actual behavior of the software and how it deviates from the expected behavior. Provide output files or examples if possible."
+
+  - type: textarea
+    id: additional-notes
+    validations:
+      required: true
+    attributes:
+      label: Additional notes
+      description: "Provide any additional context here you think might be useful for the TensorRT-LLM team to help debug this issue (such as experiments done, potential things to investigate)."
diff --git a/README.md b/README.md
index 799d60cb..74154082 100644
--- a/README.md
+++ b/README.md
@@ -219,7 +219,6 @@ The following table shows the fields that may to be modified before deployment:
 | `max_tokens_in_paged_kv_cache` | Optional (default=unspecified). The maximum size of the KV cache in number of tokens. If unspecified, value is interpreted as 'infinite'. KV cache allocation is the min of max_tokens_in_paged_kv_cache and value derived from kv_cache_free_gpu_mem_fraction below. |
 | `max_attention_window_size` | Optional (default=max_sequence_length). When using techniques like sliding window attention, the maximum number of tokens that are attended to generate one token. Defaults attends to all tokens in sequence. |
 | `kv_cache_free_gpu_mem_fraction` | Optional (default=0.9). Set to a number between 0 and 1 to indicate the maximum fraction of GPU memory (after loading the model) that may be used for KV cache.|
-| `max_num_sequences` | Optional (default=`max_batch_size` if `enable_trt_overlap` is `false` and to `2 * max_batch_size` if `enable_trt_overlap` is `true`, where `max_batch_size` is the TRT engine maximum batch size). Maximum number of sequences that the in-flight batching scheme can maintain state for. |
 | `enable_trt_overlap` | Optional (default=`false`). Set to `true` to partition available requests into 2 'microbatches' that can be run concurrently to hide exposed CPU runtime |
 | `exclude_input_in_output` | Optional (default=`false`). Set to `true` to only return completion tokens in a response. Set to `false` to return the prompt tokens concatenated with the generated tokens |
 | `normalize_log_probs` | Optional (default=`true`).
Set to `false` to skip normalization of `output_log_probs` | diff --git a/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt b/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt index fb615c3c..efd15018 100644 --- a/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt +++ b/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt @@ -332,12 +332,6 @@ parameters: { string_value: "${kv_cache_free_gpu_mem_fraction}" } } -parameters: { - key: "max_num_sequences" - value: { - string_value: "${max_num_sequences}" - } -} parameters: { key: "enable_trt_overlap" value: { diff --git a/ci/L0_backend_trtllm/custom_metrics_verification_tests.py b/ci/L0_backend_trtllm/custom_metrics_verification_tests.py index c10fe126..18c9ca98 100644 --- a/ci/L0_backend_trtllm/custom_metrics_verification_tests.py +++ b/ci/L0_backend_trtllm/custom_metrics_verification_tests.py @@ -46,6 +46,8 @@ "inflight_batcher_specific_metric=micro_batch_id": "MicroBatch ID", "inflight_batcher_specific_metric=generation_requests": "Generation Requests", + "inflight_batcher_specific_metric=terminated_requests": + "Terminated Requests", "v1_specific_metric=total_context_tokens": "Total Context Tokens", "v1_specific_metric=total_generation_tokens": "Total Generation Tokens", "v1_specific_metric=empty_generation_slots": "Empty Generation Slots", diff --git a/ci/L0_backend_trtllm/test.sh b/ci/L0_backend_trtllm/test.sh index ed01f532..810be232 100644 --- a/ci/L0_backend_trtllm/test.sh +++ b/ci/L0_backend_trtllm/test.sh @@ -144,185 +144,7 @@ python3 -m pip install --upgrade pip && \ RET=0 -reset_model_repo - -### 1-GPU TRT engine -SERVER_ARGS="--model_repo=${MODEL_DIR}" - -# inflight batching OFF (V1) -# streaming OFF -SERVER_LOG="./1gpu_v1_no_streaming_server.log" -cp -r /opt/tritonserver/tensorrtllm_backend/all_models/inflight_batcher_llm/* ${MODEL_DIR} -rm -rf ${MODEL_DIR}/tensorrt_llm_bls -replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/ensemble/config.pbtxt" -replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/preprocessing/config.pbtxt" -replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_DIR}/preprocessing/config.pbtxt" -replace_config_tags '${tokenizer_type}' 'auto' "${MODEL_DIR}/preprocessing/config.pbtxt" -replace_config_tags '${preprocessing_instance_count}' '1' "${MODEL_DIR}/preprocessing/config.pbtxt" -replace_config_tags '${decoupled_mode}' 'False' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" -replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/tensorrt_llm/config.pbtxt" -replace_config_tags '${batching_strategy}' 'V1' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" -replace_config_tags '${engine_dir}' "${MODEL_DIR}/tensorrt_llm/1/inflight_1_gpu/" "${MODEL_DIR}/tensorrt_llm/config.pbtxt" -replace_config_tags '${max_queue_delay_microseconds}' "1000000" "${MODEL_DIR}/tensorrt_llm/config.pbtxt" -replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/postprocessing/config.pbtxt" -replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_DIR}/postprocessing/config.pbtxt" -replace_config_tags '${tokenizer_type}' 'auto' "${MODEL_DIR}/postprocessing/config.pbtxt" -replace_config_tags '${postprocessing_instance_count}' '1' "${MODEL_DIR}/postprocessing/config.pbtxt" -# Copy the engine and place it into the model folder -cp -r ${BASE_DIR}/engines/inflight_1_gpu/ triton_model_repo/tensorrt_llm/1 - -run_server "${SERVER_ARGS}" -wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]} -if [ "$WAIT_RET" != "0" ]; then - # Cleanup - kill 
$SERVER_PID > /dev/null 2>&1 || true - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 -fi - -set -e -python3 ${TOOLS_DIR}/inflight_batcher_llm/benchmark_core_model.py \ - --max-input-len=500 \ - dataset --dataset=${DATASET} \ - --tokenizer-dir=${TOKENIZER_DIR} - -if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** Error executing inflight batching benchmark_core_model: line ${LINENO}\n***" - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - RET=1 -fi -set +e - -set -e -python3 ${TOOLS_DIR}/inflight_batcher_llm/end_to_end_test.py \ - --max-input-len=500 \ - --dataset=${DATASET} - -if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** Error executing v1 end-to-end test: line ${LINENO}\n***" - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - RET=1 -fi -set +e - -curl localhost:8002/metrics -o 1gpu_v1_no_stream_metrics.out - -kill_server -wait_for_server_terminated ${SERVER_PID[@]} - -# inflight batching ON -# streaming OFF -SERVER_LOG="./1gpu_IFB_no_streaming_server.log" -replace_config_tags 'V1' 'inflight_fused_batching' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - -run_server "${SERVER_ARGS}" -wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]} -if [ "$WAIT_RET" != "0" ]; then - # Cleanup - kill $SERVER_PID > /dev/null 2>&1 || true - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 -fi - -set -e -python3 ${TOOLS_DIR}/inflight_batcher_llm/benchmark_core_model.py \ - --max-input-len=500 \ - dataset --dataset=${DATASET} \ - --tokenizer-dir=${TOKENIZER_DIR} - -if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** Error executing inflight batching benchmark_core_model: line ${LINENO}\n***" - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - RET=1 -fi -set +e - -set -e -python3 ${TOOLS_DIR}/inflight_batcher_llm/end_to_end_test.py \ - --max-input-len=500 \ - --dataset=${DATASET} - -if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** Error executing inflight batching end-to-end test: line ${LINENO}\n***" - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - RET=1 -fi -set +e - -curl localhost:8002/metrics -o 1gpu_IFB_no_stream_metrics.out - -kill_server -wait_for_server_terminated ${SERVER_PID[@]} - -# Start a clean server to verify base metrics are being -# reported correctly -SERVER_LOG="./1gpu_IFB_no_streaming_base_metrics.log" -run_server "${SERVER_ARGS}" -wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]} -if [ "$WAIT_RET" != "0" ]; then - # Cleanup - kill $SERVER_PID > /dev/null 2>&1 || true - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 -fi -set -e - -python3 ${BASE_METRICS_VERIFICATION_TEST} >> ${BASE_METRICS_VERIFICATION_LOG} 2>&1 -if [ $? -ne 0 ]; then - cat ${BASE_METRICS_VERIFICATION_LOG} - RET=1 -fi -set +e - -kill_server -wait_for_server_terminated ${SERVER_PID[@]} - -# inflight batching ON -# streaming ON -SERVER_LOG="./1gpu_IFB_streaming_server.log" -replace_config_tags 'decoupled: False' 'decoupled: True' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - -run_server "${SERVER_ARGS}" -wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]} -if [ "$WAIT_RET" != "0" ]; then - # Cleanup - kill $SERVER_PID > /dev/null 2>&1 || true - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 -fi - -set -e -python3 ${STREAM_DIR}/end_to_end_grpc_client.py \ - --prompt="My name is" - -if [ $? 
-ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** Error executing inflight batching end-to-end streaming test: line ${LINENO}\n***" - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - RET=1 -fi -set +e - -curl localhost:8002/metrics -o 1gpu_IFB_stream_metrics.out - -kill_server -wait_for_server_terminated ${SERVER_PID[@]} - -### Multi GPU TRT engine -NUM_GPUS_TO_TEST=("2" "4") +NUM_GPUS_TO_TEST=("1" "2" "4") for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do AVAILABLE_GPUS=$(nvidia-smi -L | wc -l) if [ "$AVAILABLE_GPUS" -lt "$NUM_GPU" ]; then @@ -331,10 +153,6 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do SERVER_ARGS="--world_size=${NUM_GPU} --model_repo=${MODEL_DIR}" - # inflight batching OFF (V1) - # streaming OFF - SERVER_LOG="./${NUM_GPU}gpu_v1_no_streaming_server.log" - reset_model_repo cp -r /opt/tritonserver/tensorrtllm_backend/all_models/inflight_batcher_llm/* ${MODEL_DIR} @@ -346,9 +164,9 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do replace_config_tags '${preprocessing_instance_count}' '1' "${MODEL_DIR}/preprocessing/config.pbtxt" replace_config_tags '${decoupled_mode}' 'False' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - replace_config_tags '${batching_strategy}' 'V1' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" + replace_config_tags '${batching_strategy}' 'INVALID' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" replace_config_tags '${engine_dir}' "${MODEL_DIR}/tensorrt_llm/1/inflight_${NUM_GPU}_gpu/" "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - replace_config_tags '${max_queue_delay_microseconds}' "0" "${MODEL_DIR}/tensorrt_llm/config.pbtxt" + replace_config_tags '${max_queue_delay_microseconds}' "50000" "${MODEL_DIR}/tensorrt_llm/config.pbtxt" replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/postprocessing/config.pbtxt" replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_DIR}/postprocessing/config.pbtxt" replace_config_tags '${tokenizer_type}' 'auto' "${MODEL_DIR}/postprocessing/config.pbtxt" @@ -357,6 +175,24 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do # Copy the engine and place it into the model folder cp -r ${BASE_DIR}/engines/inflight_${NUM_GPU}_gpu/ triton_model_repo/tensorrt_llm/1 + # Invalid GPT model Type + SERVER_LOG="./${NUM_GPU}gpu_invalid_batch_strat.log" + + run_server "${SERVER_ARGS}" + wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]} + + # Expect invalid GPT model type error to be gracefully handled + if [ `grep -c "Invalid gpt_model_type" $SERVER_LOG` == "0" ]; then + echo -e "\n***\n*** GPT model type error not handled gracefully: line ${LINENO}\n***" + cat $SERVER_LOG + exit 1 + fi + + # inflight batching OFF (V1) + # streaming OFF + SERVER_LOG="./${NUM_GPU}gpu_v1_no_streaming_server.log" + replace_config_tags 'INVALID' 'V1' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" + run_server "${SERVER_ARGS}" wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]} if [ "$WAIT_RET" != "0" ]; then @@ -375,7 +211,7 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do if [ $? -ne 0 ]; then cat $SERVER_LOG - echo -e "\n***\n*** Error executing v1 benchmark_core_model test with ${NUM_GPU}GPUs: line ${LINENO}\n***" + echo -e "\n***\n*** Error executing v1 benchmark_core_model test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" kill_server wait_for_server_terminated ${SERVER_PID[@]} RET=1 @@ -389,7 +225,7 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do if [ $? 
-ne 0 ]; then cat $SERVER_LOG - echo -e "\n***\n*** Error executing v1 end-to-end test with ${NUM_GPU}GPUs: line ${LINENO}\n***" + echo -e "\n***\n*** Error executing v1 end-to-end test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" kill_server wait_for_server_terminated ${SERVER_PID[@]} RET=1 @@ -423,7 +259,7 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do if [ $? -ne 0 ]; then cat $SERVER_LOG - echo -e "\n***\n*** Error executing inflight batching benchmark_core_model test with ${NUM_GPU}GPUs: line ${LINENO}\n***" + echo -e "\n***\n*** Error executing inflight batching benchmark_core_model test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" kill_server wait_for_server_terminated ${SERVER_PID[@]} RET=1 @@ -437,7 +273,7 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do if [ $? -ne 0 ]; then cat $SERVER_LOG - echo -e "\n***\n*** Error executing inflight batching end-to-end test with ${NUM_GPU}GPUs: line ${LINENO}\n***" + echo -e "\n***\n*** Error executing inflight batching end-to-end test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" kill_server wait_for_server_terminated ${SERVER_PID[@]} RET=1 @@ -493,7 +329,7 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do if [ $? -ne 0 ]; then cat $SERVER_LOG - echo -e "\n***\n*** Error executing inflight batching end-to-end streaming test with ${NUM_GPU}GPUs: line ${LINENO}\n***" + echo -e "\n***\n*** Error executing inflight batching end-to-end streaming test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" kill_server wait_for_server_terminated ${SERVER_PID[@]} RET=1 diff --git a/dockerfile/Dockerfile.triton.trt_llm_backend b/dockerfile/Dockerfile.triton.trt_llm_backend index 208ee2c5..5aff5021 100644 --- a/dockerfile/Dockerfile.triton.trt_llm_backend +++ b/dockerfile/Dockerfile.triton.trt_llm_backend @@ -2,7 +2,7 @@ ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:23.11-py3-min FROM ${BASE_IMAGE} as base -RUN apt-get update -q=2 && apt-get install -y --no-install-recommends python3-pip +RUN apt-get update -q=2 && apt-get install -y --no-install-recommends python3-pip ccache git-lfs # Remove previous TRT installation # We didn't remove libnvinfer* here because tritonserver depends on the pre-installed libraries. 
 RUN apt-get remove -y tensorrt*
diff --git a/dockerfile/Dockerfile.trt_llm_backend b/dockerfile/Dockerfile.trt_llm_backend
index 0fb2d027..c06e7968 100644
--- a/dockerfile/Dockerfile.trt_llm_backend
+++ b/dockerfile/Dockerfile.trt_llm_backend
@@ -3,7 +3,7 @@ ARG BASE_TAG=23.12-py3
 
 FROM ${BASE_IMAGE}:${BASE_TAG} as base
 
-RUN apt-get update && apt-get install -y --no-install-recommends rapidjson-dev python-is-python3
+RUN apt-get update && apt-get install -y --no-install-recommends rapidjson-dev python-is-python3 ccache git-lfs
 
 COPY requirements.txt /tmp/
 RUN pip3 install -r /tmp/requirements.txt --extra-index-url https://pypi.ngc.nvidia.com
diff --git a/inflight_batcher_llm/CMakeLists.txt b/inflight_batcher_llm/CMakeLists.txt
index f7f08613..25564502 100644
--- a/inflight_batcher_llm/CMakeLists.txt
+++ b/inflight_batcher_llm/CMakeLists.txt
@@ -260,7 +260,6 @@ set_property(
 )
 
 if(TRITON_ENABLE_METRICS)
-  message("Compiling statistics reporter...")
   list(APPEND REPORTER_SRCS
        src/custom_metrics_reporter/custom_metrics_reporter.cc)
   list(APPEND REPORTER_HDRS
diff --git a/inflight_batcher_llm/README.md b/inflight_batcher_llm/README.md
index fe040d7a..0c418093 100644
--- a/inflight_batcher_llm/README.md
+++ b/inflight_batcher_llm/README.md
@@ -151,6 +151,111 @@ You will find that the generation process is stopped early and therefore the num
 You can have a look at the client code to see how early stopping is achieved.
 
+## Running LoRA inference with inflight batching
+
+Build a model with LoRA enabled
+
+```
+BASE_MODEL=llama-7b-hf
+
+python3 tensorrt_llm/examples/llama/build.py --model_dir ${BASE_MODEL} \
+                --dtype float16 \
+                --remove_input_padding \
+                --use_gpt_attention_plugin float16 \
+                --enable_context_fmha \
+                --use_gemm_plugin float16 \
+                --output_dir "/tmp/llama_7b_with_lora_qkv/trt_engines/fp16/1-gpu/" \
+                --max_batch_size 128 \
+                --max_input_len 512 \
+                --max_output_len 50 \
+                --use_lora_plugin float16 \
+                --lora_target_modules "attn_q" "attn_k" "attn_v" \
+                --use_inflight_batching \
+                --paged_kv_cache \
+                --max_lora_rank 8 \
+                --world_size 1 --tp_size 1
+```
+
+Create a Triton model repository and launch the Triton server as described above.
+
+Now generate LoRA tensors that will be passed in with each request to Triton.
+
+```
+git-lfs clone https://huggingface.co/qychen/luotuo-lora-7b-0.1
+git-lfs clone https://huggingface.co/kunishou/Japanese-Alpaca-LoRA-7b-v0
+
+python3 tensorrt_llm/examples/hf_lora_convert.py -i Japanese-Alpaca-LoRA-7b-v0 -o Japanese-Alpaca-LoRA-7b-v0-weights --storage-type float16
+python3 tensorrt_llm/examples/hf_lora_convert.py -i luotuo-lora-7b-0.1 -o luotuo-lora-7b-0.1-weights --storage-type float16
+```
+
+Launch tritonserver as described above.
+
+Run the Multi-LoRA example by issuing multiple concurrent requests.
+The inflight batcher will execute mixed batches with multiple LoRAs in the same batch.
+
+```
+INPUT_TEXT=("美国的首都在哪里? \n答案:" "美国的首都在哪里? \n答案:" "美国的首都在哪里? \n答案:" "アメリカ合衆国の首都はどこですか? \n答え:" "アメリカ合衆国の首都はどこですか? \n答え:" "アメリカ合衆国の首都はどこですか?
\n答え:") +LORA_PATHS=("" "luotuo-lora-7b-0.1-weights" "Japanese-Alpaca-LoRA-7b-v0-weights" "" "luotuo-lora-7b-0.1-weights" "Japanese-Alpaca-LoRA-7b-v0-weights") + +for index in ${!INPUT_TEXT[@]}; do + text=${INPUT_TEXT[$index]} + lora_path=${LORA_PATHS[$index]} + lora_arg="" + if [ "${lora_path}" != "" ]; then + lora_arg="--lora-path ${lora_path}" + fi + + python3 inflight_batcher_llm/client/inflight_batcher_llm_client.py \ + --top-k 0 \ + --top-p 0.5 \ + --request-output-len 10 \ + --text "${text}" \ + --tokenizer-dir /home/scratch.trt_llm_data/llm-models/llama-models/llama-7b-hf \ + ${lora_arg} & +done + +wait +``` + +Example Output: +``` +Input sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901] +Input sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901] +Input sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901] +Input sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901] +Input sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901] +Input sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901] +Got completed request +Input: アメリカ合衆国の首都はどこですか? \n答え: +Output beam 0: ワシントン D.C. +Output sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901, 29871, 31028, 30373, 30203, 30279, 30203, 360, 29889, 29907, 29889] +Got completed request +Input: 美国的首都在哪里? \n答案: +Output beam 0: Washington, D.C. +What is the +Output sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901, 7660, 29892, 360, 29889, 29907, 29889, 13, 5618, 338, 278] +Got completed request +Input: 美国的首都在哪里? \n答案: +Output beam 0: Washington D.C. +Washington D. +Output sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901, 7660, 360, 29889, 29907, 29889, 13, 29956, 7321, 360, 29889] +Got completed request +Input: アメリカ合衆国の首都はどこですか? \n答え: +Output beam 0: Washington, D.C. +Which of +Output sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901, 7660, 29892, 360, 29889, 29907, 29889, 13, 8809, 436, 310] +Got completed request +Input: アメリカ合衆国の首都はどこですか? \n答え: +Output beam 0: Washington D.C. +1. ア +Output sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901, 7660, 360, 29889, 29907, 29889, 13, 29896, 29889, 29871, 30310] +Got completed request +Input: 美国的首都在哪里? 
\n答案: +Output beam 0: 华盛顿 +W +Output sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901, 29871, 31266, 234, 158, 158, 236, 164, 194, 13, 29956] +``` + ## Run the e2e/benchmark_core_model to benchmark ### End to end test diff --git a/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc b/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc index 851f5851..764c1ec7 100644 --- a/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc +++ b/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc @@ -52,9 +52,9 @@ const std::vector CustomMetricsReporter::v1_specific_labels_{ "total_context_tokens", "total_generation_tokens", "empty_generation_slots"}; const std::vector CustomMetricsReporter::IFB_specific_keys_{ - "Total Context Tokens", "Generation Requests", "MicroBatch ID"}; + "Total Context Tokens", "Generation Requests", "MicroBatch ID", "Terminated Requests"}; const std::vector CustomMetricsReporter::IFB_specific_labels_{ - "total_context_tokens", "generation_requests", "micro_batch_id"}; + "total_context_tokens", "generation_requests", "micro_batch_id", "terminated_requests"}; const std::vector CustomMetricsReporter::general_metric_keys_{"Timestamp", "Iteration Counter"}; const std::vector CustomMetricsReporter::general_metric_labels_{"timestamp", "iteration_counter"}; @@ -125,7 +125,7 @@ const std::vector& TritonMetricGroup::JsonKeys() const return json_keys_; } -TRITONSERVER_Error* CustomMetricsReporter::InitReporter( +TRITONSERVER_Error* CustomMetricsReporter::InitializeReporter( const std::string& model_name, const uint64_t version, const bool is_v1_model) { /* REQUEST METRIC GROUP */ @@ -194,7 +194,11 @@ TRITONSERVER_Error* CustomMetricsReporter::UpdateCustomMetrics(const std::string { triton::common::TritonJson::Value value_json; uint64_t value; - metrics.Find(key.c_str(), &value_json); + if (!metrics.Find(key.c_str(), &value_json)) + { + std::string errStr = std::string("Failed to find " + key + " in metrics."); + return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, errStr.c_str()); + } if (key == "Timestamp") { std::string timestamp; diff --git a/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.h b/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.h index cef12b5d..032e14ce 100644 --- a/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.h +++ b/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.h @@ -140,7 +140,7 @@ class CustomMetricsReporter /// \param is_v1_model Whether the model type is v1 or an inflight /// batching model. /// \return a TRITONSERVER_Error indicating success or failure. - TRITONSERVER_Error* InitReporter(const std::string& model, const uint64_t version, const bool is_v1_model); + TRITONSERVER_Error* InitializeReporter(const std::string& model, const uint64_t version, const bool is_v1_model); /// Updates the vector of TritonMetricGroup objects with a /// JSON-formatted statistics string. diff --git a/inflight_batcher_llm/src/libtensorrtllm.cc b/inflight_batcher_llm/src/libtensorrtllm.cc index e723cf75..c4233d87 100644 --- a/inflight_batcher_llm/src/libtensorrtllm.cc +++ b/inflight_batcher_llm/src/libtensorrtllm.cc @@ -65,22 +65,17 @@ extern "C" // TRITONBACKEND_Model. 
If anything goes wrong with initialization // of the model state then an error is returned and Triton will fail // to load the model. - ModelState* model_state; - RETURN_IF_ERROR(ModelState::Create(model, &model_state)); - RETURN_IF_ERROR(TRITONBACKEND_ModelSetState(model, reinterpret_cast(model_state))); - -#ifdef TRITON_ENABLE_METRICS const char* cname; RETURN_IF_ERROR(TRITONBACKEND_ModelName(model, &cname)); - std::string name(cname); + const std::string name(cname); uint64_t version; RETURN_IF_ERROR(TRITONBACKEND_ModelVersion(model, &version)); - bool is_v1_model = ((model_state->GetParameter("gpt_model_type") == "V1") - || (model_state->GetParameter("gpt_model_type") == "v1")); - LOG_IF_ERROR(model_state->InitCustomMetricsReporter(name, version, is_v1_model), "Failed initializing metrics"); -#endif // TRITON_ENABLE_METRICS + ModelState* model_state; + RETURN_IF_ERROR(ModelState::Create(model, name, version, &model_state)); + RETURN_IF_ERROR(TRITONBACKEND_ModelSetState(model, reinterpret_cast(model_state))); + return nullptr; // success } diff --git a/inflight_batcher_llm/src/model_instance_state.cc b/inflight_batcher_llm/src/model_instance_state.cc index 0aa4b420..0f2ab084 100644 --- a/inflight_batcher_llm/src/model_instance_state.cc +++ b/inflight_batcher_llm/src/model_instance_state.cc @@ -77,6 +77,12 @@ ModelInstanceState::ModelInstanceState(ModelState* model_state, TRITONBACKEND_Mo "v1/inflight_batching/inflight_fused_batching."); } +#ifdef TRITON_ENABLE_METRICS + custom_metrics_reporter_ = std::make_unique(); + custom_metrics_reporter_->InitializeReporter( + model_state->GetModelName(), model_state->GetModelVersion(), (mTrtGptModelType == TrtGptModelType::V1)); +#endif + // Check if model is in decoupled mode: triton::common::TritonJson::Value transaction_policy; model_state_->GetModelConfig().MemberAsObject("model_transaction_policy", &transaction_policy); @@ -163,17 +169,6 @@ ModelInstanceState::ModelInstanceState(ModelState* model_state, TRITONBACKEND_Mo "max_tokens_in_paged_kv_cache"); } - std::optional maxNumSequences = std::nullopt; - try - { - maxNumSequences = model_state_->GetParameter("max_num_sequences"); - } - catch (const std::exception& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("max_num_sequences is not specified, will be set to the TRT engine max_batch_size"); - } - bool enableTrtOverlap = false; try { @@ -232,7 +227,6 @@ ModelInstanceState::ModelInstanceState(ModelState* model_state, TRITONBACKEND_Mo } TrtGptModelOptionalParams optionalParams; - optionalParams.maxNumSequences = maxNumSequences; optionalParams.kvCacheConfig.maxTokens = maxTokensInPagedKvCache; optionalParams.kvCacheConfig.freeGpuMemoryFraction = kvCacheFreeGpuMemFraction; optionalParams.kvCacheConfig.maxAttentionWindow = maxAttentionWindow; @@ -479,7 +473,7 @@ void ModelInstanceState::logStats(const std::string& s) { LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, s.c_str()); #ifdef TRITON_ENABLE_METRICS - LOG_IF_ERROR(model_state_->UpdateCustomMetrics(s), "Failed updating TRT LLM statistics"); + LOG_IF_ERROR(custom_metrics_reporter_->UpdateCustomMetrics(s), "Failed updating TRT LLM statistics"); #endif } diff --git a/inflight_batcher_llm/src/model_instance_state.h b/inflight_batcher_llm/src/model_instance_state.h index 42495829..d23cf75f 100644 --- a/inflight_batcher_llm/src/model_instance_state.h +++ b/inflight_batcher_llm/src/model_instance_state.h @@ -36,16 +36,20 @@ #include "tensorrt_llm/batch_manager/BatchManager.h" #include "tensorrt_llm/batch_manager/GptManager.h" 
-#include "tensorrt_llm/batch_manager/batchScheduler.h" #include "tensorrt_llm/batch_manager/callbacks.h" #include "tensorrt_llm/batch_manager/kvCacheConfig.h" #include "tensorrt_llm/batch_manager/namedTensor.h" +#include "tensorrt_llm/batch_manager/schedulerPolicy.h" #include "tensorrt_llm/batch_manager/trtGptModelOptionalParams.h" #include "model_state.h" #include "work_item.h" #include "work_items_queue.h" +#ifdef TRITON_ENABLE_METRICS +#include "custom_metrics_reporter/custom_metrics_reporter.h" +#endif + using namespace tensorrt_llm::batch_manager; using namespace tensorrt_llm::batch_manager::batch_scheduler; @@ -132,6 +136,9 @@ class ModelInstanceState std::unique_ptr mWorkItemsQueue; std::unordered_map mRequestIdStrMap; +#ifdef TRITON_ENABLE_METRICS + std::unique_ptr custom_metrics_reporter_; +#endif }; } // namespace triton::backend::inflight_batcher_llm diff --git a/inflight_batcher_llm/src/model_state.cc b/inflight_batcher_llm/src/model_state.cc index 0f02eaa8..2eadbde1 100644 --- a/inflight_batcher_llm/src/model_state.cc +++ b/inflight_batcher_llm/src/model_state.cc @@ -29,11 +29,11 @@ namespace triton::backend::inflight_batcher_llm { -TRITONSERVER_Error* ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) +TRITONSERVER_Error* ModelState::Create( + TRITONBACKEND_Model* triton_model, const std::string& name, const uint64_t version, ModelState** state) { TRITONSERVER_Message* config_message; RETURN_IF_ERROR(TRITONBACKEND_ModelConfig(triton_model, 1 /* config_version */, &config_message)); - // We can get the model configuration as a json string from // config_message, parse it with our favorite json parser to create // DOM that we can access when we need to example the @@ -52,7 +52,7 @@ TRITONSERVER_Error* ModelState::Create(TRITONBACKEND_Model* triton_model, ModelS try { - *state = new ModelState(triton_model, std::move(model_config)); + *state = new ModelState(triton_model, name, version, std::move(model_config)); } catch (const std::exception& ex) { @@ -68,6 +68,16 @@ common::TritonJson::Value& ModelState::GetModelConfig() return model_config_; } +const std::string& ModelState::GetModelName() const +{ + return model_name_; +} + +uint64_t ModelState::GetModelVersion() const +{ + return model_version_; +} + template <> std::string ModelState::GetParameter(const std::string& name) { @@ -140,19 +150,4 @@ bool ModelState::GetParameter(const std::string& name) } } -#ifdef TRITON_ENABLE_METRICS -TRITONSERVER_Error* ModelState::InitCustomMetricsReporter( - const std::string& model_name, const uint64_t version, const bool is_v1_model) -{ - RETURN_IF_ERROR(custom_metrics_reporter_->InitReporter(model_name, version, is_v1_model)); - return nullptr; // success -} - -TRITONSERVER_Error* ModelState::UpdateCustomMetrics(const std::string& custom_metrics) -{ - RETURN_IF_ERROR(custom_metrics_reporter_->UpdateCustomMetrics(custom_metrics)); - return nullptr; // success -} -#endif - } // namespace triton::backend::inflight_batcher_llm diff --git a/inflight_batcher_llm/src/model_state.h b/inflight_batcher_llm/src/model_state.h index a4fd8a4a..2e1b59bf 100644 --- a/inflight_batcher_llm/src/model_state.h +++ b/inflight_batcher_llm/src/model_state.h @@ -35,10 +35,6 @@ #include "triton/core/tritonbackend.h" #include "triton/core/tritonserver.h" -#ifdef TRITON_ENABLE_METRICS -#include "custom_metrics_reporter/custom_metrics_reporter.h" -#endif - using namespace ::triton::common; // TritonJson namespace triton::backend::inflight_batcher_llm @@ -53,7 +49,8 @@ namespace 
triton::backend::inflight_batcher_llm class ModelState { public: - static TRITONSERVER_Error* Create(TRITONBACKEND_Model* triton_model, ModelState** state); + static TRITONSERVER_Error* Create( + TRITONBACKEND_Model* triton_model, const std::string& name, const uint64_t version, ModelState** state); template T GetParameter(const std::string& name) @@ -65,28 +62,24 @@ class ModelState virtual ~ModelState() = default; -#ifdef TRITON_ENABLE_METRICS - TRITONSERVER_Error* InitCustomMetricsReporter( - const std::string& model_name, const uint64_t version, const bool is_v1_model); - TRITONSERVER_Error* UpdateCustomMetrics(const std::string& statistics); -#endif common::TritonJson::Value& GetModelConfig(); + const std::string& GetModelName() const; + uint64_t GetModelVersion() const; private: -#ifdef TRITON_ENABLE_METRICS - std::unique_ptr custom_metrics_reporter_; -#endif + const std::string model_name_; + uint64_t model_version_; common::TritonJson::Value model_config_; std::shared_ptr mTrtLogger{}; - ModelState(TRITONBACKEND_Model* triton_model, TritonJson::Value&& model_config) - : model_config_(std::move(model_config)) + ModelState( + TRITONBACKEND_Model* triton_model, const std::string& name, uint64_t version, TritonJson::Value&& model_config) + : model_name_(name) + , model_version_(version) + , model_config_(std::move(model_config)) { mTrtLogger = std::make_shared(); initTrtLlmPlugins(mTrtLogger.get()); -#ifdef TRITON_ENABLE_METRICS - custom_metrics_reporter_ = std::make_unique(); -#endif } }; diff --git a/tensorrt_llm b/tensorrt_llm index c8965302..b57221b7 160000 --- a/tensorrt_llm +++ b/tensorrt_llm @@ -1 +1 @@ -Subproject commit c89653021e66ca78c55f02b366f404455bc12e8d +Subproject commit b57221b764bc579cbb2490154916a871f620e2c4 diff --git a/tools/version.txt b/tools/version.txt index 24b8df30..a5d5936f 100644 --- a/tools/version.txt +++ b/tools/version.txt @@ -1 +1 @@ -ad7d4adac6bebead80be01388b94d1f57a50245a +dcc9252db07dc9e8d4584fd92dbbf743a6c18b4e
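For the `terminated_requests` metric that this patch adds to the inflight-batcher metric group (see `custom_metrics_reporter.cc` and `custom_metrics_verification_tests.py` above), a quick manual check against the metrics endpoint exercised by the CI script can confirm the metric is exported. This is a minimal sketch, not part of the patch: it assumes a running `tritonserver` with an inflight-batching model, the metrics port 8002 used in `ci/L0_backend_trtllm/test.sh`, and that the metric line contains the `terminated_requests` label that the CI test parses.

```
# Minimal sketch (not part of the patch): check that the new
# terminated_requests metric shows up on the Triton metrics endpoint.
# Assumes tritonserver is already serving an inflight-batching model and
# exposes metrics on port 8002, as in ci/L0_backend_trtllm/test.sh.
METRICS_URL="localhost:8002/metrics"

if curl -s "${METRICS_URL}" | grep -q 'terminated_requests'; then
    echo "terminated_requests metric is being reported"
else
    echo "terminated_requests metric not found" >&2
    exit 1
fi
```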