
Commit 75b0964

Update TensorRT-LLM backend (triton-inference-server#480)
* Update TensorRT-LLM backend
1 parent b93b948 · commit 75b0964

12 files changed: 255 additions, 183 deletions

README.md

Lines changed: 1 addition & 0 deletions
@@ -311,6 +311,7 @@ The following table shows the fields that may to be modified before deployment:
 | `lora_cache_max_adapter_size` | Optional (default=64) Used to set the minimum size of a cache page. Pages must be at least large enough to fit a single module, single later adapter_size `maxAdapterSize` row of weights. |
 | `lora_cache_gpu_memory_fraction` | Optional (default=0.05) Fraction of GPU memory used for LoRA cache. Computed as a fraction of left over memory after engine load, and after KV cache is loaded |
 | `lora_cache_host_memory_bytes` | Optional (default=1G) Size of host LoRA cache in bytes |
+| `gpu_weights_percent` | Optional (default=1.0). Set to a number between 0.0 and 1.0 to specify the percentage of weights that reside on GPU instead of CPU and streaming load during runtime. Values less than 1.0 are only supported for an engine built with `weight_streaming` on. |

 *triton_model_repo/postprocessing/config.pbtxt*

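The new `gpu_weights_percent` row documents a value that is templated into the TensorRT-LLM model's `config.pbtxt` (see the matching parameter added in the config.pbtxt diff further down). As a rough illustration of how that placeholder gets filled, the sketch below validates a value and substitutes it with a plain string replacement; the file path and the substitution approach are assumptions made for this example, not the repository's own templating workflow.

```python
# Illustrative sketch only: validate a gpu_weights_percent value and substitute it
# into the "${gpu_weights_percent}" placeholder in a config.pbtxt. The path and the
# plain string replacement are assumptions made for this example.
from pathlib import Path


def set_gpu_weights_percent(config_path: str, value: float) -> None:
    if not 0.0 <= value <= 1.0:
        raise ValueError("gpu_weights_percent must be between 0.0 and 1.0")
    config = Path(config_path)
    config.write_text(
        config.read_text().replace("${gpu_weights_percent}", str(value)))


# Keep half of the weights on GPU; values below 1.0 require an engine built with
# weight_streaming enabled.
set_gpu_weights_percent("triton_model_repo/tensorrt_llm/config.pbtxt", 0.5)
```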
all_models/inflight_batcher_llm/tensorrt_llm/1/model.py

Lines changed: 30 additions & 22 deletions
@@ -134,7 +134,7 @@ def get_output_config_from_request(request, exclude_input_from_output):
     return trtllm.OutputConfig(**kwargs)


-def get_speculative_decoding_config_from_request(request):
+def get_external_draft_tokens_config_from_request(request):
     kwargs = {}
     draft_input_ids = get_input_tensor_by_name(request, 'draft_input_ids')
     if draft_input_ids is not None:
@@ -146,7 +146,7 @@ def get_speculative_decoding_config_from_request(request):
         request, 'draft_acceptance_threshold')
     kwargs = {k: v for k, v in kwargs.items() if v is not None}
     if len(kwargs) > 0:
-        return trtllm.SpeculativeDecodingConfig(**kwargs)
+        return trtllm.ExternalDraftTokensConfig(**kwargs)
     return None


@@ -211,7 +211,7 @@ def convert_request(request, exclude_input_from_output, decoupled):
     sampling_config = get_sampling_config_from_request(request)
     output_config = get_output_config_from_request(request,
                                                    exclude_input_from_output)
-    speculative_decoding_config = get_speculative_decoding_config_from_request(
+    external_draft_tokens_config = get_external_draft_tokens_config_from_request(
         request)
     prompt_tuning_config = get_prompt_tuning_config_from_request(request)
     lora_config = get_lora_config_from_request(request)
@@ -220,7 +220,7 @@ def convert_request(request, exclude_input_from_output, decoupled):
         **inputs,
         sampling_config=sampling_config,
         output_config=output_config,
-        speculative_decoding_config=speculative_decoding_config,
+        external_draft_tokens_config=external_draft_tokens_config,
         prompt_tuning_config=prompt_tuning_config,
         lora_config=lora_config,
     )
@@ -295,18 +295,18 @@ def convert_batching_type(gpt_model_type: str):
 def convert_decoding_mode(decoding_mode: str):
     if decoding_mode is None:
         return None
-    elif decoding_mode == "none":
-        return trtllm.DecodingMode.NONE
+    elif decoding_mode == "auto":
+        return trtllm.DecodingMode.Auto()
     elif decoding_mode == "top_k":
-        return trtllm.DecodingMode.TOP_K
+        return trtllm.DecodingMode.TopK()
     elif decoding_mode == "top_p":
-        return trtllm.DecodingMode.TOP_P
+        return trtllm.DecodingMode.TopP()
     elif decoding_mode == "top_k_top_p":
-        return trtllm.DecodingMode.TOP_K_TOP_P
+        return trtllm.DecodingMode.TopKTopP()
     elif decoding_mode == "beam_search":
-        return trtllm.DecodingMode.BEAM_SEARCH
+        return trtllm.DecodingMode.BeamSearch()
     elif decoding_mode == "medusa":
-        return trtllm.DecodingMode.MEDUSA
+        return trtllm.DecodingMode.Medusa()
     raise pb_utils.TritonModelException(
         f"decoding_mode value of '{decoding_mode}' is not supported.")

@@ -384,6 +384,19 @@ def get_peft_cache_config(self, model_config):
         kwargs = {k: v for k, v in kwargs.items() if v is not None}
         return trtllm.PeftCacheConfig(**kwargs)

+    def get_decoding_config(self, model_config):
+        kwargs = {
+            "medusa_choices":
+            parse_medusa_choices(get_parameter(model_config,
+                                               "medusa_choices")),
+            "decoding_mode":
+            convert_decoding_mode(get_parameter(model_config,
+                                                "decoding_mode")),
+        }
+        print(kwargs)
+        kwargs = {k: v for k, v in kwargs.items() if v is not None}
+        return trtllm.DecodingConfig(**kwargs)
+
     def get_executor_config(self, model_config):
         kwargs = {
             "max_beam_width":
@@ -403,12 +416,8 @@ def get_executor_config(self, model_config):
             self.get_parallel_config(model_config),
             "peft_cache_config":
             self.get_peft_cache_config(model_config),
-            "medusa_choices":
-            parse_medusa_choices(get_parameter(model_config,
-                                               "medusa_choices")),
-            "decoding_mode":
-            convert_decoding_mode(get_parameter(model_config,
-                                                "decoding_mode")),
+            "decoding_config":
+            self.get_decoding_config(model_config),
         }
         kwargs = {k: v for k, v in kwargs.items() if v is not None}
         return trtllm.ExecutorConfig(**kwargs)
@@ -553,22 +562,21 @@ def awaiter_loop(self):
                 with self.lock:
                     del self.triton_id_to_req_id[triton_id]
                     del self.req_id_to_response_sender[req_id]
+                # Remove local reference so response_sender can be cleaned properly.
+                del response_sender
         # TODO: Read stats: https://jirasw.nvidia.com/browse/TRTLLM-563

     def cancellation_loop(self):
         """Checks if any pending requests have been cancelled."""
         while self.running:
             time.sleep(self.cancellation_check_period_ms / 1000.0)
             with self.lock:
-                cancelled_ids = []
                 for req_id, (triton_id, response_sender
                              ) in self.req_id_to_response_sender.items():
                     if response_sender.is_cancelled():
                         self.executor.cancel_request(req_id)
-                        cancelled_ids.append((req_id, triton_id))
-                for req_id, triton_id in cancelled_ids:
-                    del self.triton_id_to_req_id[triton_id]
-                    del self.req_id_to_response_sender[req_id]
+                    # Remove local reference so response_sender can be cleaned properly.
+                    del response_sender

     def finalize(self):
         """`finalize` is called only once when the model is being unloaded.

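Taken together, the model.py changes rename the speculative-decoding request helper to match the new `ExternalDraftTokensConfig` API, switch `DecodingMode` from module-level constants to factory methods, and move `decoding_mode`/`medusa_choices` into a `DecodingConfig` that is passed to `ExecutorConfig` as `decoding_config`. The sketch below condenses that mapping; the import path is an assumption, while the class, method, and keyword names are taken from the diff above.

```python
# Condensed sketch of the decoding API after this commit. Import path assumed;
# DecodingMode factory methods and the DecodingConfig/ExecutorConfig keywords
# mirror the names shown in the diff.
import tensorrt_llm.bindings.executor as trtllm

_DECODING_MODES = {
    # Factory methods returning DecodingMode instances replace the old
    # module-level constants such as trtllm.DecodingMode.TOP_K.
    "auto": trtllm.DecodingMode.Auto,
    "top_k": trtllm.DecodingMode.TopK,
    "top_p": trtllm.DecodingMode.TopP,
    "top_k_top_p": trtllm.DecodingMode.TopKTopP,
    "beam_search": trtllm.DecodingMode.BeamSearch,
    "medusa": trtllm.DecodingMode.Medusa,
}


def convert_decoding_mode(decoding_mode):
    """Map a config string to a DecodingMode instance, or None if unset."""
    if decoding_mode is None:
        return None
    try:
        return _DECODING_MODES[decoding_mode]()
    except KeyError:
        raise ValueError(
            f"decoding_mode value of '{decoding_mode}' is not supported.")


# decoding_mode and medusa_choices now travel inside a DecodingConfig instead of
# being top-level ExecutorConfig arguments.
decoding_config = trtllm.DecodingConfig(
    decoding_mode=convert_decoding_mode("top_k_top_p"))
executor_config = trtllm.ExecutorConfig(max_beam_width=1,
                                        decoding_config=decoding_config)
```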
all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt

Lines changed: 6 additions & 0 deletions
@@ -513,3 +513,9 @@ parameters: {
     string_value: "${medusa_choices}"
   }
 }
+parameters: {
+  key: "gpu_weights_percent"
+  value: {
+    string_value: "${gpu_weights_percent}"
+  }
+}

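For context on how a `parameters` entry like this reaches Python code: the Triton Python backend passes the model configuration to the model as JSON, with each parameter exposed under `parameters.<key>.string_value`. The helper below mirrors the `get_parameter` pattern that model.py uses for other keys; treating `${...}` as an unfilled placeholder and casting this particular key to `float` are assumptions for illustration.

```python
# Minimal sketch: reading a config.pbtxt parameter from the Triton Python backend's
# JSON model config. Placeholder handling and the float cast are assumptions.
import json


def get_parameter(model_config, name, cast=str):
    value = model_config.get("parameters", {}).get(name, {}).get("string_value")
    if value is None or value.startswith("${"):
        return None  # parameter missing or left as an unfilled template placeholder
    return cast(value)


model_config = json.loads(
    '{"parameters": {"gpu_weights_percent": {"string_value": "0.5"}}}')
print(get_parameter(model_config, "gpu_weights_percent", float))  # -> 0.5
```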
all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/decode.py

Lines changed: 1 addition & 5 deletions
@@ -91,17 +91,13 @@ def validate(self):

         num_draft_tokens = _single_value(self.num_draft_tokens)
         stream = _single_value(self.stream)
-        gen_logits = _single_value(self.return_generation_logits)
+        _single_value(self.return_generation_logits)
         context_logits = _single_value(self.return_context_logits)

         if num_draft_tokens:
             _validate_that(
                 not stream,
                 "streaming is not supported with speculative decoding")
-            _validate_that(
-                not gen_logits,
-                "generation logits are not supported with speculative decoding"
-            )
             _validate_that(
                 not context_logits,
                 "context logits are not supported with speculative decoding")

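The net effect of this decode.py change is that requesting generation logits alongside draft tokens is no longer rejected, while streaming and context logits remain incompatible with speculative decoding. The sketch below illustrates the relaxed check with simplified stand-ins; the `Request` fields and helper definitions here are assumptions for illustration, not the actual BLS classes.

```python
# Simplified stand-in for the BLS validation after this change: generation logits
# are accepted with draft tokens; streaming and context logits are still rejected.
from dataclasses import dataclass


def _validate_that(condition, message):
    if not condition:
        raise ValueError(message)


@dataclass
class Request:
    num_draft_tokens: int = 0
    stream: bool = False
    return_generation_logits: bool = False
    return_context_logits: bool = False

    def validate(self):
        if self.num_draft_tokens:
            _validate_that(
                not self.stream,
                "streaming is not supported with speculative decoding")
            _validate_that(
                not self.return_context_logits,
                "context logits are not supported with speculative decoding")


# Now accepted: draft tokens together with generation logits.
Request(num_draft_tokens=4, return_generation_logits=True).validate()
```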
all_models/tests/test_python_backend.py

Lines changed: 17 additions & 19 deletions
@@ -291,12 +291,12 @@ def test_convert_request(triton_request: MockTritonRequest):
     assert (converted.embedding_bias == torch.tensor([0., 0., 0.])).all()
     assert converted.logits_post_processor_name is None

-    assert isinstance(converted.speculative_decoding_config,
-                      trtllm.SpeculativeDecodingConfig)
-    assert converted.speculative_decoding_config.tokens == [0, 1]
-    assert (converted.speculative_decoding_config.logits == torch.tensor(
+    assert isinstance(converted.external_draft_tokens_config,
+                      trtllm.ExternalDraftTokensConfig)
+    assert converted.external_draft_tokens_config.tokens == [0, 1]
+    assert (converted.external_draft_tokens_config.logits == torch.tensor(
         [[1.0, 2.0], [3.0, 4.0]])).all()
-    assert converted.speculative_decoding_config.acceptance_threshold == 1.0
+    assert converted.external_draft_tokens_config.acceptance_threshold == 1.0

     assert isinstance(converted.prompt_tuning_config,
                       trtllm.PromptTuningConfig)
@@ -345,7 +345,7 @@ def test_convert_request_minimal(triton_request_minimal: MockTritonRequest):
     assert converted.bad_words is None
     assert converted.embedding_bias is None
     assert converted.logits_post_processor_name is None
-    assert converted.speculative_decoding_config is None
+    assert converted.external_draft_tokens_config is None
     assert converted.prompt_tuning_config is None
     assert converted.lora_config is None

@@ -468,14 +468,12 @@ def test_convert_batching_type():

 def test_convert_decoding_mode():
     assert convert_decoding_mode(None) is None
-    assert convert_decoding_mode("none") == trtllm.DecodingMode.NONE
-    assert convert_decoding_mode("top_k") == trtllm.DecodingMode.TOP_K
-    assert convert_decoding_mode("top_p") == trtllm.DecodingMode.TOP_P
-    assert convert_decoding_mode(
-        "top_k_top_p") == trtllm.DecodingMode.TOP_K_TOP_P
-    assert convert_decoding_mode(
-        "beam_search") == trtllm.DecodingMode.BEAM_SEARCH
-    assert convert_decoding_mode("medusa") == trtllm.DecodingMode.MEDUSA
+    assert convert_decoding_mode("auto").isAuto()
+    assert convert_decoding_mode("top_k").isTopK()
+    assert convert_decoding_mode("top_p").isTopP()
+    assert convert_decoding_mode("top_k_top_p").isTopKandTopP()
+    assert convert_decoding_mode("beam_search").isBeamSearch()
+    assert convert_decoding_mode("medusa").isMedusa()
     with pytest.raises(
             Exception,
             match="decoding_mode value of 'other' is not supported"):
@@ -490,7 +488,7 @@ def model_config() -> Dict:
         "normalize_log_probs": "false",
         "gpt_model_type": "inflight_batching",
         "medusa_choices": "{1, 2, 3, 4}, {5, 6, 7}",
-        "decoding_mode": "top_k_top_p",
+        "decoding_mode": "medusa",
         "batch_scheduler_policy": "max_utilization",
         "enable_kv_cache_reuse": "true",
         "max_tokens_in_paged_kv_cache": "1",
@@ -516,8 +514,8 @@ def test_get_executor_config(model_config: Dict):
     assert config.enable_chunked_context == True
     assert config.normalize_log_probs == False
     assert config.batching_type == trtllm.BatchingType.INFLIGHT
-    assert config.medusa_choices == [[1, 2, 3, 4], [5, 6, 7]]
-    assert config.decoding_mode == trtllm.DecodingMode.TOP_K_TOP_P
+    assert config.decoding_config.medusa_choices == [[1, 2, 3, 4], [5, 6, 7]]
+    assert config.decoding_config.decoding_mode.isMedusa()
     assert config.scheduler_config.capacity_scheduler_policy == trtllm.CapacitySchedulerPolicy.MAX_UTILIZATION
     assert config.kv_cache_config.enable_block_reuse == True
     assert config.kv_cache_config.max_tokens == 1
@@ -556,8 +554,8 @@ def test_get_executor_config_minimal():
     assert config.enable_chunked_context == False
     assert config.normalize_log_probs == True
     assert config.batching_type == trtllm.BatchingType.INFLIGHT
-    assert config.medusa_choices is None
-    assert config.decoding_mode is None
+    assert config.decoding_config.decoding_mode is None
+    assert config.decoding_config.medusa_choices is None
     assert config.scheduler_config.capacity_scheduler_policy == trtllm.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT
     assert config.kv_cache_config.enable_block_reuse == False
     assert config.kv_cache_config.max_tokens is None

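The updated tests read decoding settings from `config.decoding_config` and use the new `is*()` predicates instead of comparing against module-level constants. A condensed sketch of what they exercise is below; the import path is an assumption, while the attribute and method names come from the diff above.

```python
# Sketch of the assertions' shape: decoding settings now live on
# ExecutorConfig.decoding_config rather than on ExecutorConfig directly.
import tensorrt_llm.bindings.executor as trtllm

config = trtllm.ExecutorConfig(decoding_config=trtllm.DecodingConfig(
    decoding_mode=trtllm.DecodingMode.Medusa(),
    medusa_choices=[[1, 2, 3, 4], [5, 6, 7]],
))

assert config.decoding_config.decoding_mode.isMedusa()
assert config.decoding_config.medusa_choices == [[1, 2, 3, 4], [5, 6, 7]]
```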
dockerfile/Dockerfile.trt_llm_backend

Lines changed: 2 additions & 4 deletions
@@ -1,5 +1,5 @@
 ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver
-ARG BASE_TAG=24.03-py3
+ARG BASE_TAG=24.04-py3

 FROM ${BASE_IMAGE}:${BASE_TAG} as base

@@ -37,9 +37,7 @@ RUN bash /tmp/install_mpi4py.sh && rm /tmp/install_mpi4py.sh
 # Use "pypi" (default) for x86_64 arch and "src_non_cxx11_abi" for aarch64 arch
 ARG TORCH_INSTALL_TYPE="pypi"
 COPY tensorrt_llm/docker/common/install_pytorch.sh install_pytorch.sh
-# Apply PyTorch patch for supporting compiling with CUDA 12.4 from source codes.
-COPY tensorrt_llm/docker/common/pytorch_pr_116072.patch /tmp/pytorch_pr_116072.patch
-RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh /tmp/pytorch_pr_116072.patch
+RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh

 FROM dev as trt_llm_builder
