Commit f51f50c

Update TensorRT-LLM backend main branch (triton-inference-server#264)
* Update TensorRT-LLM backend
1 parent 3a61c37 commit f51f50c

File tree: 9 files changed, +121 -35 lines changed

README.md

Lines changed: 13 additions & 12 deletions

@@ -196,7 +196,7 @@ cp tensorrt_llm/examples/gpt/engines/fp16/4-gpu/* triton_model_repo/tensorrt_llm
 ```

 ### Modify the model configuration
-The following table shows the fields that need to be modified before deployment:
+The following table shows the fields that may need to be modified before deployment:

 *triton_model_repo/preprocessing/config.pbtxt*

@@ -209,17 +209,18 @@ The following table shows the fields that need to be modified before deployment:

 | Name | Description
 | :----------------------: | :-----------------------------: |
-| `decoupled` | Controls streaming. Decoupled mode must be set to `True` if using the streaming option from the client. |
-| `max_beam_width` | The maximum beam width that any request may ask for when using beam search |
-| `gpt_model_type` | Set to `inflight_fused_batching` when enabling in-flight batching support. To disable in-flight batching, set to `V1` |
-| `gpt_model_path` | Path to the TensorRT-LLM engines for deployment. In this example, the path should be set to `/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1` as the tensorrtllm_backend directory will be mounted to `/tensorrtllm_backend` within the container |
-| `max_tokens_in_paged_kv_cache` | The maximum size of the KV cache in number of tokens |
-| `max_attention_window_size` | When using techniques like sliding window attention, the maximum number of tokens that are attended to generate one token. Defaults to maximum sequence length |
-| `batch_scheduler_policy` | Set to `max_utilization` to greedily pack as many requests as possible in each current in-flight batching iteration. This maximizes the throughput but may result in overheads due to request pause/resume if KV cache limits are reached during execution. Set to `guaranteed_no_evict` to guarantee that a started request is never paused.|
-| `kv_cache_free_gpu_mem_fraction` | Set to a number between 0 and 1 to indicate the maximum fraction of GPU memory (after loading the model) that may be used for KV cache|
-| `max_num_sequences` | Maximum number of sequences that the in-flight batching scheme can maintain state for. Defaults to `max_batch_size` if `enable_trt_overlap` is `false` and to `2 * max_batch_size` if `enable_trt_overlap` is `true`, where `max_batch_size` is the TRT engine maximum batch size.
-| `enable_trt_overlap` | Set to `true` to partition available requests into 2 'microbatches' that can be run concurrently to hide exposed CPU runtime |
-| `exclude_input_in_output` | Set to `true` to only return completion tokens in a response. Set to `false` to return the prompt tokens concatenated with the generated tokens |
+| `gpt_model_type` | Mandatory. Set to `inflight_fused_batching` when enabling in-flight batching support. To disable in-flight batching, set to `V1`. |
+| `gpt_model_path` | Mandatory. Path to the TensorRT-LLM engines for deployment. In this example, the path should be set to `/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1` as the tensorrtllm_backend directory will be mounted to `/tensorrtllm_backend` within the container. |
+| `batch_scheduler_policy` | Mandatory. Set to `max_utilization` to greedily pack as many requests as possible into each in-flight batching iteration. This maximizes throughput but may incur overhead from request pause/resume if KV cache limits are reached during execution. Set to `guaranteed_no_evict` to guarantee that a started request is never paused. |
+| `decoupled` | Optional (default=`false`). Controls streaming. Decoupled mode must be set to `True` if using the streaming option from the client. |
+| `max_beam_width` | Optional (default=1). The maximum beam width that any request may ask for when using beam search. |
+| `max_tokens_in_paged_kv_cache` | Optional (default=unspecified). The maximum size of the KV cache in number of tokens. If unspecified, the value is interpreted as 'infinite'. The KV cache allocation is the minimum of `max_tokens_in_paged_kv_cache` and the value derived from `kv_cache_free_gpu_mem_fraction` below. |
+| `max_attention_window_size` | Optional (default=max_sequence_length). When using techniques like sliding window attention, the maximum number of tokens that are attended to in order to generate one token. The default attends to all tokens in the sequence. |
+| `kv_cache_free_gpu_mem_fraction` | Optional (default=0.85). Set to a number between 0 and 1 to indicate the maximum fraction of GPU memory (after loading the model) that may be used for KV cache. |
+| `max_num_sequences` | Optional (default=`max_batch_size` if `enable_trt_overlap` is `false`, and `2 * max_batch_size` if `enable_trt_overlap` is `true`, where `max_batch_size` is the TRT engine maximum batch size). Maximum number of sequences that the in-flight batching scheme can maintain state for. |
+| `enable_trt_overlap` | Optional (default=`true`). Set to `true` to partition available requests into 2 'microbatches' that can be run concurrently to hide exposed CPU runtime. |
+| `exclude_input_in_output` | Optional (default=`false`). Set to `true` to only return completion tokens in a response. Set to `false` to return the prompt tokens concatenated with the generated tokens. |
+| `normalize_log_probs` | Optional (default=`true`). Set to `false` to skip normalization of `output_log_probs`. |

 *triton_model_repo/postprocessing/config.pbtxt*
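For reference, once the `${...}` placeholders in *triton_model_repo/tensorrt_llm/config.pbtxt* are filled in, the mandatory fields from the table above might look as follows. This is a minimal sketch with illustrative values taken from the table, not a configuration shipped with the repository:

```
parameters: {
  key: "gpt_model_type"
  value: {
    string_value: "inflight_fused_batching"
  }
}
parameters: {
  key: "gpt_model_path"
  value: {
    string_value: "/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1"
  }
}
parameters: {
  key: "batch_scheduler_policy"
  value: {
    string_value: "guaranteed_no_evict"
  }
}
```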

all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt

Lines changed: 6 additions & 0 deletions

@@ -294,3 +294,9 @@ parameters: {
     string_value: "${enable_kv_cache_reuse}"
   }
 }
+parameters: {
+  key: "normalize_log_probs"
+  value: {
+    string_value: "${normalize_log_probs}"
+  }
+}
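At deployment time the `${normalize_log_probs}` placeholder above is replaced with a concrete value, like the other templated parameters in this file. As an illustration only (not a value shipped with the repository), a filled-in block that disables normalization of `output_log_probs`, per the README table above, would read:

```
parameters: {
  key: "normalize_log_probs"
  value: {
    string_value: "false"
  }
}
```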

inflight_batcher_llm/client/README.md

Lines changed: 28 additions & 0 deletions

@@ -0,0 +1,28 @@
# Sample TRT-LLM backend clients
Three sample TRT-LLM Triton clients are provided with the TRT-LLM Triton backend implementation.
* `e2e_grpc_speculative_decoding_client.py`: Demonstrates how to orchestrate two independent TRT-LLM models - a draft model and a target model - to achieve faster inferencing using speculative decoding. The high-level design has the client make a call to the draft model requesting a certain number of draft tokens, and then associate those draft tokens with a request to the target model. The target model returns some number of completion tokens, internally leveraging the draft tokens to speed up inference. The client wraps these back-to-back calls to the draft and target models in a loop to complete the full generation (a rough sketch of this loop follows this listing).
Example command:
```
python3 speculative_decoding_test.py --max-input-len 200 \
    --dataset ${LOGDIR}/prompts.csv \
    --url-draft ${DRAFT_MODEL_URL} \
    --url-target ${TARGET_MODEL_URL}
```

* `end_to_end_grpc_client.py`: Demonstrates sending a single request to a tritonserver running an ensemble of a preprocessor (tokenizer), the TRT-LLM model and a postprocessor (detokenizer), and getting a completion back from it.
Example command:
```
python3 end_to_end_grpc_client.py \
    --streaming --output-len 10 \
    --prompt "The only thing we have to fear is"
```

* `inflight_batcher_llm_client.py`: Isolates queries and responses to the TRT-LLM model alone. Invokes the tokenizer and detokenizer in the client script, i.e. outside the server running inference.
Example command:
```
python3 inflight_batcher_llm_client.py \
    --tokenizer-dir ${TOKENIZER_PATH} \
    --tokenizer-type ${TOKENIZER_TYPE} \
    --input-tokens-csv=${LOGDIR}/prompts.csv \
    --output-tokens-csv=${LOGDIR}/completions.csv
```
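To make the draft/target orchestration described in the client README above more concrete, here is a rough sketch of the loop using the Triton gRPC client. It is not the shipped `e2e_grpc_speculative_decoding_client.py`; the model name (`tensorrt_llm`) and the tensor names and layouts (`input_ids`, `input_lengths`, `request_output_len`, `draft_input_ids`, `output_ids`) are assumptions about a typical TRT-LLM model configuration and may need to be adapted to the actual config.pbtxt in use:

```
import numpy as np
import tritonclient.grpc as grpcclient


def _infer(client, tensors, model_name="tensorrt_llm"):
    # Build InferInput objects from a dict of int32 numpy arrays and run one request.
    inputs = []
    for name, arr in tensors.items():
        inp = grpcclient.InferInput(name, list(arr.shape), "INT32")
        inp.set_data_from_numpy(arr)
        inputs.append(inp)
    return client.infer(model_name=model_name, inputs=inputs)


def speculative_generate(prompt_ids, max_new_tokens, num_draft_tokens, url_draft, url_target):
    draft = grpcclient.InferenceServerClient(url=url_draft)
    target = grpcclient.InferenceServerClient(url=url_target)
    tokens = list(prompt_ids)
    target_len = len(prompt_ids) + max_new_tokens

    while len(tokens) < target_len:
        ids = np.array([tokens], dtype=np.int32)
        lengths = np.array([[len(tokens)]], dtype=np.int32)

        # 1) Ask the cheap draft model for a few candidate tokens.
        d = _infer(draft, {
            "input_ids": ids,
            "input_lengths": lengths,
            "request_output_len": np.array([[num_draft_tokens]], dtype=np.int32),
        })
        # Assumes output_ids is laid out [batch, beam, sequence] and includes the input tokens.
        draft_ids = d.as_numpy("output_ids")[0, 0, len(tokens):]

        # 2) Hand the draft tokens to the target model, which verifies them and
        #    returns the accepted tokens plus at least one token of its own.
        t = _infer(target, {
            "input_ids": ids,
            "input_lengths": lengths,
            "request_output_len": np.array([[num_draft_tokens + 1]], dtype=np.int32),
            "draft_input_ids": np.array([draft_ids], dtype=np.int32),
        })
        new_ids = t.as_numpy("output_ids")[0, 0, len(tokens):]

        if new_ids.size == 0:
            break  # nothing accepted or generated; avoid looping forever
        tokens.extend(int(x) for x in new_ids)

    return tokens[:target_len]
```

This illustrates the control flow only; consult the shipped clients for the full set of inputs and options.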

inflight_batcher_llm/src/model_instance_state.cc

Lines changed: 41 additions & 15 deletions

@@ -27,6 +27,10 @@

 #include "model_instance_state.h"

+#include "tensorrt_llm/common/mpiUtils.h"
+
+namespace mpi = tensorrt_llm::mpi;
+
 namespace triton::backend::inflight_batcher_llm
 {

@@ -181,6 +185,17 @@ ModelInstanceState::ModelInstanceState(ModelState* model_state, TRITONBACKEND_Mo
         TLLM_LOG_WARNING("enable_trt_overlap is not specified, will be set to true");
     }

+    bool normalizeLogProbs = true;
+    try
+    {
+        normalizeLogProbs = model_state_->GetParameter<bool>("normalize_log_probs");
+    }
+    catch (const std::exception& e)
+    {
+        // If parameter is not specified, just ignore
+        TLLM_LOG_WARNING("normalize_log_probs is not specified, will be set to true");
+    }
+
     bool excludeInputInOutput = false;
     try
     {
@@ -223,6 +238,7 @@ ModelInstanceState::ModelInstanceState(ModelState* model_state, TRITONBACKEND_Mo
     optionalParams.kvCacheConfig.maxAttentionWindow = maxAttentionWindow;
     optionalParams.kvCacheConfig.enableBlockReuse = enableKVCacheReuse;
     optionalParams.enableTrtOverlap = enableTrtOverlap;
+    optionalParams.normalizeLogProbs = normalizeLogProbs;

     mBatchManager = std::make_shared<GptManager>(
         mModelPath, mTrtGptModelType, maxBeamWidth, schedulerPolicy,
@@ -232,7 +248,7 @@ ModelInstanceState::ModelInstanceState(ModelState* model_state, TRITONBACKEND_Mo
         [this]() { return pollStopSignals(); }, [this](const std::string& s) { return logStats(s); }, optionalParams,
         std::nullopt, std::nullopt, excludeInputInOutput);

-    if (getCommWorldRank() != 0)
+    if (COMM_SESSION.getRank() != 0)
     {
         while (true)
         {
@@ -272,7 +288,7 @@ void ModelInstanceState::enqueue(TRITONBACKEND_Request** requests, const uint32_
         TRITONBACKEND_Request* request = requests[r];
         try
         {
-            auto requestId = utils::getRequestId(request);
+            auto requestId = utils::getRequestId(request, mRequestIdStrMap);
             bool stopRequest = utils::getRequestBooleanInputTensor(request, kStopInputTensorName);

             if (stopRequest)
@@ -326,8 +342,10 @@ std::list<std::shared_ptr<InferenceRequest>> ModelInstanceState::get_inference_r
         return rval;
     }

-    auto world_size = getCommWorldSize();
-    auto rank = getCommWorldRank();
+    auto const& commSession = COMM_SESSION;
+
+    auto world_size = commSession.getSize();
+    auto rank = commSession.getRank();
     if (rank == 0)
     {
         auto numPendingWorkItems = mWorkItemsQueue->numPendingWorkItems();
@@ -355,7 +373,7 @@ std::list<std::shared_ptr<InferenceRequest>> ModelInstanceState::get_inference_r
         if (world_size > 1)
         {
             int64_t num_new_work_items = rval.size();
-            bcast(&num_new_work_items, 1, MPI_TYPE_INT64_T, 0, COMM_WORLD);
+            commSession.bcast(num_new_work_items, 0);

             if (num_new_work_items > 0)
             {
@@ -366,19 +384,19 @@ std::list<std::shared_ptr<InferenceRequest>> ModelInstanceState::get_inference_r
                     packed.push_back(static_cast<int64_t>(vpacked.size()));
                     packed.insert(packed.end(), std::move_iterator(vpacked.begin()), std::move_iterator(vpacked.end()));
                 }
-                bcast(packed, 0, COMM_WORLD);
+                commSession.bcast(packed, 0);
             }
         }
     }
     else
     {
         // subordinate ranks hang until master rank sends work
         int64_t num_new_work_items;
-        bcast(&num_new_work_items, 1, MPI_TYPE_INT64_T, 0, COMM_WORLD);
+        commSession.bcast(num_new_work_items, 0);
         if (num_new_work_items > 0)
         {
             std::vector<int64_t> packed;
-            bcast(packed, 0, COMM_WORLD);
+            commSession.bcast(packed, 0);
             int64_t* packed_ptr = packed.data();
             for (int64_t count = 0; count < num_new_work_items; ++count)
             {
@@ -395,9 +413,14 @@ std::list<std::shared_ptr<InferenceRequest>> ModelInstanceState::get_inference_r
 void ModelInstanceState::sendResponse(
     uint64_t requestId, std::list<NamedTensor> const& response_tensors, bool final_response, const std::string& errMsg)
 {
-    if (getCommWorldRank() == 0)
+    if (COMM_SESSION.getRank() == 0)
     {
-        std::string errStr = std::string("Failed to send Triton response for requestId: ") + std::to_string(requestId);
+        std::string errStr = std::string("Failed to send Triton response for requestId: ")
+            + utils::getRequestIdStr(requestId, mRequestIdStrMap);
+        if (final_response)
+        {
+            mRequestIdStrMap.erase(requestId);
+        }
         try
         {
             auto workItem = mWorkItemsQueue->getInProgressWorkItem(requestId);
@@ -421,31 +444,34 @@ std::unordered_set<uint64_t> ModelInstanceState::pollStopSignals()

     int64_t nStoppedReqIds = static_cast<int64_t>(stoppedReqIds.size());

-    if (getCommWorldSize() > 1)
+    auto const& commSession = COMM_SESSION;
+
+    if (commSession.getSize() > 1)
     {
         // Broadcast number of stopped requests
-        bcast(&nStoppedReqIds, 1, MPI_TYPE_INT64_T, 0, COMM_WORLD);
+        commSession.bcast(nStoppedReqIds, 0);

         if (nStoppedReqIds > 0)
         {
             // Broadcast stopped requests Ids
-            if (getCommWorldRank() == 0)
+            if (commSession.getRank() == 0)
             {
                 // Store the requestIds in a contiguous vector
                 std::vector<uint64_t> stoppedReqIdsVec(stoppedReqIds.begin(), stoppedReqIds.end());
-                bcast(stoppedReqIdsVec.data(), stoppedReqIdsVec.size(), MPI_TYPE_UINT64_T, 0, COMM_WORLD);
+                commSession.bcast(stoppedReqIdsVec.data(), stoppedReqIdsVec.size(), mpi::MpiType::kUINT64, 0);
             }
             else
             {
                 std::vector<uint64_t> stoppedReqIdsVec(nStoppedReqIds);
-                bcast(stoppedReqIdsVec.data(), stoppedReqIdsVec.size(), MPI_TYPE_UINT64_T, 0, COMM_WORLD);
+                commSession.bcast(stoppedReqIdsVec.data(), stoppedReqIdsVec.size(), mpi::MpiType::kUINT64, 0);
                 // Store the requestIds in the set
                 stoppedReqIds.clear();
                 std::copy(stoppedReqIdsVec.begin(), stoppedReqIdsVec.end(),
                     std::inserter(stoppedReqIds, stoppedReqIds.end()));
             }
         }
     }
+
     return stoppedReqIds;
 }

inflight_batcher_llm/src/model_instance_state.h

Lines changed: 3 additions & 2 deletions

@@ -28,6 +28,7 @@
 #define _GLIBCXX_USE_CXX11_ABI 0

 #include <nlohmann/json.hpp>
+#include <unordered_map>

 #include "triton/backend/backend_common.h"
 #include "triton/core/tritonbackend.h"
@@ -40,15 +41,13 @@
 #include "tensorrt_llm/batch_manager/kvCacheConfig.h"
 #include "tensorrt_llm/batch_manager/namedTensor.h"
 #include "tensorrt_llm/batch_manager/trtGptModelOptionalParams.h"
-#include "tensorrt_llm/common/mpiUtils.h"

 #include "model_state.h"
 #include "work_item.h"
 #include "work_items_queue.h"

 using namespace tensorrt_llm::batch_manager;
 using namespace tensorrt_llm::batch_manager::batch_scheduler;
-using namespace tensorrt_llm::mpi;

 namespace triton::backend::inflight_batcher_llm
 {
@@ -126,6 +125,8 @@ class ModelInstanceState

     std::shared_ptr<GptManager> mBatchManager;
     std::unique_ptr<WorkItemsQueue> mWorkItemsQueue;
+
+    std::unordered_map<uint64_t, std::string> mRequestIdStrMap;
 };

 } // namespace triton::backend::inflight_batcher_llm

inflight_batcher_llm/src/utils.cc

Lines changed: 22 additions & 3 deletions

@@ -139,7 +139,7 @@ TRITONSERVER_DataType to_triton_datatype(nvinfer1::DataType data_type)
     }
 }

-uint64_t getRequestId(TRITONBACKEND_Request* request)
+uint64_t getRequestId(TRITONBACKEND_Request* request, std::unordered_map<uint64_t, std::string>& requestIdStrMap)
 {
     const char* charRequestId;
     TRITONBACKEND_RequestId(request, &charRequestId);
@@ -155,15 +155,34 @@ uint64_t getRequestId(TRITONBACKEND_Request* request)
             }
             catch (const std::exception& e)
             {
-                std::string err = std::string("Invalid requestId, must be uint64_t. Got ") + strRequestId;
-                throw std::runtime_error(err);
+                std::hash<std::string> hasher;
+                requestId = hasher(strRequestId);
+
+                // Check for hash collisions
+                // If the requestId already exists in the map with a different string, increment the ID and check again
+                for (auto it = requestIdStrMap.find(requestId); it != requestIdStrMap.end() && it->second != strRequestId;
+                     it = requestIdStrMap.find(requestId))
+                {
+                    requestId++;
+                }
             }
+            requestIdStrMap.insert({requestId, strRequestId});
         }
     }

     return requestId;
 }

+std::string getRequestIdStr(uint64_t requestId, std::unordered_map<uint64_t, std::string> const& requestIdStrMap)
+{
+    auto it = requestIdStrMap.find(requestId);
+    if (it != requestIdStrMap.end())
+    {
+        return it->second;
+    }
+    return std::to_string(requestId);
+}
+
 std::unordered_set<std::string> getRequestOutputNames(TRITONBACKEND_Request* request)
 {
     std::unordered_set<std::string> outputNames;

inflight_batcher_llm/src/utils.h

Lines changed: 6 additions & 2 deletions

@@ -50,9 +50,13 @@ nvinfer1::DataType to_trt_datatype(TRITONSERVER_DataType data_type);
 /// @brief Convert TRT datatype to Triton datatype
 TRITONSERVER_DataType to_triton_datatype(nvinfer1::DataType data_type);

-/// @brief get the requestId of the request
+/// @brief get the requestId of the request and update requestIdStrMap
 /// @return Returns 0 if not specified. Throws an error if request_id cannot be convert to uint64_t
-uint64_t getRequestId(TRITONBACKEND_Request* request);
+uint64_t getRequestId(TRITONBACKEND_Request* request, std::unordered_map<uint64_t, std::string>& requestIdStrMap);
+
+/// @brief get the original requestId string from the uint64_t requestId
+/// @return If uint64_t id is not present in requestIdStrMap, returns std::to_string(requestId)
+std::string getRequestIdStr(uint64_t requestId, std::unordered_map<uint64_t, std::string> const& requestIdStrMap);

 /// @brief Get the requested output names
 std::unordered_set<std::string> getRequestOutputNames(TRITONBACKEND_Request* request);

tensorrt_llm

Submodule tensorrt_llm updated 211 files

tools/version.txt

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+e880735c6b44a0cf74ae1d37d23d529c86deb65d

0 commit comments
