triton-inference-server · HennerM · Jan 7, 2024
diff --git a/include/triton/backend/backend_output_responder.h b/include/triton/backend/backend_output_responder.h
@@ -52,13 +52,31 @@ class BackendOutputResponder {
   // The caller can optionally provide 'event' for internal synchronization
   // instead of using 'stream'.
   explicit BackendOutputResponder(
-      TRITONBACKEND_Request** requests, const uint32_t request_count,
+      TRITONBACKEND_Request** requests,
       std::vector<TRITONBACKEND_Response*>* responses,
       TRITONBACKEND_MemoryManager* memory_manager,
       const bool first_dim_batching, const bool pinned_enabled,
       cudaStream_t stream, cudaEvent_t event = nullptr,
       bool copy_on_stream = false)
-      : need_sync_(false), requests_(requests), request_count_(request_count),
+      : need_sync_(false), requests_(requests),
+        responses_(responses), memory_manager_(memory_manager),
+        first_dim_batching_(first_dim_batching),
+        pinned_enabled_(pinned_enabled),
+        use_async_cpu_copy_(triton::common::AsyncWorkQueue::WorkerCount() > 1),
+        stream_(stream), event_(event), pending_pinned_byte_size_(0),
+        copy_on_stream_(copy_on_stream)
+  {
+  }
+
+  // Legacy constructor for backwards compatibility with request_count parameter
+  explicit BackendOutputResponder(
+      TRITONBACKEND_Request** requests, const uint32_t /* request_count */,
+      std::vector<TRITONBACKEND_Response*>* responses,
+      TRITONBACKEND_MemoryManager* memory_manager,
+      const bool first_dim_batching, const bool pinned_enabled,
+      cudaStream_t stream, cudaEvent_t event = nullptr,
+      bool copy_on_stream = false)
+      : need_sync_(false), requests_(requests),
         responses_(responses), memory_manager_(memory_manager),
         first_dim_batching_(first_dim_batching),
         pinned_enabled_(pinned_enabled),
@@ -75,12 +93,12 @@ class BackendOutputResponder {
   // max_batch_size value instead of having it provided directly as in
   // the above constructor.
   explicit BackendOutputResponder(
-      TRITONBACKEND_Request** requests, const uint32_t request_count,
+      TRITONBACKEND_Request** requests, const uint32_t /* request_count */,
       std::vector<TRITONBACKEND_Response*>* responses, const int max_batch_size,
       TRITONBACKEND_MemoryManager* memory_manager, const bool pinned_enabled,
       cudaStream_t stream, cudaEvent_t event = nullptr,
       bool copy_on_stream = false)
-      : need_sync_(false), requests_(requests), request_count_(request_count),
+      : need_sync_(false), requests_(requests),
         responses_(responses), memory_manager_(memory_manager),
         first_dim_batching_(max_batch_size >= 1),
         pinned_enabled_(pinned_enabled),
@@ -152,7 +170,6 @@ class BackendOutputResponder {
 
   bool need_sync_;
   TRITONBACKEND_Request** requests_;
-  const uint32_t request_count_;
   std::vector<TRITONBACKEND_Response*>* responses_;
   TRITONBACKEND_MemoryManager* memory_manager_;
   const bool first_dim_batching_;