@@ -419,17 +419,23 @@ void OVInferRequest::QueryStatus() {
         << " ";
 }
 
-void StatefulOVInferRequest::_pre_infer() {
+StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string d)
+    : OVInferRequest(std::move(infer_request)), device(d) {
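+  // When enabled, prefill re-sends the full chat history (cached host-side in
+  // PreProcessInferRequest) on each new prompt instead of only the new tokens.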
+  if ((device.find("NPU") != std::string::npos) || (device.find("GPU") != std::string::npos)) {
+    prefill_use_full_chat_history = true;
+  }
+}
+
+void StatefulOVInferRequest::PreProcessInferRequest() {
   // Since we can't seem to set at ORT GenAI layer right now, we just set it here
   // as a workaround.
   // TODO: Fix this.
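+  // beam_idx is the stateful-model input OpenVINO uses to reorder KV-cache entries
+  // for beam search; a single 0 leaves the batch-1 cache slot untouched.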
   ov::Tensor beam_idx = ov::Tensor(ov::element::i32, {1});
   std::fill_n(beam_idx.data<int32_t>(), 1, 0);
   ovInfReq.set_tensor("beam_idx", beam_idx);
 
-  // For NPU, we need to cache input_ids and position_ids for
-  // chat-mode support.
-  if (device.find("NPU") != std::string::npos) {
+  // If 'prefill full chat history' mode is enabled, we need to cache input_ids and position_ids.
+  if (prefill_use_full_chat_history) {
     auto input_ids_tensor = ovInfReq.get_tensor("input_ids");
 
     // add input_ids to our cache
@@ -454,6 +460,9 @@ void StatefulOVInferRequest::_pre_infer() {
   // if the input_ids size doesn't equal cached size of the input_ids
   // then it means that we're running 2nd (or later) prompt.
   if (input_ids_tensor.get_shape()[1] != cached_input_ids.size()) {
+    // Clear the internal KVCache state (note: this is a no-op for NPU)
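+    // This ensures the replayed full-history prefill starts from an empty device-side cache.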
+    ovInfReq.reset_state();
+
     // set a new input_ids tensor with the content of our cached input_ids
     {
       auto new_shape = input_ids_tensor.get_shape();
@@ -480,19 +489,22 @@ void StatefulOVInferRequest::_pre_infer() {
 }
 
 void StatefulOVInferRequest::StartAsync() {
-  _pre_infer();
+  PreProcessInferRequest();
   OVInferRequest::StartAsync();
 }
 
 void StatefulOVInferRequest::Infer() {
-  _pre_infer();
+  PreProcessInferRequest();
   OVInferRequest::Infer();
 }
 
 void StatefulOVInferRequest::RewindKVCache(size_t index) {
-  if (device == "NPU") {
-    std::cout << "RewindKVCache on NPU: Trimming cached input_ids / position_ids to length "
-              << index << std::endl;
+  LOGS_DEFAULT(INFO) << log_tag << "RewindKVCache: Rewinding OpenVINO-internal KVCache state to index=" << index << std::endl;
+
+  if (prefill_use_full_chat_history) {
+    // Clear the internal KVCache state (note: this is a no-op for NPU)
+    ovInfReq.reset_state();
+
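+    // Trim the host-side caches so the next prefill replays only the first 'index' tokens.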
     if (cached_input_ids.size() > index) {
       cached_input_ids.resize(index);
     }
@@ -501,8 +513,6 @@ void StatefulOVInferRequest::RewindKVCache(size_t index) {
       cached_position_ids.resize(index);
     }
   } else {
-    std::cout << "OVInferRequest::RewindKVCache: Trimming internal states to length = "
-              << index << std::endl;
     if (index == 0) {
       // in this case, since we're trimming *all* of the KVCache, just reset the state.
       ovInfReq.reset_state();