feat: load_config will now accept MAX_PROMPT_LEN & MIN_RESPONSE_LEN with enable_causallm=True (#660)

ankitm3k · ankitm3k · commit 761d911cabe3 · 2025-04-21T11:07:51.000+05:30
diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -15,6 +15,7 @@
 #include "core/providers/openvino/backends/basic_backend.h"
 #include "core/providers/openvino/onnx_ctx_model_helper.h"
 #include "core/providers/openvino/backend_manager.h"
+#include "core/providers/openvino/ov_stateful_patch_utils.h"
 
 namespace onnxruntime {
 
@@ -200,6 +201,15 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {
   if (!session_context_.load_config.empty()) {
     const std::map<std::string, ov::AnyMap>& target_config = session_context_.load_config;
 
+    if ((session_context_.device_type.find("NPU") != std::string::npos) && session_context_.enable_causallm) {
+      if (target_config.find("NPU") != target_config.end()) {
+        auto npu_genai_config = target_config.at("NPU");
+        CausalLMConfig().ApplyConfig(npu_genai_config, device_config);
+      } else {
+        LOGS_DEFAULT(WARNING) << "ORT GenAI CausalLMConfig Configuration not found.";
+      }
+    }
+
     if (session_context_.device_type.find("NPU") != std::string::npos) {
       auto npuw_config = target_config.at("NPU");
 
@@ -265,7 +275,8 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {
     auto set_target_properties = [&](const std::string& device, const ov::AnyMap& config_options,
                                      const std::vector<ov::PropertyName>& supported_properties) {
       for (const auto& [key, value] : config_options) {
-        if (key.find("NPUW") != std::string::npos) {
+        if ((key.find("NPUW") != std::string::npos) ||
+            ((device_config.find(key) != device_config.end()) && session_context_.enable_causallm)) {
           continue;
         }
         if (is_supported_and_mutable(key, supported_properties)) {
diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc
@@ -7,8 +7,7 @@
 #include "core/session/onnxruntime_cxx_api.h"
 #include "core/providers/shared_library/provider_api.h"
 #include "core/providers/openvino/backend_utils.h"
-
-// for make stateful utility function(s)
+#include "core/providers/openvino/backends/basic_backend.h"
 #include "core/providers/openvino/ov_stateful_patch_utils.h"
 
 using Exception = ov::Exception;
@@ -97,9 +96,9 @@ OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr<OVNetwork>& model,
   }
 
   LOGS_DEFAULT(INFO) << log_tag << "Converting from Stateless OV Model to Stateful OV Model" << std::endl;
-  bool status = IsStateful(model);
-  std::cout << "IsStateful Status:\t" << status << std::endl;
-  if (!status) {
+  bool model_status = IsStateful(model);
+  LOGS_DEFAULT(INFO) << log_tag << "Model IsStateful() Status:\t" << (model_status ? "True" : "False");
+  if (!model_status) {
     PatchStatefulDecoder(model);
   }
 
@@ -109,17 +108,25 @@ OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr<OVNetwork>& model,
   }
 
   auto kv_pos = GetKVAxesPos(model);
-  if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) {
-    std::cout << "kv_pos.batch = " << kv_pos.batch << std::endl;
-    std::cout << "kv_pos.seq_len = " << kv_pos.seq_len << std::endl;
-  }
 
   if (hw_target.find("NPU") != std::string::npos) {
     KVDesc kv_desc;
-    kv_desc.max_prompt_len = PopIntAndCast(config, "MAX_PROMPT_LEN").value_or(1024u);
-    kv_desc.min_response_len = PopIntAndCast(config, "MIN_RESPONSE_LEN").value_or(128u);
+    auto parse_genai_config = [&](const std::string& key, unsigned int default_value) {
+      return (config.count(key) && !config.at(key).empty() && config.at(key).as<std::string>() != "0") ?
+         config.at(key).as<unsigned int>() : default_value;
+    };
+
+    kv_desc.max_prompt_len = parse_genai_config("MAX_PROMPT_LEN", CausalLMConfig().max_prompt_len);
+    kv_desc.min_response_len = parse_genai_config("MIN_RESPONSE_LEN", CausalLMConfig().min_response_len);
+
+    // For compilation, MAX_PROMPT_LEN & MIN_RESPONSE_LEN should not be 0
+    if (kv_desc.max_prompt_len == 0 || kv_desc.min_response_len == 0) {
+      ORT_THROW(log_tag + "MAX_PROMPT_LEN and MIN_RESPONSE_LEN cannot be 0 or empty");
+    }
 
     if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) {
+      std::cout << "kv_pos.batch = " << kv_pos.batch << std::endl;
+      std::cout << "kv_pos.seq_len = " << kv_pos.seq_len << std::endl;
       std::cout << "kv_desc.max_prompt_len:\t" << kv_desc.max_prompt_len << std::endl;
       std::cout << "kv_desc.min_response_len:\t" << kv_desc.min_response_len << std::endl;
     }
@@ -132,10 +139,8 @@ OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr<OVNetwork>& model,
     ApplySliceBeforeMatmulTransformation(model);
   }
 
-  std::cout << "Compiling Stateful OV Model ..." << std::endl;
+  LOGS_DEFAULT(INFO) << log_tag << "Compiling OV Model using Stateful Transformation flow";
   compiled_model = OVCore::Get()->core.compile_model(model, hw_target, config);
-  std::cout << "Stateful OV Model Compilation Complete" << std::endl;
-
   OVExeNetwork exe(compiled_model, hw_target, true);
   return exe;
 }
diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h
@@ -57,6 +57,22 @@ struct KVDesc {
   uint32_t min_response_len;
 };
 
+// Prompt/response length settings for the CausalLM flow; defaults are 1024 and 128.
+struct CausalLMConfig {
+  unsigned int max_prompt_len = 1024;
+  unsigned int min_response_len = 128;
+
+  // Overrides the members from external_config when the keys exist, then writes both into genai_config.
+  void ApplyConfig(const ov::AnyMap& external_config, ov::AnyMap& genai_config) {
+    const auto prompt_it = external_config.find("MAX_PROMPT_LEN");
+    if (prompt_it != external_config.end()) max_prompt_len = prompt_it->second.as<unsigned int>();
+    const auto response_it = external_config.find("MIN_RESPONSE_LEN");
+    if (response_it != external_config.end()) min_response_len = response_it->second.as<unsigned int>();
+    genai_config["MAX_PROMPT_LEN"] = ov::Any(max_prompt_len);
+    genai_config["MIN_RESPONSE_LEN"] = ov::Any(min_response_len);
+  }
+};
+
 void UpdateNPUConfig(ov::AnyMap& config, const KVAxesPosition& kv_pos, const KVDesc& kv_desc);
 
 std::optional<ov::Any> PopOptionNew(ov::AnyMap& config, const std::string& option_name);