diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc
index 0a9f9889ad2d8..dc99687e78d30 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc
@@ -36,6 +36,27 @@ class ClipOpBuilder : public BaseOpBuilder {
   Status ExplictOpCheck(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const;
 };
 
+static Status ProcessClipMinMax(QnnModelWrapper& qnn_model_wrapper,
+                                const NodeUnitIODef& input,
+                                float& float_value) {
+  TensorInfo input_info = {};
+  std::vector<uint8_t> val_bytes;
+  ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(input, input_info));
+  assert(input_info.is_initializer);  // Checked by ExplicitOpCheck().
+  if (QNN_DATATYPE_FLOAT_16 == input_info.qnn_data_type) {
+    ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info.initializer_tensor, val_bytes));
+    MLFloat16 fp16_value = *reinterpret_cast<const MLFloat16*>(val_bytes.data());
+    float_value = fp16_value.ToFloat();
+  } else {
+    ORT_RETURN_IF_NOT(QNN_DATATYPE_FLOAT_32 == input_info.qnn_data_type,
+                      "QNN EP: The 'min' or 'max' input of the Clip operator must be of type float16 or float32.");
+    ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info.initializer_tensor, val_bytes));
+    float_value = *reinterpret_cast<const float*>(val_bytes.data());
+  }
+
+  return Status::OK();
+}
+
 Status ClipOpBuilder::ExplictOpCheck(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const {
   if (node_unit.Inputs().size() > 1) {
     const auto& min_input_name = node_unit.Inputs()[1].node_arg.Name();
@@ -75,54 +96,36 @@ Status ClipOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wra
   const Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32;
   std::vector<std::string> param_tensor_names;
 
-  auto get_f32_from_bytes = [](const std::vector<uint8_t>& bytes, float default_val) -> float {
-    return bytes.empty() ? default_val : *reinterpret_cast<const float*>(bytes.data());
-  };
-
   // Set the 'min' parameter.
-  {
-    std::vector<uint8_t> min_val_bytes;
-
-    if (num_inputs > 1 && !inputs[1].node_arg.Name().empty()) {
-      TensorInfo min_input_info = {};
-      ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[1], min_input_info));
-      ORT_RETURN_IF_NOT(min_input_info.qnn_data_type == qnn_data_type,
-                        "QNN EP: The 'min' input of the Clip operator must be of type float32.");
-      assert(min_input_info.is_initializer);  // Checked by ExplicitOpCheck().
-      ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*min_input_info.initializer_tensor, min_val_bytes));
-    }
+  Qnn_Scalar_t min_qnn_scalar = QNN_SCALAR_INIT;
+  min_qnn_scalar.dataType = qnn_data_type;
 
-    Qnn_Scalar_t min_qnn_scalar = QNN_SCALAR_INIT;
-    min_qnn_scalar.dataType = qnn_data_type;
-    min_qnn_scalar.floatValue = get_f32_from_bytes(min_val_bytes, std::numeric_limits<float>::lowest());
-    QnnParamWrapper min_value_param(node_unit.Index(), node_unit.Name(), QNN_OP_RELU_MIN_MAX_PARAM_MIN_VALUE,
-                                    min_qnn_scalar);
-    param_tensor_names.push_back(min_value_param.GetParamTensorName());
-    qnn_model_wrapper.AddParamWrapper(std::move(min_value_param));
+  if (num_inputs > 1 && !inputs[1].node_arg.Name().empty()) {
+    ORT_RETURN_IF_ERROR(ProcessClipMinMax(qnn_model_wrapper, inputs[1], min_qnn_scalar.floatValue));
+  } else {
+    min_qnn_scalar.floatValue = std::numeric_limits<float>::lowest();
   }
+
+  QnnParamWrapper min_value_param(node_unit.Index(), node_unit.Name(), QNN_OP_RELU_MIN_MAX_PARAM_MIN_VALUE,
+                                  min_qnn_scalar);
+  param_tensor_names.push_back(min_value_param.GetParamTensorName());
+  qnn_model_wrapper.AddParamWrapper(std::move(min_value_param));
+
   // Set the 'max' parameter.
-  {
-    std::vector<uint8_t> max_val_bytes;
-
-    if (num_inputs > 2 && !inputs[2].node_arg.Name().empty()) {
-      TensorInfo max_input_info = {};
-      ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[2], max_input_info));
-      ORT_RETURN_IF_NOT(max_input_info.qnn_data_type == qnn_data_type,
-                        "QNN EP: The 'max' input of the Clip operator must of type float32.");
-      assert(max_input_info.is_initializer);  // Checked by ExplicitOpCheck().
-      ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*max_input_info.initializer_tensor, max_val_bytes));
-    }
+  Qnn_Scalar_t max_qnn_scalar = QNN_SCALAR_INIT;
+  max_qnn_scalar.dataType = qnn_data_type;
 
-    Qnn_Scalar_t max_qnn_scalar = QNN_SCALAR_INIT;
-    max_qnn_scalar.dataType = qnn_data_type;
-    max_qnn_scalar.floatValue = get_f32_from_bytes(max_val_bytes, std::numeric_limits<float>::max());
-    QnnParamWrapper max_value_param(node_unit.Index(), node_unit.Name(), QNN_OP_RELU_MIN_MAX_PARAM_MAX_VALUE,
-                                    max_qnn_scalar);
-    param_tensor_names.push_back(max_value_param.GetParamTensorName());
-    qnn_model_wrapper.AddParamWrapper(std::move(max_value_param));
+  if (num_inputs > 2 && !inputs[2].node_arg.Name().empty()) {
+    ORT_RETURN_IF_ERROR(ProcessClipMinMax(qnn_model_wrapper, inputs[2], max_qnn_scalar.floatValue));
+  } else {
+    max_qnn_scalar.floatValue = std::numeric_limits<float>::max();
   }
+
+  QnnParamWrapper max_value_param(node_unit.Index(), node_unit.Name(), QNN_OP_RELU_MIN_MAX_PARAM_MAX_VALUE,
+                                  max_qnn_scalar);
+  param_tensor_names.push_back(max_value_param.GetParamTensorName());
+  qnn_model_wrapper.AddParamWrapper(std::move(max_value_param));
+
   ORT_RETURN_IF_ERROR(ProcessOutputs(qnn_model_wrapper, node_unit,
                                      std::move(input_names),
                                      std::move(param_tensor_names),
diff --git a/onnxruntime/test/providers/qnn/clip_op_test.cc b/onnxruntime/test/providers/qnn/clip_op_test.cc
index 15ba3b5de2fa1..e899f870f9e78 100644
--- a/onnxruntime/test/providers/qnn/clip_op_test.cc
+++ b/onnxruntime/test/providers/qnn/clip_op_test.cc
@@ -182,6 +182,44 @@ TEST_F(QnnHTPBackendTests, Clip_U8_Rank5) {
                   ExpectedEPNodeAssignment::All);
 }
 
+// Test FP16 Clip with min (FP16)
+TEST_F(QnnHTPBackendTests, Clip_FP16) {
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  auto f32_input = TestInputDef<float>({1, 3, 2, 2}, false,
+                                       {-10.0f, -8.0f, -3.5f, 2.2f,
+                                        1.3f, 1.5f, 3.2f, 5.8f,
+                                        5.8f, 9.7f, 8.5f, 8.9f});
+  std::vector<MLFloat16> f16_data;
+  std::for_each(f32_input.GetRawData().begin(), f32_input.GetRawData().end(),
+                [&f16_data](const float data) {
+                  f16_data.push_back(static_cast<MLFloat16>(data));
+                });
+  auto f16_input = TestInputDef<MLFloat16>({1, 3, 2, 2}, false, f16_data);
+
+  const float min_f32 = 1.2f;
+  const MLFloat16 min_f16 = static_cast<MLFloat16>(min_f32);
+  auto f32_min_input = TestInputDef<float>({}, true, {min_f32});
+  auto f16_min_input = TestInputDef<MLFloat16>({}, true, {min_f16});
+
+  auto f32_model_builder = BuildOpTestCase<float, float>("Clip", {f32_input}, {f32_min_input}, {});
+  auto f16_model_builder = BuildOpTestCase<MLFloat16, MLFloat16>("Clip", {f16_input}, {f16_min_input}, {});
+  int opset = 13;
+  ExpectedEPNodeAssignment expected_ep_assignment = ExpectedEPNodeAssignment::All;
+
+  TestFp16ModelAccuracy(f32_model_builder,
+                        f16_model_builder,
+                        provider_options,
+                        opset,
+                        expected_ep_assignment);
+}
+
 #endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/providers/qnn/qnn_test_utils.h b/onnxruntime/test/providers/qnn/qnn_test_utils.h
index f4febd99ddae7..c0cfe3f0342fd 100644
--- a/onnxruntime/test/providers/qnn/qnn_test_utils.h
+++ b/onnxruntime/test/providers/qnn/qnn_test_utils.h
@@ -467,6 +467,187 @@ inline void TestQDQModelAccuracy(const GetTestModelFn& f32_model_fn, const GetTe
   }
 }
 
+/**
+ * Tests the accuracy of an FP16 model on QNN EP by running 3 inferences:
+ *
+ * 1. float32 model on CPU EP (baseline)
+ * 2. FP16 model on CPU EP
+ * 3. FP16 model on QNN EP
+ *
+ * This function checks that running the FP16 model on QNN EP (#3) is at least as accurate (within a small tolerance)
+ * as running the FP16 model on CPU EP (#2). Accuracy is primarily measured against the baseline (#1).
+ *
+ * \param f32_model_fn Function that builds the float model (baseline for comparison).
+ * \param f16_model_fn Function that builds the FP16 model (run by CPU EP and QNN EP).
+ * \param qnn_options QNN EP provider options.
+ * \param opset_version The opset version.
+ * \param expected_ep_assignment Describes which nodes should be assigned to the EP.
+ * \param tolerance The maximum allowed difference (as a fraction) between the relative errors of the FP16 model
+ *                  on QNN EP and on CPU EP, both measured against the float32 baseline.
+ * \param log_severity The logger's severity setting.
+ */
+inline void TestFp16ModelAccuracy(const GetTestModelFn& f32_model_fn,
+                                  const GetTestModelFn& f16_model_fn,
+                                  ProviderOptions qnn_options,
+                                  int opset_version,
+                                  ExpectedEPNodeAssignment expected_ep_assignment,
+                                  float tolerance = 0.004f,
+                                  logging::Severity log_severity = logging::Severity::kERROR,
+                                  const std::string& qnn_ctx_model_path = "",
+                                  const std::unordered_map<std::string, std::string>& session_option_pairs = {}) {
+  // Add kMSDomain to cover contrib ops like Gelu.
+  const std::unordered_map<std::string, int> domain_to_version = {{"", opset_version}, {kMSDomain, 1}};
+
+  auto& logging_manager = DefaultLoggingManager();
+  logging_manager.SetDefaultLoggerSeverity(log_severity);
+
+  // Create float model and serialize it to a string.
+  onnxruntime::Model f32_model("f32_model", false, ModelMetaData(), PathString(),
+                               IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                               logging_manager.DefaultLogger());
+  ModelTestBuilder f32_helper(f32_model.MainGraph());
+  std::string f32_model_data;
+  f32_model_fn(f32_helper);
+  f32_helper.SetGraphOutputs();
+  ASSERT_STATUS_OK(f32_model.MainGraph().Resolve());
+  f32_model.ToProto().SerializeToString(&f32_model_data);
+
+  // Run f32 model on CPU EP and collect outputs.
+  std::vector<OrtValue> cpu_f32_outputs;
+  InferenceModel(f32_model_data, "f32_model_logger", {}, ExpectedEPNodeAssignment::All,
+                 f32_helper.feeds_, cpu_f32_outputs);
+  ASSERT_FALSE(cpu_f32_outputs.empty());
+
+  const size_t num_outputs = cpu_f32_outputs.size();
+
+  // Collect the float32 output values and element types.
+  std::vector<gsl::span<const float>> output_vals;
+  std::vector<int32_t> output_types;
+  output_vals.resize(num_outputs);
+  output_types.resize(num_outputs);
+
+  for (size_t i = 0; i < num_outputs; i++) {
+    auto& tensor = cpu_f32_outputs[i].Get<Tensor>();
+    int32_t elem_type = tensor.GetElementType();
+
+    if (elem_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
+      output_vals[i] = tensor.DataAsSpan<float>();
+    }
+
+    output_types[i] = elem_type;
+  }
+
+  // Create FP16 model and serialize it to a string.
+  onnxruntime::Model f16_model("fp16_model", false, ModelMetaData(), PathString(),
+                               IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                               logging_manager.DefaultLogger());
+  ModelTestBuilder f16_helper(f16_model.MainGraph());
+  std::string f16_model_data;
+  f16_model_fn(f16_helper);
+  f16_helper.SetGraphOutputs();
+  ASSERT_STATUS_OK(f16_model.MainGraph().Resolve());
+  f16_model.ToProto().SerializeToString(&f16_model_data);
+
+  bool is_qnn_ep = true;
+  TryEnableQNNSaver(qnn_options);
+  std::vector<OrtValue> qnn_f16_outputs;
+  if (!qnn_ctx_model_path.empty()) {
+    onnx::ModelProto model_proto;
+    onnxruntime::Model qnn_ctx_model;
+    // Load the QNN context cache model from the specified path.
+    ASSERT_STATUS_OK(qnn_ctx_model.Load(ToPathString(qnn_ctx_model_path), model_proto));
+    std::string qnn_ctx_model_data;
+    model_proto.SerializeToString(&qnn_ctx_model_data);
+    // Run the QNN context cache model on QNN EP and collect outputs.
+    InferenceModel(qnn_ctx_model_data, "qnn_ctx_model_logger", qnn_options,
+                   expected_ep_assignment, f16_helper.feeds_, qnn_f16_outputs, is_qnn_ep, session_option_pairs);
+  } else {
+    // Run the FP16 model on QNN EP and collect outputs.
+    // Only this FP16 model inference on QNN EP needs the extra session options.
+    InferenceModel(f16_model_data, "fp16_model_logger", qnn_options, expected_ep_assignment,
+                   f16_helper.feeds_, qnn_f16_outputs, is_qnn_ep, session_option_pairs);
+  }
+
+  if (expected_ep_assignment != ExpectedEPNodeAssignment::None) {
+    // Run the FP16 model on CPU EP and collect outputs.
+    std::vector<OrtValue> cpu_f16_outputs;
+    InferenceModel(f16_model_data, "fp16_model_logger", {}, ExpectedEPNodeAssignment::All,
+                   f16_helper.feeds_, cpu_f16_outputs);
+    ASSERT_EQ(cpu_f16_outputs.size(), num_outputs);
+    ASSERT_EQ(qnn_f16_outputs.size(), num_outputs);
+
+    // Limit the number of error messages in case a test with large data fails.
+    size_t max_error_count = 10;
+    size_t error_count = 0;
+
+    // Compare the accuracy of the FP16 results against the float32 model.
+    // QNN EP must be at least as accurate as CPU EP when running the FP16 model.
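+    // For each output element, the QNN EP result passes if either:
+    //   (a) its relative error against the f32@CPU_EP baseline is no larger than that of the f16@CPU_EP result, or
+    //   (b) the difference between the two relative errors is within `tolerance`.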
+    const std::string base_output_name = "output_";
+    for (size_t i = 0; i < num_outputs; i++) {
+      std::string debug_output_name = base_output_name + std::to_string(i);
+      auto& cpu_f16_tensor = cpu_f16_outputs[i].Get<Tensor>();
+      auto& qnn_f16_tensor = qnn_f16_outputs[i].Get<Tensor>();
+
+      ASSERT_EQ(cpu_f16_tensor.GetElementType(), ONNX_NAMESPACE::TensorProto_DataType_FLOAT16);
+      ASSERT_EQ(qnn_f16_tensor.GetElementType(), ONNX_NAMESPACE::TensorProto_DataType_FLOAT16);
+      ASSERT_EQ(output_types[i], ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+
+      const size_t num_vals = output_vals[i].size();
+      gsl::span<const float> cpu_f32_vals = output_vals[i];
+      gsl::span<const MLFloat16> cpu_f16_vals = cpu_f16_tensor.DataAsSpan<MLFloat16>();
+      gsl::span<const MLFloat16> qnn_f16_vals = qnn_f16_tensor.DataAsSpan<MLFloat16>();
+
+      ASSERT_EQ(num_vals, cpu_f16_vals.size());
+      ASSERT_EQ(num_vals, qnn_f16_vals.size());
+
+      float max_f16_cpu_err = 0.0f;
+      float max_f16_qnn_err = 0.0f;
+
+      for (size_t j = 0; j < num_vals && error_count < max_error_count; j++) {
+        const float expected_val = cpu_f32_vals[j];           // f32@CPU_EP val ("ground-truth")
+        const float qnn_f16_val = qnn_f16_vals[j].ToFloat();  // f16@QNN_EP val
+        const float cpu_f16_val = cpu_f16_vals[j].ToFloat();  // f16@CPU_EP val
+
+        // Get errors of f16@CPU_EP and f16@QNN_EP against f32@CPU_EP.
+        const float cpu_relative_err = std::fabs(expected_val - cpu_f16_val) / expected_val;
+        const float qnn_relative_err = std::fabs(expected_val - qnn_f16_val) / expected_val;
+
+        // Also compare the two FP16 results against each other via the difference of their relative errors.
+        const float f16_vals_err = std::fabs(qnn_relative_err - cpu_relative_err);
+
+        // True if f16@QNN_EP is at least as accurate as f16@CPU_EP when compared to the expected f32@CPU_EP value.
+        const bool is_as_accurate_as_cpu_ep = qnn_relative_err <= cpu_relative_err;
+
+        // True if the difference between the relative errors of f16@QNN_EP and f16@CPU_EP is within tolerance.
+        const bool f16_vals_diff_within_tolerance = f16_vals_err <= tolerance;
+
+        const bool passed_test = is_as_accurate_as_cpu_ep || f16_vals_diff_within_tolerance;
+        if (!passed_test) {
+          ++error_count;
+        }
+        EXPECT_TRUE(passed_test)
+            << "Inaccuracy detected for output '" << debug_output_name
+            << "', element " << j << ", tolerance=" << (tolerance * 100) << "%"
+            << ".\nExpected val (f32@CPU_EP): " << expected_val << "\n"
+            << "f16@QNN_EP val: " << qnn_f16_val << " (err: " << qnn_relative_err << ")\n"
+            << "f16@CPU_EP val: " << cpu_f16_val << " (err: " << cpu_relative_err << ")\n";
+
+        max_f16_cpu_err = std::max(max_f16_cpu_err, cpu_relative_err);
+        max_f16_qnn_err = std::max(max_f16_qnn_err, qnn_relative_err);
+      }
+
+      if (error_count > 0) {
+        std::cerr << std::endl
+                  << "[WARNING]: Output " << i
+                  << " required larger tolerance to pass accuracy checks" << std::endl
+                  << "Max relative error of f16@CPU_EP against f32@CPU_EP = " << max_f16_cpu_err << std::endl
+                  << "Max relative error of f16@QNN_EP against f32@CPU_EP = " << max_f16_qnn_err << std::endl;
+      }
+    }
+  }
+}
+
 /**
  * Creates and returns an input in a test model graph. The input's characteristics are defined
  * by the provided input definition.