Fix Clip op builder for FP16 support (microsoft#19825)
### Description
Fix Clip op builder for FP16 support.

### Motivation and Context
Enables MobileNet v2 FP16 model inference on HTP.
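
The core of the change is that Clip's `min`/`max` inputs may now arrive as FP16 initializers, so their raw bytes are reinterpreted as half-precision and widened to float32 before being set on the QNN ReluMinMax parameters (the PR does this via `MLFloat16::ToFloat()` inside `ProcessClipMinMax`). The sketch below illustrates that conversion outside of ONNX Runtime; `HalfBitsToFloat` and the hard-coded initializer bytes are illustrative stand-ins, not code from this PR.

```cpp
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Decode one IEEE 754 binary16 value into a float.
// Handles zeros, subnormals, normals, infinities, and NaN.
float HalfBitsToFloat(uint16_t h) {
  const uint32_t sign = static_cast<uint32_t>(h & 0x8000u) << 16;
  uint32_t exponent = (h >> 10) & 0x1Fu;
  uint32_t mantissa = h & 0x3FFu;
  uint32_t f_bits = 0;

  if (exponent == 0) {
    if (mantissa == 0) {
      f_bits = sign;  // signed zero
    } else {
      // Subnormal half: shift the mantissa up until it gains an implicit leading 1.
      exponent = 113;  // 127 - 15 + 1
      while ((mantissa & 0x400u) == 0) {
        mantissa <<= 1;
        --exponent;
      }
      mantissa &= 0x3FFu;
      f_bits = sign | (exponent << 23) | (mantissa << 13);
    }
  } else if (exponent == 0x1Fu) {
    f_bits = sign | 0x7F800000u | (mantissa << 13);  // infinity or NaN
  } else {
    f_bits = sign | ((exponent + 112) << 23) | (mantissa << 13);  // 112 = 127 - 15
  }

  float f = 0.0f;
  std::memcpy(&f, &f_bits, sizeof(f));
  return f;
}

int main() {
  // Pretend these are the raw bytes of Clip's 'min' initializer, holding the
  // FP16 value 1.2 (0x3CCD, little-endian) -- a hypothetical example value.
  const std::vector<uint8_t> min_val_bytes = {0xCD, 0x3C};

  uint16_t half_bits = 0;
  std::memcpy(&half_bits, min_val_bytes.data(), sizeof(half_bits));

  // Widen to float32, which is what the QNN ReluMinMax min/max scalar parameters expect.
  const float min_value = HalfBitsToFloat(half_bits);
  std::cout << "Clip min as float32: " << min_value << "\n";  // prints ~1.20020
  return 0;
}
```

With this conversion in place, an FP16 min such as 1.2 (0x3CCD) unpacks to roughly 1.2002 instead of being rejected by the float32-only check in the old op builder.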
HectorSVC authored Mar 11, 2024
1 parent 89aa469 commit cba605e
Showing 3 changed files with 262 additions and 40 deletions.
83 changes: 43 additions & 40 deletions onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc
@@ -36,6 +36,27 @@ class ClipOpBuilder : public BaseOpBuilder {
Status ExplictOpCheck(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const;
};

static Status ProcessClipMinMax(QnnModelWrapper& qnn_model_wrapper,
const NodeUnitIODef& input,
float& float_value) {
TensorInfo input_info = {};
std::vector<uint8_t> val_bytes;
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(input, input_info));
assert(input_info.is_initializer); // Checked by ExplicitOpCheck().
if (QNN_DATATYPE_FLOAT_16 == input_info.qnn_data_type) {
ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info.initializer_tensor, val_bytes));
MLFloat16 fp16_value = *reinterpret_cast<const MLFloat16*>(val_bytes.data());
float_value = fp16_value.ToFloat();
} else {
ORT_RETURN_IF_NOT(QNN_DATATYPE_FLOAT_32 == input_info.qnn_data_type,
"QNN EP: The 'min' input of the Clip operator must be of type float32.");
ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info.initializer_tensor, val_bytes));
float_value = *reinterpret_cast<const float*>(val_bytes.data());
}

return Status::OK();
}

Status ClipOpBuilder::ExplictOpCheck(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const {
if (node_unit.Inputs().size() > 1) {
const auto& min_input_name = node_unit.Inputs()[1].node_arg.Name();
@@ -75,54 +96,36 @@ Status ClipOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wra
const Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32;
std::vector<std::string> param_tensor_names;

auto get_f32_from_bytes = [](const std::vector<uint8_t>& bytes, float default_val) -> float {
return bytes.empty() ? default_val : *reinterpret_cast<const float*>(bytes.data());
};

// Set the 'min' parameter.
{
std::vector<uint8_t> min_val_bytes;

if (num_inputs > 1 && !inputs[1].node_arg.Name().empty()) {
TensorInfo min_input_info = {};
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[1], min_input_info));
ORT_RETURN_IF_NOT(min_input_info.qnn_data_type == qnn_data_type,
"QNN EP: The 'min' input of the Clip operator must be of type float32.");
assert(min_input_info.is_initializer); // Checked by ExplicitOpCheck().
ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*min_input_info.initializer_tensor, min_val_bytes));
}
Qnn_Scalar_t min_qnn_scalar = QNN_SCALAR_INIT;
min_qnn_scalar.dataType = qnn_data_type;

Qnn_Scalar_t min_qnn_scalar = QNN_SCALAR_INIT;
min_qnn_scalar.dataType = qnn_data_type;
min_qnn_scalar.floatValue = get_f32_from_bytes(min_val_bytes, std::numeric_limits<float>::lowest());
QnnParamWrapper min_value_param(node_unit.Index(), node_unit.Name(), QNN_OP_RELU_MIN_MAX_PARAM_MIN_VALUE,
min_qnn_scalar);
param_tensor_names.push_back(min_value_param.GetParamTensorName());
qnn_model_wrapper.AddParamWrapper(std::move(min_value_param));
if (num_inputs > 1 && !inputs[1].node_arg.Name().empty()) {
ORT_RETURN_IF_ERROR(ProcessClipMinMax(qnn_model_wrapper, inputs[1], min_qnn_scalar.floatValue));
} else {
min_qnn_scalar.floatValue = std::numeric_limits<float>::lowest();
}

QnnParamWrapper min_value_param(node_unit.Index(), node_unit.Name(), QNN_OP_RELU_MIN_MAX_PARAM_MIN_VALUE,
min_qnn_scalar);
param_tensor_names.push_back(min_value_param.GetParamTensorName());
qnn_model_wrapper.AddParamWrapper(std::move(min_value_param));

// Set the 'max' parameter.
{
std::vector<uint8_t> max_val_bytes;

if (num_inputs > 2 && !inputs[2].node_arg.Name().empty()) {
TensorInfo max_input_info = {};
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[2], max_input_info));
ORT_RETURN_IF_NOT(max_input_info.qnn_data_type == qnn_data_type,
"QNN EP: The 'max' input of the Clip operator must of type float32.");
assert(max_input_info.is_initializer); // Checked by ExplicitOpCheck().
ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*max_input_info.initializer_tensor, max_val_bytes));
}
Qnn_Scalar_t max_qnn_scalar = QNN_SCALAR_INIT;
max_qnn_scalar.dataType = qnn_data_type;

Qnn_Scalar_t max_qnn_scalar = QNN_SCALAR_INIT;
max_qnn_scalar.dataType = qnn_data_type;
max_qnn_scalar.floatValue = get_f32_from_bytes(max_val_bytes, std::numeric_limits<float>::max());
QnnParamWrapper max_value_param(node_unit.Index(), node_unit.Name(), QNN_OP_RELU_MIN_MAX_PARAM_MAX_VALUE,
max_qnn_scalar);
param_tensor_names.push_back(max_value_param.GetParamTensorName());
qnn_model_wrapper.AddParamWrapper(std::move(max_value_param));
if (num_inputs > 2 && !inputs[2].node_arg.Name().empty()) {
ORT_RETURN_IF_ERROR(ProcessClipMinMax(qnn_model_wrapper, inputs[2], max_qnn_scalar.floatValue));
} else {
max_qnn_scalar.floatValue = std::numeric_limits<float>::max();
}

QnnParamWrapper max_value_param(node_unit.Index(), node_unit.Name(), QNN_OP_RELU_MIN_MAX_PARAM_MAX_VALUE,
max_qnn_scalar);
param_tensor_names.push_back(max_value_param.GetParamTensorName());
qnn_model_wrapper.AddParamWrapper(std::move(max_value_param));

ORT_RETURN_IF_ERROR(ProcessOutputs(qnn_model_wrapper, node_unit,
std::move(input_names),
std::move(param_tensor_names),
38 changes: 38 additions & 0 deletions onnxruntime/test/providers/qnn/clip_op_test.cc
@@ -182,6 +182,44 @@ TEST_F(QnnHTPBackendTests, Clip_U8_Rank5) {
ExpectedEPNodeAssignment::All);
}

// Test FP16 Clip with min (FP16)
TEST_F(QnnHTPBackendTests, Clip_FP16) {
ProviderOptions provider_options;

#if defined(_WIN32)
provider_options["backend_path"] = "QnnHtp.dll";
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif

auto f32_input = TestInputDef<float>({1, 3, 2, 2}, false,
{-10.0f, -8.0f, -3.5f, 2.2f,
1.3f, 1.5f, 3.2f, 5.8f,
5.8f, 9.7f, 8.5f, 8.9f});
std::vector<MLFloat16> f16_data;
std::for_each(f32_input.GetRawData().begin(), f32_input.GetRawData().end(),
[&f16_data](const float data) {
f16_data.push_back(static_cast<MLFloat16>(data));
});
auto f16_input = TestInputDef<MLFloat16>({1, 3, 2, 2}, false, f16_data);

const float min_f32 = 1.2f;
const MLFloat16 min_f16 = static_cast<MLFloat16>(min_f32);
auto f32_min_input = TestInputDef<float>({}, true, {min_f32});
auto f16_min_input = TestInputDef<MLFloat16>({}, true, {min_f16});

auto f32_model_builder = BuildOpTestCase<float, float>("Clip", {f32_input}, {f32_min_input}, {});
auto f16_model_builder = BuildOpTestCase<MLFloat16, MLFloat16>("Clip", {f16_input}, {f16_min_input}, {});
int opset = 13;
ExpectedEPNodeAssignment expected_ep_assignment = ExpectedEPNodeAssignment::All;

TestFp16ModelAccuracy(f32_model_builder,
f16_model_builder,
provider_options,
opset,
expected_ep_assignment);
}

#endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
} // namespace test
} // namespace onnxruntime
181 changes: 181 additions & 0 deletions onnxruntime/test/providers/qnn/qnn_test_utils.h
@@ -467,6 +467,187 @@ inline void TestQDQModelAccuracy(const GetTestModelFn& f32_model_fn, const GetTe
}
}

/**
* Tests the accuracy of an FP16 model on QNN EP by running three inferences:
*
* 1. float32 model on CPU EP (baseline)
* 2. FP16 model on CPU EP
* 3. FP16 model on QNN EP
*
* This function checks that running the FP16 model on QNN EP (#3) is at least as accurate (within a small tolerance)
* as running the FP16 model on CPU EP (#2). Accuracy is measured as the relative error against the f32 baseline (#1).
*
* \param f32_model_fn Function that builds the float model (baseline for comparison).
* \param f16_model_fn Function that builds the FP16 model (run by CPU EP and QNN EP).
* \param qnn_options QNN EP provider options.
* \param opset_version The opset version.
* \param expected_ep_assignment Describes "which nodes" should be assigned to the EP.
* \param tolerance The tolerance (as a fraction) by which the relative error of the FP16 model on QNN EP may exceed
*                  the relative error of the FP16 model on CPU EP (both measured against the float32 baseline).
* \param log_severity The logger's severity setting.
*/
inline void TestFp16ModelAccuracy(const GetTestModelFn& f32_model_fn,
const GetTestModelFn& f16_model_fn,
ProviderOptions qnn_options,
int opset_version,
ExpectedEPNodeAssignment expected_ep_assignment,
float tolerance = 0.004,
logging::Severity log_severity = logging::Severity::kERROR,
const std::string& qnn_ctx_model_path = "",
const std::unordered_map<std::string, std::string>& session_option_pairs = {}) {
// Add kMSDomain to cover contrib op like Gelu
const std::unordered_map<std::string, int> domain_to_version = {{"", opset_version}, {kMSDomain, 1}};

auto& logging_manager = DefaultLoggingManager();
logging_manager.SetDefaultLoggerSeverity(log_severity);

// Create float model and serialize it to a string.
onnxruntime::Model f32_model("f32_model", false, ModelMetaData(), PathString(),
IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
logging_manager.DefaultLogger());
ModelTestBuilder f32_helper(f32_model.MainGraph());
std::string f32_model_data;
f32_model_fn(f32_helper);
f32_helper.SetGraphOutputs();
ASSERT_STATUS_OK(f32_model.MainGraph().Resolve());
f32_model.ToProto().SerializeToString(&f32_model_data);

// Run f32 model on CPU EP and collect outputs.
std::vector<OrtValue> cpu_f32_outputs;
InferenceModel(f32_model_data, "f32_model_logger", {}, ExpectedEPNodeAssignment::All,
f32_helper.feeds_, cpu_f32_outputs);
ASSERT_FALSE(cpu_f32_outputs.empty());

const size_t num_outputs = cpu_f32_outputs.size();

// Collect the baseline float32 output values and element types.
std::vector<gsl::span<const float>> output_vals;
std::vector<int32_t> output_types;
output_vals.resize(num_outputs);
output_types.resize(num_outputs);

for (size_t i = 0; i < num_outputs; i++) {
auto& tensor = cpu_f32_outputs[i].Get<Tensor>();
int32_t elem_type = tensor.GetElementType();

if (elem_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
output_vals[i] = tensor.DataAsSpan<float>();
}

output_types[i] = elem_type;
}

// Create FP16 model and serialize it to a string.
onnxruntime::Model f16_model("fp16_model", false, ModelMetaData(), PathString(),
IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
logging_manager.DefaultLogger());
ModelTestBuilder f16_helper(f16_model.MainGraph());
std::string f16_model_data;
f16_model_fn(f16_helper);
f16_helper.SetGraphOutputs();
ASSERT_STATUS_OK(f16_model.MainGraph().Resolve());
f16_model.ToProto().SerializeToString(&f16_model_data);

bool is_qnn_ep = true;
TryEnableQNNSaver(qnn_options);
std::vector<OrtValue> qnn_f16_outputs;
if (!qnn_ctx_model_path.empty()) {
onnx::ModelProto model_proto;
onnxruntime::Model qnn_ctx_model;
// Load the QNN context cache model from path specified
ASSERT_STATUS_OK(qnn_ctx_model.Load(ToPathString(qnn_ctx_model_path), model_proto));
std::string qnn_ctx_model_data;
model_proto.SerializeToString(&qnn_ctx_model_data);
// Run QNN context cache model on QNN EP and collect outputs.
InferenceModel(qnn_ctx_model_data, "qnn_ctx_model_logger", qnn_options,
expected_ep_assignment, f16_helper.feeds_, qnn_f16_outputs, is_qnn_ep, session_option_pairs);
} else {
// Run FP16 model on QNN EP and collect outputs.
// Only need to apply the extra session options to this FP16 model inference on QNN EP
InferenceModel(f16_model_data, "fp16_model_logger", qnn_options, expected_ep_assignment,
f16_helper.feeds_, qnn_f16_outputs, is_qnn_ep, session_option_pairs);
}

if (expected_ep_assignment != ExpectedEPNodeAssignment::None) {
// Run FP16 model on CPU EP and collect outputs.
std::vector<OrtValue> cpu_f16_outputs;
InferenceModel(f16_model_data, "fp16_model_logger", {}, ExpectedEPNodeAssignment::All,
f16_helper.feeds_, cpu_f16_outputs);
ASSERT_EQ(cpu_f16_outputs.size(), num_outputs);
ASSERT_EQ(qnn_f16_outputs.size(), num_outputs);

// Limit the error message count in case a test with large data fails
size_t max_error_count = 10;
size_t error_count = 0;

// Compare accuracy of the FP16 results with the float32 model.
// QNN EP must be at least as accurate as CPU EP when running the FP16 model.
const std::string base_output_name = "output_";
for (size_t i = 0; i < num_outputs; i++) {
std::string debug_output_name = base_output_name + std::to_string(i);
auto& cpu_f16_tensor = cpu_f16_outputs[i].Get<Tensor>();
auto& qnn_f16_tensor = qnn_f16_outputs[i].Get<Tensor>();

ASSERT_EQ(cpu_f16_tensor.GetElementType(), ONNX_NAMESPACE::TensorProto_DataType_FLOAT16);
ASSERT_EQ(qnn_f16_tensor.GetElementType(), ONNX_NAMESPACE::TensorProto_DataType_FLOAT16);
ASSERT_EQ(output_types[i], ONNX_NAMESPACE::TensorProto_DataType_FLOAT);

const size_t num_vals = output_vals[i].size();
gsl::span<const float> cpu_f32_vals = output_vals[i];
gsl::span<const MLFloat16> cpu_f16_vals = cpu_f16_tensor.DataAsSpan<MLFloat16>();
gsl::span<const MLFloat16> qnn_f16_vals = qnn_f16_tensor.DataAsSpan<MLFloat16>();

ASSERT_EQ(num_vals, cpu_f16_vals.size());
ASSERT_EQ(num_vals, qnn_f16_vals.size());

float max_f16_cpu_err = 0.0f;
float max_f16_qnn_err = 0.0f;

for (size_t j = 0; j < num_vals && error_count < max_error_count; j++) {
const float expected_val = cpu_f32_vals[j]; // f32@CPU_EP val ("ground-truth")
const float qnn_f16_val = qnn_f16_vals[j].ToFloat(); // f16@QNN_EP val
const float cpu_f16_val = cpu_f16_vals[j].ToFloat(); // f16@CPU_EP val

// Get errors of f16@CPU_EP and f16@QNN_EP against f32@CPU_EP.
const float cpu_relative_err = std::fabs(expected_val - cpu_f16_val) / expected_val;
const float qnn_relative_err = std::fabs(expected_val - qnn_f16_val) / expected_val;

// Also compare the two FP16 results against each other by taking the absolute
// difference of their relative errors w.r.t. the f32 baseline.
const float f16_vals_err = std::fabs(qnn_relative_err - cpu_relative_err);

// True if f16@QNN_EP is at least as accurate as f16@CPU_EP when compared to expected f32@CPU_EP value.
const bool is_as_accurate_as_cpu_ep = qnn_relative_err <= cpu_relative_err;

// True if the normalized difference between f16@QNN_EP and f16@CPU_EP is within tolerance.
const bool f16_vals_diff_within_tolerance = f16_vals_err <= tolerance;

const bool passed_test = is_as_accurate_as_cpu_ep || f16_vals_diff_within_tolerance;
if (!passed_test) {
++error_count;
}
EXPECT_TRUE(passed_test)
<< "Inaccuracy detected for output '" << debug_output_name
<< "', element " << j << ", tolerance=" << (tolerance * 100) << "%"
<< ".\nExpected val (f32@CPU_EP): " << expected_val << "\n"
<< "f16@QNN_EP val: " << qnn_f16_val << " (err: " << qnn_relative_err << ")\n"
<< "f16@CPU_EP val: " << cpu_f16_val << " (err: " << cpu_relative_err << ")\n";

max_f16_cpu_err = std::max(max_f16_cpu_err, cpu_relative_err);
max_f16_qnn_err = std::max(max_f16_qnn_err, qnn_relative_err);
}

if (error_count > 0) {
std::cerr << std::endl
<< "[WARNING]: Output " << i
<< " required larger tolerance to pass accuracy checks" << std::endl
<< "Max relative error against f32@CPU_EP = " << max_f16_cpu_err << std::endl
<< "Max relative error against f16@CPU_EP = " << max_f16_qnn_err << std::endl;
}
}
}
}

/**
* Creates and returns an input in a test model graph. The input's characteristics are defined
* by the provided input definition.
