From 05fcdce3fa7fe4239aceb5f83eff70ef95679709 Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Sat, 14 Jun 2025 17:26:10 -0700 Subject: [PATCH 1/6] Add pass to perform QDQ stripping and propagate scales --- cmake/onnxruntime_providers_openvino.cmake | 2 +- .../providers/openvino/backend_manager.cc | 14 +- .../providers/openvino/ov_protobuf_utils.cpp | 18 + .../providers/openvino/ov_protobuf_utils.h | 9 + .../qdq_transformations/qdq_scales_fix.cpp | 760 ++++++++++++++++++ .../qdq_transformations/qdq_scales_fix.h | 19 + 6 files changed, 819 insertions(+), 3 deletions(-) create mode 100644 onnxruntime/core/providers/openvino/ov_protobuf_utils.cpp create mode 100644 onnxruntime/core/providers/openvino/ov_protobuf_utils.h create mode 100644 onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp create mode 100644 onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h diff --git a/cmake/onnxruntime_providers_openvino.cmake b/cmake/onnxruntime_providers_openvino.cmake index d7cb2d5ea0d0f..552f4cd3b8988 100644 --- a/cmake/onnxruntime_providers_openvino.cmake +++ b/cmake/onnxruntime_providers_openvino.cmake @@ -49,7 +49,7 @@ endif() add_dependencies(onnxruntime_providers_openvino onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES}) target_include_directories(onnxruntime_providers_openvino SYSTEM PUBLIC ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${OpenVINO_INCLUDE_DIR} ${OPENVINO_INCLUDE_DIR_LIST} ${PYTHON_INCLUDE_DIRS} $ENV{OPENCL_INCS} $ENV{OPENCL_INCS}/../../cl_headers/) - target_link_libraries(onnxruntime_providers_openvino ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 ${OPENVINO_LIB_LIST} ${ABSEIL_LIBS} Eigen3::Eigen) + target_link_libraries(onnxruntime_providers_openvino ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 ${OPENVINO_LIB_LIST} ${ABSEIL_LIBS} Eigen3::Eigen onnx_proto) target_compile_definitions(onnxruntime_providers_openvino PRIVATE FILE_NAME=\"onnxruntime_providers_openvino.dll\") diff --git 
a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 041d9c07e41fe..baff8145902d3 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -20,6 +20,7 @@ #include "core/providers/openvino/ov_interface.h" #include "core/providers/openvino/ov_versions/capability.h" #include "core/providers/openvino/qdq_transformations/qdq_stripping.h" +#include "core/providers/openvino/qdq_transformations/qdq_scales_fix.h" namespace onnxruntime { namespace openvino_ep { @@ -429,8 +430,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, const auto& onnx_model_path_name = subgraph.ModelPath(); // QDQ stripping enabled only for the NPU and experimentally on the GPU - if ((session_context_.device_type.find("NPU") != std::string::npos || - session_context_.device_type.find("GPU") != std::string::npos) && + if ((session_context_.device_type.find("NPU") != std::string::npos) && (enable_ovep_qdq_optimizer || session_context_.so_share_ep_contexts)) { std::unique_ptr model; Status status = CreateModelWithStrippedQDQNodes(subgraph, logger, session_context_.so_share_ep_contexts, enable_ovep_qdq_optimizer, model, shared_context_.shared_weights); @@ -440,6 +440,16 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); return model_proto; + } else if (session_context_.device_type.find("GPU") != std::string::npos) { + // Create a copy of the model + std::unique_ptr model; + Status status = qdq_scales_fix::Transform(subgraph, logger, model); + auto model_proto = model->ToProto(); + model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + print_model_proto_duration(); + DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); + 
ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); + return model_proto; } else { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP QDQ optimization pass is disabled"; auto model = subgraph.CreateModel(logger); diff --git a/onnxruntime/core/providers/openvino/ov_protobuf_utils.cpp b/onnxruntime/core/providers/openvino/ov_protobuf_utils.cpp new file mode 100644 index 0000000000000..73bba4595f790 --- /dev/null +++ b/onnxruntime/core/providers/openvino/ov_protobuf_utils.cpp @@ -0,0 +1,18 @@ +// Copyright (C) Intel Corporation +// Licensed under the MIT License + +#include "ov_protobuf_utils.h" + +#include "core/graph/onnx_protobuf.h" +#include "core/common/common.h" + +namespace onnxruntime { +namespace openvino_ep { +float get_float_initializer_data(const void* initializer) { + const auto *tp = reinterpret_cast(initializer); + ORT_ENFORCE((tp->has_data_type() && (tp->data_type() == ONNX_NAMESPACE::TensorProto_DataType_FLOAT))); + // ORT_ENFORCE(initializer.dims_size() == 1); + return tp->float_data(0); +} +} // namespace openvino_ep +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/openvino/ov_protobuf_utils.h b/onnxruntime/core/providers/openvino/ov_protobuf_utils.h new file mode 100644 index 0000000000000..188f19bf09196 --- /dev/null +++ b/onnxruntime/core/providers/openvino/ov_protobuf_utils.h @@ -0,0 +1,9 @@ +// Copyright (C) Intel Corporation +// Licensed under the MIT License + +#pragma once +namespace onnxruntime { +namespace openvino_ep { +float get_float_initializer_data(const void* initializer); +} +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp new file mode 100644 index 0000000000000..bbe97131cae30 --- /dev/null +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp @@ -0,0 +1,760 @@ +// Copyright (C) Intel Corporation +// 
Licensed under the MIT License + +#include "qdq_scales_fix.h" +#include "core/providers/openvino/ov_protobuf_utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace onnxruntime { +namespace openvino_ep { + +namespace qdq_scales_fix { + +namespace fs = std::filesystem; +using NodeRef = std::reference_wrapper; +struct GraphNode; +float get_initializer_value(const Graph& graph, const std::string& initializer_name); +void remove_node_and_reconnect(Graph& graph, const GraphNode& node_to_remove, NodeIndex next_input); +void generate_graph_from_memory(Graph& graph, fs::path path); + +template +bool contains(V&& begin, V&& end, const T& val) { + for (V& iter = begin; iter != end; iter.operator++()) { + if (iter->Name() == val) { + return true; + } + } + return false; +} + +template +bool contains(const R& vec, const T& val) { + for (auto iter = vec.begin(); iter != vec.end(); iter++) { + if ((*iter)->Name() == val) { + return true; + } + } + return false; +} + +bool contains(const std::vector& container, const std::string& value) { + return std::find(container.begin(), container.end(), value) != container.end(); +} + +struct GraphNode { + GraphNode() = delete; + + template + GraphNode(const N& node, const std::string& op_type = {}) { + node_name = node.Name(); + if constexpr (std::is_same_v) { + node_ptr = &node; + this->op_type = node.OpType(); + for (const auto iter : node.InputDefs()) { + node_input_name.push_back(iter->Name()); + } + for (const auto iter : node.OutputDefs()) { + node_output_name.push_back(iter->Name()); + } + } else { + this->op_type = op_type; + //** node_input_name = [] + //** node_output_name = [] + } + + if (op_type == "output") { + down_to_output = true; + } + } + + bool operator==(const GraphNode&) const = default; + + void add_edge_to(GraphNode& dst_node) { + to_node.push_back(&dst_node); + } + + void add_edge_from(GraphNode& src_node) { + from_node.push_back(&src_node); + } + + std::vector 
apply_scale_to_graph(float scale_adj) { + std::vector affected_dq; + + auto extend = [&affected_dq, scale_adj](const std::vector& new_nodes) { + affected_dq.insert(affected_dq.end(), new_nodes.begin(), new_nodes.end()); + }; + + if (op_type == "DequantizeLinear") { + scale_factor *= scale_adj; + affected_dq.push_back(this); + } else if ((op_type == "Add") || (op_type == "QuantizeLinear")) { + for (auto node : from_node) { + extend(node->apply_scale_to_graph(scale_adj)); + } + } else if (op_type == "Conv") { + // just adjust w&b for conv&mul, stop propagate + for (auto node : from_node) { + if (node->op_type == "DequantizeLinear") { + extend(node->apply_scale_to_graph(scale_adj)); + } + } + } else if ((op_type == "Mul") || (op_type == "MatMul")) { + bool find_dq{false}; + for (auto node : from_node) { + if (node->op_type == "DequantizeLinear" && !find_dq) { + find_dq = true; + extend(node->apply_scale_to_graph(scale_adj)); + } + } + if (!find_dq) { + // cannot scale dq from here, choose input 0 to propagate + extend(from_node.back()->from_node[0]->apply_scale_to_graph(scale_adj)); + } + } else { + ORT_THROW("Unknown case, node: %s", ToString().data()); + } + + return affected_dq; + } + + std::vector down_propagate_scale() { + std::vector affected_nodes; + + if (processed) { + return affected_nodes; + } + + if ((op_type == "InstanceNormalization") || (op_type == "Softmax")) { + // pass + } else if (op_type == "Add") { + auto up_new_nodes = up_propagate_scale(); + affected_nodes.insert(affected_nodes.end(), up_new_nodes.begin(), up_new_nodes.end()); + + for (auto node : to_node) { + auto down_new_nodes = node->down_propagate_scale(); + affected_nodes.insert(affected_nodes.end(), down_new_nodes.begin(), down_new_nodes.end()); + } + } else { + affected_nodes.push_back(this); + processed = true; + + for (auto node : to_node) { + auto new_nodes = node->down_propagate_scale(); + affected_nodes.insert(affected_nodes.end(), new_nodes.begin(), new_nodes.end()); + } + } + 
return affected_nodes; + } + + std::vector up_propagate_scale() { + std::vector affected_nodes; + + if (processed) { + return affected_nodes; + } + + if ((op_type == "InstanceNormalization") || (op_type == "Softmax")) { + ORT_THROW("Cannot propagate up through norm layer"); + } else if (op_type == "Conv") { + affected_nodes.push_back(this); + processed = true; + + for (auto node : from_node) { + if (node->op_type == "DequantizeLinear") { + affected_nodes.push_back(node); + } + } + } else if ((op_type == "Mul") || (op_type == "MatMul")) { + affected_nodes.push_back(this); + processed = true; + bool find_dq{false}; + + for (auto node : from_node) { + if ((node->op_type == "DequantizeLinear") && !find_dq) { + find_dq = true; + affected_nodes.push_back(node); + } + } + if (!find_dq) { + auto new_nodes = from_node.back()->from_node[0]->up_propagate_scale(); + affected_nodes.insert(affected_nodes.end(), new_nodes.begin(), new_nodes.end()); + } + } else { + affected_nodes.push_back(this); + processed = true; + + for (auto node : from_node) { + auto new_nodes = node->up_propagate_scale(); + affected_nodes.insert(affected_nodes.end(), new_nodes.begin(), new_nodes.end()); + } + } + + return affected_nodes; + } + + bool down_propagate_to_output() { + if (down_to_output.has_value()) { + return down_to_output.value(); + } + + bool local_down_to_output{false}; + if (op_type == "output") { + local_down_to_output = true; + } else if ((op_type == "InstanceNormalization") || (op_type == "Softmax")) { + local_down_to_output = false; + } else { + for (auto node : to_node) { + local_down_to_output = local_down_to_output || node->down_propagate_to_output(); + } + } + + down_to_output = local_down_to_output; + return local_down_to_output; + } + + std::string ToString() const { + // auto string = std::format("op={} name={} queued={} visited={} scale_factor={}", + // op_type, + // node_name, + // queued, + // visited, + // scale_factor); + auto print_node_vector = [](std::vector nodes) -> 
std::string { + // auto comp = [](const GraphNode* left, const GraphNode* right) -> bool { + // return left->node_name < right->node_name; + // }; + // std::sort(nodes.begin(), nodes.end(), comp); + std::string ret = "["; + for (size_t i = 0, size = nodes.size(); auto pnode : nodes) { + if (pnode->node_name.size() == 0) continue; + ret += pnode->node_name; + if (++i < size) { + ret += ", "; + } + } + ret += "]"; + return ret; + }; + std::string from_node_str = print_node_vector(from_node); + std::string to_node_str = print_node_vector(to_node); + + auto print_string_vector = [](std::vector nodes) -> std::string { + // std::sort(nodes.begin(), nodes.end()); + std::string ret = "["; + for (size_t i = 0, size = nodes.size(); auto node : nodes) { + ret += node; + if (++i < size) { + ret += ", "; + } + } + ret += "]"; + return ret; + }; + std::string node_input_name_str = print_string_vector(node_input_name); + std::string node_output_name_str = print_string_vector(node_output_name); + + auto print_bool = [](bool val) -> std::string { + return (val) ? "True" : "False"; + }; + + auto print_opt_bool = [print_bool](std::optional val) -> std::string { + return (val.has_value()) ? 
print_bool(val.value()) : "None"; + }; + + auto string = std::format("node_name={} op_type={} scale_factor={:.2f} visited={} queued={} down_to_output={} processed={} from_node={} to_node={} node_input_name={} node_output_name={}", + node_name, + op_type, + scale_factor, + visited, + print_bool(queued), + print_opt_bool(down_to_output), + print_bool(processed), + from_node_str, + to_node_str, + node_input_name_str, + node_output_name_str); + return string; + } + + const Node* node_ptr{nullptr}; + std::string node_name; + std::string op_type; + std::vector node_input_name; + std::vector node_output_name; + std::vector from_node; + std::vector to_node; + float scale_factor{1.f}; + int visited{0}; + bool queued{false}; + std::optional down_to_output; + bool processed{false}; +}; + +struct CustomGraph { + CustomGraph() = delete; + CustomGraph(Graph& graph) : original_graph{graph} {} + + void sort() { + auto comp_node = [](const GraphNode& left, const GraphNode& right) -> bool { + return left.node_name < right.node_name; + }; + nodes.sort(comp_node); + + for (auto& node : nodes) { + auto comp_pnode = [](const GraphNode* left, const GraphNode* right) -> bool { + return left->node_name < right->node_name; + }; + std::sort(node.from_node.begin(), node.from_node.end(), comp_pnode); + std::sort(node.to_node.begin(), node.to_node.end(), comp_pnode); + } + } + + void add_node(const GraphNode& node) { + nodes.push_back(node); + } + + void add_edge(GraphNode& src, GraphNode& dst) { + src.add_edge_to(dst); + dst.add_edge_from(src); + } + + auto get_start_nodes() { + std::list start_nodes; + + for (auto& node : nodes) { + if (node.from_node.empty()) { + start_nodes.push_back(&node); + node.queued = true; + } + } + return start_nodes; + } + + void initailize_search(float threshold = 1.f, bool scale_output = false) { + remove_qdq(threshold, scale_output); + for (auto& node : nodes) { + node.visited = 0; + node.queued = false; + } + } + + void init_propagate() { + for (auto& node : 
nodes) { + node.processed = false; + } + } + + void remove_qdq_pair(const GraphNode& node, std::list& removed) { + auto& q = node; + auto& dq = *node.to_node[0]; + // q only have 1 input + auto& prev = *node.from_node[0]; + + for (auto sup : dq.to_node) { + for (uint32_t index = 0; auto x : sup->from_node) { + if (dq == *x) { + sup->from_node[index] = &prev; + } + index++; + } + if (auto iter = std::find(sup->node_input_name.begin(), sup->node_input_name.end(), dq.node_output_name[0]); iter != sup->node_input_name.end()) { + *iter = prev.node_output_name[0]; + } + } + + prev.to_node = dq.to_node; + + for (auto& output : original_graph.GetOutputs()) { + if (output->Name() == dq.to_node[0]->node_name) { + prev.node_output_name[0] = output->Name(); + } + } + + auto q_iter = std::find(nodes.begin(), nodes.end(), q); + if (q_iter != nodes.end()) { + removed.splice(removed.end(), nodes, q_iter); + } + auto dq_iter = std::find(nodes.begin(), nodes.end(), dq); + if (dq_iter != nodes.end()) { + removed.splice(removed.end(), nodes, dq_iter); + } + + const auto& q_node = *q.node_ptr; + const auto& dq_node = *dq.node_ptr; + ORT_ENFORCE(q_node.GetInputEdgesCount() == 1); // One input to q + ORT_ENFORCE(q_node.GetOutputEdgesCount() == 1); // One q->dq edge + auto in_edge = q_node.InputEdgesBegin(); + + auto remove_edge = [this](const Node& src, const Node& dst, int src_arg, int dst_arg) { + original_graph.RemoveEdge(src.Index(), dst.Index(), src_arg, dst_arg); + }; + + // Remove input edge to q + remove_edge(in_edge->GetNode(), q_node, in_edge->GetSrcArgIndex(), in_edge->GetDstArgIndex()); + + // Remove q edge to dq + remove_edge(q_node, dq_node, 0, 0); + + // Replace all edges from dq to outputs with input to output + for (auto out_edge = dq_node.OutputEdgesBegin(); out_edge != dq_node.OutputEdgesEnd(); out_edge.operator++()) { + // Remove dq edge to output + remove_edge(dq_node, out_edge->GetNode(), out_edge->GetSrcArgIndex(), out_edge->GetDstArgIndex()); + + // Add edge 
input->output + { + auto in_edge_src_index = in_edge->GetNode().Index(); + auto out_edge_dst_index = out_edge->GetNode().Index(); + original_graph.AddEdge(in_edge_src_index, out_edge_dst_index, in_edge->GetSrcArgIndex(), out_edge->GetDstArgIndex()); + } + } + + original_graph.RemoveNode(q_node.Index()); + original_graph.RemoveNode(dq_node.Index()); + } + + std::list remove_qdq(float threshold = 1.f, bool scale_output = false) { + std::list removed; + std::list nodes_copy; + std::for_each(nodes.begin(), nodes.end(), [&nodes_copy](GraphNode& node) { nodes_copy.push_back(&node); }); + + for (auto node : nodes_copy) { + if (std::find(nodes.begin(), nodes.end(), *node) == nodes.end()) { + continue; + } + + if ((node->op_type == "QuantizeLinear") && + (node->to_node[0]->op_type == "DequantizeLinear")) { + if (!scale_output && node->down_propagate_to_output()) { + remove_qdq_pair(*node, removed); + continue; + } + + auto scale_name = node->node_input_name[1]; // Scale + auto scale_value = get_initializer_value(original_graph, scale_name); + if (scale_value / node->scale_factor < threshold) { + remove_qdq_pair(*node, removed); + } + } + } + + // Reconnect graph outputs if disconnected + bool update_outputs{false}; + auto outputs = original_graph.GetOutputs(); + for (auto output : outputs) { + bool found{false}; + for (auto node : original_graph.Nodes()) { + if (contains(node->OutputNodesBegin(), node->OutputNodesEnd(), output->Name())) { + found = true; + break; + } + } + + if (!found) { + // Connect the last valid node to the graph output + for (auto node : std::ranges::reverse_view(original_graph.Nodes())) { + if (!node->OutputDefs().empty()) { + const auto& name = (*node->OutputDefs().begin())->Name(); + auto& node_arg = original_graph.GetOrCreateNodeArg(name, output->TypeAsProto()); + output = &node_arg; + update_outputs = true; + } + } + } + } + + if (update_outputs) { + original_graph.SetOutputs(outputs); + } + + return removed; + } + + void 
dump_custom_graph(fs::path path) { + if (auto file = std::ofstream(path)) { + std::vector node_ref; + for (auto& node : nodes) { + node_ref.emplace_back(&node); + } + + for (const auto& node : node_ref) { + std::string node_str = node->ToString(); + file << node_str << "\n"; + } + } + } + + std::list nodes; + std::list removed_nodes; + Graph& original_graph; +}; + +float get_initializer_value(const Graph& graph, const std::string& initializer_name) { + const auto p_initializer = graph.GetConstantInitializer(initializer_name, false); + + return get_float_initializer_data(p_initializer); +} + +void update_initializer_value(Graph& graph, const std::string& initializer_name, const float new_value) { + const auto p_initializer = graph.GetConstantInitializer(initializer_name, false); + + if (p_initializer == nullptr) { + return; + } + + const auto& initializer = *p_initializer; + + // Verify 1D tensor + ORT_ENFORCE(initializer.dims_size() == 1); + ORT_ENFORCE(initializer.data_type() == onnx::TensorProto_DataType_FLOAT); + + // Create new tensor with updated value + auto new_tensor = onnx::TensorProto::Create(); + new_tensor->copy_from(p_initializer); + *(float*)new_tensor->mutable_raw_data()->data() = new_value; + graph.RemoveInitializedTensor(initializer_name); + graph.AddInitializedTensor(*new_tensor); +} + +void remove_node_and_reconnect(Graph& graph, const GraphNode& node_to_remove, NodeIndex next_input) { + const auto& n2r = *(node_to_remove.node_ptr); + + ORT_ENFORCE(n2r.GetOutputEdgesCount() == 1); + ORT_ENFORCE(n2r.GetInputEdgesCount() == 1); + + auto in_edge = n2r.InputEdgesBegin(); + auto out_edge = n2r.OutputEdgesBegin(); + + // Remove in_edge + { + const auto& src_node = in_edge->GetNode(); + graph.RemoveEdge(src_node.Index(), n2r.Index(), in_edge->GetSrcArgIndex(), in_edge->GetDstArgIndex()); + } + + // Remove out_edge + { + const auto& dst_node = out_edge->GetNode(); + graph.RemoveEdge(n2r.Index(), dst_node.Index(), out_edge->GetSrcArgIndex(), 
out_edge->GetDstArgIndex()); + } + + // Add next_input->out_edge node + { + auto in_edge_src_index = in_edge->GetNode().Index(); + auto out_edge_dst_index = out_edge->GetNode().Index(); + graph.AddEdge(in_edge_src_index, out_edge_dst_index, in_edge->GetSrcArgIndex(), out_edge->GetDstArgIndex()); + } + + graph.RemoveNode(n2r.Index()); +} + +CustomGraph generate_graph_from_onnx(Graph& graph) { + CustomGraph gen_graph{graph}; + + for (auto pnode : graph.Nodes()) { + if (pnode->NodeType() == Node::Type::Fused) continue; + gen_graph.nodes.emplace_back(*pnode); + } + + for (auto& src_node : gen_graph.nodes) { + for (auto& dst_node : gen_graph.nodes) { + if (src_node == dst_node) { + continue; + } + + for (auto& src_output : src_node.node_output_name) { + if (contains(dst_node.node_input_name, src_output)) { + gen_graph.add_edge(src_node, dst_node); + } + } + } + } + + for (auto& input_node : graph.GetInputs()) { + auto& cur_input = gen_graph.nodes.emplace_back(*input_node, "input"); + for (auto& dst_node : gen_graph.nodes) { + for (const auto& dst_output : dst_node.node_input_name) { + if (dst_output == input_node->Name()) { + gen_graph.add_edge(cur_input, dst_node); + } + } + } + } + + for (auto& output_node : graph.GetOutputs()) { + auto& cur_output = gen_graph.nodes.emplace_back(*output_node, "output"); + for (auto& src_node : gen_graph.nodes) { + for (const auto& dst_outputs : src_node.node_output_name) { + if (dst_outputs == output_node->Name()) { + gen_graph.add_edge(src_node, cur_output); + } + } + } + } + + gen_graph.sort(); + return gen_graph; +} + +void scale_graph(CustomGraph& gen_graph, + float threshold = 1.f, + float ratio = 10, + bool scale_output = false) { + gen_graph.initailize_search(threshold, scale_output); + auto q = gen_graph.get_start_nodes(); + auto pred = [](const GraphNode* left, const GraphNode* right) -> bool { + return left->node_name < right->node_name; + }; + q.sort(pred); + auto special_node = std::find_if(gen_graph.nodes.begin(), 
gen_graph.nodes.end(), [](const GraphNode& node) { return node.node_name == "/encoder/down_blocks.3/resnets.0/Add"; }); + while (q.size() > 0) { + auto cur_node = q.front(); + q.pop_front(); + if (cur_node->visited < cur_node->from_node.size()) { + cur_node->queued = false; + } else { + + if (cur_node->op_type == "QuantizeLinear" && + cur_node->to_node[0]->op_type == "DequantizeLinear") { + auto scale_name = *std::next(cur_node->node_input_name.begin()); // Scale + auto scale_value = get_initializer_value(gen_graph.original_graph, scale_name); + + // QDQ pair with scale over 1 + if (scale_value / cur_node->scale_factor > threshold) { + gen_graph.init_propagate(); + // adjust previous op scale to threshold / 10 + auto scale_adj = scale_value / cur_node->scale_factor / threshold * ratio; + + // find related const dq to scale down + auto affected_dq = cur_node->apply_scale_to_graph(scale_adj); + std::vector affected_nodes; + + // then propage to graph to update scale + for (auto& dq : affected_dq) { + auto cur_affected = dq->down_propagate_scale(); + affected_nodes.insert(affected_nodes.end(), cur_affected.begin(), cur_affected.end()); + } + + for (auto& node : affected_nodes) { + bool found = std::find(affected_dq.begin(), affected_dq.end(), node) != affected_dq.end(); + if (!found) { + node->scale_factor *= scale_adj; + } + } + + auto removed_qdq = gen_graph.remove_qdq(threshold, scale_output); + for (auto& qdq : removed_qdq) { + try { + q.remove(&qdq); + } catch (...) 
{ + } + } + + gen_graph.removed_nodes.splice(gen_graph.removed_nodes.end(), removed_qdq); + + cur_node = cur_node->to_node[0]; + } + } + + for (auto dst : cur_node->to_node) { + dst->visited += 1; + if (!dst->queued) { + dst->queued = true; + q.push_back(dst); + } + } + } + } +} + +Status copy_model(const GraphViewer& src_graph_viewer, + const logging::Logger& logger, std::unique_ptr& model) { + // Constructs model from scratch using the metadata in src_graph + model = src_graph_viewer.CreateModel(logger); + const auto& src_graph = src_graph_viewer.GetGraph(); + + // + // Initialize model/graph metadata. + // + auto& dst_graph = model->MainGraph(); + + // Set inputs outputs explicitly to make sure the order is same as the user model. + auto inputs = src_graph.GetInputs(); + auto outputs = src_graph.GetOutputs(); + + InlinedVector dst_graph_inputs; + dst_graph_inputs.reserve(inputs.size()); + for (auto& input : inputs) { + auto input_arg = src_graph.GetNodeArg(input->Name()); + auto& dst_graph_input_arg = dst_graph.GetOrCreateNodeArg(input_arg->Name(), input_arg->TypeAsProto()); + dst_graph_inputs.push_back(&dst_graph_input_arg); + } + + InlinedVector dst_graph_outputs; + dst_graph_outputs.reserve(outputs.size()); + for (auto& output : outputs) { + auto output_arg = src_graph.GetNodeArg(output->Name()); + auto& dst_graph_output_arg = dst_graph.GetOrCreateNodeArg(output_arg->Name(), output_arg->TypeAsProto()); + dst_graph_outputs.push_back(&dst_graph_output_arg); + } + + dst_graph.SetInputs(dst_graph_inputs); + dst_graph.SetOutputs(dst_graph_outputs); + dst_graph.SetName(src_graph.Name()); + + // Mark outer scope NodeArgs + for (const auto& name : src_graph_viewer.GetOuterScopeNodeArgNames()) { + auto* node_arg = src_graph.GetNodeArg(name); + ORT_RETURN_IF_NOT(node_arg != nullptr, "Outer scope node arg name '" + name + "'was added but does not exist. 
"); + dst_graph.AddOuterScopeNodeArg(name); + } + + // Add nodes + for (auto pnode : src_graph.Nodes()) { + if (pnode->NodeType() == Node::Type::Fused) continue; + dst_graph.AddNode(*pnode); + } + + // Handle constant initializers + for (auto& [name, tensor_proto] : src_graph.GetAllInitializedTensors()) { + dst_graph.AddInitializedTensor(*tensor_proto); + } + for (const auto node_arg : src_graph.GetInputsIncludingInitializers()) { + // Skip non initializer + auto check_inputs = [node_arg](const NodeArg* input_node_arg) { return input_node_arg->Name() == node_arg->Name(); }; + if (std::find_if(dst_graph_inputs.begin(), dst_graph_inputs.end(), check_inputs) != dst_graph_inputs.end()) continue; + + // Add initializer + const auto src_tensor_proto = src_graph.GetConstantInitializer(node_arg->Name(), true); + auto dst_tensor_proto = onnx::TensorProto::Create(); + dst_tensor_proto->copy_from(src_tensor_proto); + dst_graph.AddInitializedTensor(*dst_tensor_proto); + } + + // Validate graph, remove unnecessary initializers, and run type/shape inference. 
+ ORT_RETURN_IF_ERROR(dst_graph.Resolve()); + + return Status::OK(); +} + +Status Transform(const GraphViewer& src_graph_viewer, + const logging::Logger& logger, + /*out*/ std::unique_ptr& model) { + auto status = copy_model(src_graph_viewer, logger, model); + auto g = generate_graph_from_onnx(model->MainGraph()); + + float threshold{1.f}; + float ratio{10.f}; + bool scale_output{false}; + scale_graph(g, threshold, ratio, scale_output); + return status; +} +} // namespace qdq_scales_fix +} // namespace openvino_ep +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h new file mode 100644 index 0000000000000..c54c531e1bd40 --- /dev/null +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h @@ -0,0 +1,19 @@ +// Copyright (C) Intel Corporation +// Licensed under the MIT License + +#pragma once + +#include "core/providers/shared_library/provider_api.h" + +namespace onnxruntime { +class GraphViewer; + +namespace openvino_ep { + +namespace qdq_scales_fix { +Status Transform(const GraphViewer& src_graph, + const logging::Logger& logger, + /*out*/ std::unique_ptr& model); +} +} // namespace openvino_ep +} // namespace onnxruntime From 19a5e7bb2787891ff6b11c59dde80a599cc2c7f7 Mon Sep 17 00:00:00 2001 From: "Javier E. 
Martinez" Date: Mon, 16 Jun 2025 16:06:25 -0700 Subject: [PATCH 2/6] Fix disconnected output node --- .../qdq_transformations/qdq_scales_fix.cpp | 54 +++++++++++++++---- 1 file changed, 43 insertions(+), 11 deletions(-) diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp index bbe97131cae30..aac7788e5440f 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp @@ -23,7 +23,6 @@ using NodeRef = std::reference_wrapper; struct GraphNode; float get_initializer_value(const Graph& graph, const std::string& initializer_name); void remove_node_and_reconnect(Graph& graph, const GraphNode& node_to_remove, NodeIndex next_input); -void generate_graph_from_memory(Graph& graph, fs::path path); template bool contains(V&& begin, V&& end, const T& val) { @@ -389,6 +388,7 @@ struct CustomGraph { const auto& q_node = *q.node_ptr; const auto& dq_node = *dq.node_ptr; + const auto& prev_node = *prev.node_ptr; ORT_ENFORCE(q_node.GetInputEdgesCount() == 1); // One input to q ORT_ENFORCE(q_node.GetOutputEdgesCount() == 1); // One q->dq edge auto in_edge = q_node.InputEdgesBegin(); @@ -404,16 +404,49 @@ struct CustomGraph { remove_edge(q_node, dq_node, 0, 0); // Replace all edges from dq to outputs with input to output - for (auto out_edge = dq_node.OutputEdgesBegin(); out_edge != dq_node.OutputEdgesEnd(); out_edge.operator++()) { - // Remove dq edge to output - remove_edge(dq_node, out_edge->GetNode(), out_edge->GetSrcArgIndex(), out_edge->GetDstArgIndex()); - - // Add edge input->output - { - auto in_edge_src_index = in_edge->GetNode().Index(); - auto out_edge_dst_index = out_edge->GetNode().Index(); - original_graph.AddEdge(in_edge_src_index, out_edge_dst_index, in_edge->GetSrcArgIndex(), out_edge->GetDstArgIndex()); + if (dq_node.GetOutputEdgesCount() > 0) { + for 
(auto out_edge = dq_node.OutputEdgesBegin(); out_edge != dq_node.OutputEdgesEnd(); out_edge.operator++()) { + // Remove dq edge to output + remove_edge(dq_node, out_edge->GetNode(), out_edge->GetSrcArgIndex(), out_edge->GetDstArgIndex()); + + // Add edge input->output + { + auto in_edge_src_index = in_edge->GetNode().Index(); + auto out_edge_dst_index = out_edge->GetNode().Index(); + original_graph.AddEdge(in_edge_src_index, out_edge_dst_index, in_edge->GetSrcArgIndex(), out_edge->GetDstArgIndex()); + } } + } else { + // Copy input/output defs + std::vector prev_input_defs(prev_node.InputDefs().size()); + std::vector prev_output_defs(prev_node.OutputDefs().size()); + auto transform_f = [this](const NodeArg* iter) { return &original_graph.GetOrCreateNodeArg(iter->Name(), iter->TypeAsProto()); }; + auto fill_vectors = [transform_f](const auto& src, auto& dst) { + std::transform(src.begin(), src.end(), dst.begin(), transform_f); + }; + fill_vectors(prev_node.InputDefs(), prev_input_defs); + fill_vectors(prev_node.OutputDefs(), prev_output_defs); + + // Update def corresponding to DQ output + ORT_ENFORCE(dq_node.OutputDefs().size() == 1); // One dq->output + auto dq_output_def = dq_node.OutputDefs()[0]; + prev_output_defs[in_edge->GetSrcArgIndex()] = &original_graph.GetOrCreateNodeArg(dq_output_def->Name(), dq_output_def->TypeAsProto()); + + // Get attributes + auto attributes = NodeAttributes::Create(); + *attributes = prev_node.GetAttributes(); + + // Add new input node + original_graph.AddNode(prev_node.Name(), + prev_node.OpType(), + prev_node.Description(), + prev_input_defs, + prev_output_defs, + std::move(*attributes.release()), + prev_node.Domain()); + + // Remove original input node + original_graph.RemoveNode(prev_node.Index()); } original_graph.RemoveNode(q_node.Index()); @@ -619,7 +652,6 @@ void scale_graph(CustomGraph& gen_graph, if (cur_node->visited < cur_node->from_node.size()) { cur_node->queued = false; } else { - if (cur_node->op_type == 
"QuantizeLinear" && cur_node->to_node[0]->op_type == "DequantizeLinear") { auto scale_name = *std::next(cur_node->node_input_name.begin()); // Scale From 66587aa3199767236dbf5d89834c8d8e0f122542 Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Mon, 23 Jun 2025 13:59:52 -0700 Subject: [PATCH 3/6] Fixes to support session.disable_quant_qdq output, remove dangling nodes and duplicate DQ nodes --- .../providers/openvino/backend_manager.cc | 3 +- .../qdq_transformations/qdq_scales_fix.cpp | 331 ++++++++++++------ 2 files changed, 229 insertions(+), 105 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index baff8145902d3..253bae3d92a36 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -440,7 +440,8 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); return model_proto; - } else if (session_context_.device_type.find("GPU") != std::string::npos) { + } else if ((session_context_.device_type.find("GPU") != std::string::npos) && + enable_ovep_qdq_optimizer) { // Create a copy of the model std::unique_ptr model; Status status = qdq_scales_fix::Transform(subgraph, logger, model); diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp index aac7788e5440f..fc8336e474399 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp @@ -248,7 +248,7 @@ struct GraphNode { auto print_string_vector = [](std::vector nodes) -> std::string { // std::sort(nodes.begin(), nodes.end()); std::string ret = "["; - for (size_t i = 0, size = 
nodes.size(); auto node : nodes) { + for (size_t i = 0, size = nodes.size(); const auto& node : nodes) { ret += node; if (++i < size) { ret += ", "; @@ -353,111 +353,147 @@ struct CustomGraph { void remove_qdq_pair(const GraphNode& node, std::list& removed) { auto& q = node; - auto& dq = *node.to_node[0]; - // q only have 1 input - auto& prev = *node.from_node[0]; + InlinedVector dq_ptrs; - for (auto sup : dq.to_node) { - for (uint32_t index = 0; auto x : sup->from_node) { - if (dq == *x) { - sup->from_node[index] = &prev; - } - index++; + for (auto& child : q.to_node) { + if (child->node_ptr && child->node_ptr->OpType() == "DequantizeLinear") { + dq_ptrs.push_back(child); } - if (auto iter = std::find(sup->node_input_name.begin(), sup->node_input_name.end(), dq.node_output_name[0]); iter != sup->node_input_name.end()) { - *iter = prev.node_output_name[0]; + } + + if (dq_ptrs.empty()) { + return; + } + + for (std::size_t i = 1; i < dq_ptrs.size(); ++i) { + if (dq_ptrs[i]->node_input_name[1] != dq_ptrs[0]->node_input_name[1] || + dq_ptrs[i]->node_input_name[2] != dq_ptrs[0]->node_input_name[2]) { + return; } } - prev.to_node = dq.to_node; + auto& prev = *node.from_node[0]; + const auto& q_node = *q.node_ptr; + + bool is_prev_input = (prev.node_ptr == nullptr); + std::string prev_output_name = is_prev_input ? 
prev.node_name : prev.node_output_name[0]; + + InlinedVector> output_replacements; + for (auto dq_ptr : dq_ptrs) { + for (auto dst_node : dq_ptr->to_node) { + for (auto& scr_node : dst_node->from_node) { + if (*dq_ptr == *scr_node) { + scr_node = &prev; + } + } - for (auto& output : original_graph.GetOutputs()) { - if (output->Name() == dq.to_node[0]->node_name) { - prev.node_output_name[0] = output->Name(); + auto it = std::find(dst_node->node_input_name.begin(), dst_node->node_input_name.end(), dq_ptr->node_output_name[0]); + if (it != dst_node->node_input_name.end()) { + *it = prev_output_name; + } + } + for (auto& output : original_graph.GetOutputs()) { + if (output->Name() == dq_ptr->node_output_name[0]) { + const NodeArg* replacement_arg = nullptr; + if (!is_prev_input) { + replacement_arg = prev.node_ptr->OutputDefs()[0]; + } else { + replacement_arg = original_graph.GetNodeArg(prev.node_name); + ORT_ENFORCE(replacement_arg != nullptr, "Input not found: " + prev.node_name); + } + output_replacements.emplace_back(output, replacement_arg); + } } } + prev.to_node.erase(std::remove(prev.to_node.begin(), prev.to_node.end(), &q), prev.to_node.end()); + for (auto dq_ptr : dq_ptrs) { + for (auto dst_node : dq_ptr->to_node) { + auto it = std::find(prev.to_node.begin(), prev.to_node.end(), dst_node); + if (it == prev.to_node.end()) { + prev.to_node.push_back(dst_node); + } + } + } auto q_iter = std::find(nodes.begin(), nodes.end(), q); if (q_iter != nodes.end()) { removed.splice(removed.end(), nodes, q_iter); } - auto dq_iter = std::find(nodes.begin(), nodes.end(), dq); - if (dq_iter != nodes.end()) { - removed.splice(removed.end(), nodes, dq_iter); - } - const auto& q_node = *q.node_ptr; - const auto& dq_node = *dq.node_ptr; - const auto& prev_node = *prev.node_ptr; - ORT_ENFORCE(q_node.GetInputEdgesCount() == 1); // One input to q - ORT_ENFORCE(q_node.GetOutputEdgesCount() == 1); // One q->dq edge - auto in_edge = q_node.InputEdgesBegin(); + for (auto dq_ptr : 
dq_ptrs) { + auto dq_iter = std::find(nodes.begin(), nodes.end(), *dq_ptr); + if (dq_iter != nodes.end()) { + removed.splice(removed.end(), nodes, dq_iter); + } + } auto remove_edge = [this](const Node& src, const Node& dst, int src_arg, int dst_arg) { original_graph.RemoveEdge(src.Index(), dst.Index(), src_arg, dst_arg); }; - // Remove input edge to q - remove_edge(in_edge->GetNode(), q_node, in_edge->GetSrcArgIndex(), in_edge->GetDstArgIndex()); - - // Remove q edge to dq - remove_edge(q_node, dq_node, 0, 0); + auto in_edge = q_node.InputEdgesBegin(); + ORT_ENFORCE(in_edge != q_node.InputEdgesEnd(), "Q node must have an input edge"); + const int prev_output_index = in_edge->GetSrcArgIndex(); - // Replace all edges from dq to outputs with input to output - if (dq_node.GetOutputEdgesCount() > 0) { - for (auto out_edge = dq_node.OutputEdgesBegin(); out_edge != dq_node.OutputEdgesEnd(); out_edge.operator++()) { - // Remove dq edge to output - remove_edge(dq_node, out_edge->GetNode(), out_edge->GetSrcArgIndex(), out_edge->GetDstArgIndex()); + if (in_edge != q_node.InputEdgesEnd()) { + remove_edge(in_edge->GetNode(), q_node, + in_edge->GetSrcArgIndex(), in_edge->GetDstArgIndex()); + } + for (auto dq_ptr : dq_ptrs) { + auto& dq_node_ref = *dq_ptr->node_ptr; - // Add edge input->output - { - auto in_edge_src_index = in_edge->GetNode().Index(); - auto out_edge_dst_index = out_edge->GetNode().Index(); - original_graph.AddEdge(in_edge_src_index, out_edge_dst_index, in_edge->GetSrcArgIndex(), out_edge->GetDstArgIndex()); + for (auto edge_it = dq_node_ref.InputEdgesBegin(); edge_it != dq_node_ref.InputEdgesEnd(); ++edge_it) { + if (edge_it->GetNode().Index() == q_node.Index()) { + remove_edge(edge_it->GetNode(), dq_node_ref, edge_it->GetSrcArgIndex(), edge_it->GetDstArgIndex()); + break; } } - } else { - // Copy input/output defs - std::vector prev_input_defs(prev_node.InputDefs().size()); - std::vector prev_output_defs(prev_node.OutputDefs().size()); - auto transform_f = 
[this](const NodeArg* iter) { return &original_graph.GetOrCreateNodeArg(iter->Name(), iter->TypeAsProto()); }; - auto fill_vectors = [transform_f](const auto& src, auto& dst) { - std::transform(src.begin(), src.end(), dst.begin(), transform_f); - }; - fill_vectors(prev_node.InputDefs(), prev_input_defs); - fill_vectors(prev_node.OutputDefs(), prev_output_defs); - // Update def corresponding to DQ output - ORT_ENFORCE(dq_node.OutputDefs().size() == 1); // One dq->output - auto dq_output_def = dq_node.OutputDefs()[0]; - prev_output_defs[in_edge->GetSrcArgIndex()] = &original_graph.GetOrCreateNodeArg(dq_output_def->Name(), dq_output_def->TypeAsProto()); + std::vector> output_edges; // (dst_node_index, src_arg, dst_arg) + for (auto out_edge_it = dq_node_ref.OutputEdgesBegin(); out_edge_it != dq_node_ref.OutputEdgesEnd(); ++out_edge_it) { + output_edges.emplace_back(out_edge_it->GetNode().Index(), + out_edge_it->GetSrcArgIndex(), + out_edge_it->GetDstArgIndex()); + } - // Get attributes - auto attributes = NodeAttributes::Create(); - *attributes = prev_node.GetAttributes(); + for (const auto& edge : output_edges) { + original_graph.RemoveEdge(dq_node_ref.Index(), std::get<0>(edge), + std::get<1>(edge), std::get<2>(edge)); + } - // Add new input node - original_graph.AddNode(prev_node.Name(), - prev_node.OpType(), - prev_node.Description(), - prev_input_defs, - prev_output_defs, - std::move(*attributes.release()), - prev_node.Domain()); + if (!is_prev_input) { + for (const auto& edge : output_edges) { + original_graph.AddEdge(prev.node_ptr->Index(), + std::get<0>(edge), + prev_output_index, + std::get<2>(edge)); + } + } + } - // Remove original input node - original_graph.RemoveNode(prev_node.Index()); + if (!output_replacements.empty()) { + auto outputs = original_graph.GetOutputs(); + for (auto& output : outputs) { + for (const auto& replacement : output_replacements) { + if (output == replacement.first) { + output = replacement.second; + break; + } + } + } + 
original_graph.SetOutputs(outputs); } original_graph.RemoveNode(q_node.Index()); - original_graph.RemoveNode(dq_node.Index()); + for (auto dq_ptr : dq_ptrs) { + original_graph.RemoveNode(dq_ptr->node_ptr->Index()); + } } std::list remove_qdq(float threshold = 1.f, bool scale_output = false) { std::list removed; - std::list nodes_copy; + std::vector nodes_copy; std::for_each(nodes.begin(), nodes.end(), [&nodes_copy](GraphNode& node) { nodes_copy.push_back(&node); }); - for (auto node : nodes_copy) { if (std::find(nodes.begin(), nodes.end(), *node) == nodes.end()) { continue; @@ -465,7 +501,13 @@ struct CustomGraph { if ((node->op_type == "QuantizeLinear") && (node->to_node[0]->op_type == "DequantizeLinear")) { - if (!scale_output && node->down_propagate_to_output()) { + const auto& zero_point_name = node->node_input_name[2]; + const auto p_initializer = original_graph.GetConstantInitializer(zero_point_name, false); + bool is_16_bit = p_initializer->has_data_type() && + (p_initializer->data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT16 || + p_initializer->data_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT16); + + if (!scale_output && node->down_propagate_to_output() && is_16_bit) { remove_qdq_pair(*node, removed); continue; } @@ -707,71 +749,152 @@ void scale_graph(CustomGraph& gen_graph, Status copy_model(const GraphViewer& src_graph_viewer, const logging::Logger& logger, std::unique_ptr& model) { - // Constructs model from scratch using the metadata in src_graph model = src_graph_viewer.CreateModel(logger); const auto& src_graph = src_graph_viewer.GetGraph(); - - // - // Initialize model/graph metadata. - // auto& dst_graph = model->MainGraph(); - // Set inputs outputs explicitly to make sure the order is same as the user model. 
- auto inputs = src_graph.GetInputs(); - auto outputs = src_graph.GetOutputs(); + const auto& inputs = src_graph.GetInputs(); + const auto& outputs = src_graph.GetOutputs(); + + struct InputReplacement { + NodeArg* graph_input; + NodeArg* identity_output; + }; + std::unordered_map input_replacement_map; + + struct OutputReplacement { + NodeArg* intermediate_arg; + NodeArg* original_output; + }; + std::unordered_map output_replacement_map; InlinedVector dst_graph_inputs; dst_graph_inputs.reserve(inputs.size()); for (auto& input : inputs) { - auto input_arg = src_graph.GetNodeArg(input->Name()); - auto& dst_graph_input_arg = dst_graph.GetOrCreateNodeArg(input_arg->Name(), input_arg->TypeAsProto()); - dst_graph_inputs.push_back(&dst_graph_input_arg); + const auto& input_name = input->Name(); + auto input_arg = src_graph.GetNodeArg(input_name); + + auto& dst_input_arg = dst_graph.GetOrCreateNodeArg(input_name, input_arg->TypeAsProto()); + dst_graph_inputs.push_back(&dst_input_arg); + + auto output_name = input_name + "_identity_output"; + auto& identity_output_arg = dst_graph.GetOrCreateNodeArg(output_name, input_arg->TypeAsProto()); + + input_replacement_map[input_name] = {&dst_input_arg, &identity_output_arg}; } InlinedVector dst_graph_outputs; - dst_graph_outputs.reserve(outputs.size()); for (auto& output : outputs) { - auto output_arg = src_graph.GetNodeArg(output->Name()); - auto& dst_graph_output_arg = dst_graph.GetOrCreateNodeArg(output_arg->Name(), output_arg->TypeAsProto()); - dst_graph_outputs.push_back(&dst_graph_output_arg); + const auto& output_name = output->Name(); + auto output_arg = src_graph.GetNodeArg(output_name); + + std::string intermediate_name = "tmp_" + output_name; + auto& intermediate_out = dst_graph.GetOrCreateNodeArg(intermediate_name, output_arg->TypeAsProto()); + + auto& original_out = dst_graph.GetOrCreateNodeArg(output_name, output_arg->TypeAsProto()); + + output_replacement_map[output_name] = {&intermediate_out, &original_out}; + 
dst_graph_outputs.push_back(&original_out); } dst_graph.SetInputs(dst_graph_inputs); dst_graph.SetOutputs(dst_graph_outputs); dst_graph.SetName(src_graph.Name()); - // Mark outer scope NodeArgs for (const auto& name : src_graph_viewer.GetOuterScopeNodeArgNames()) { - auto* node_arg = src_graph.GetNodeArg(name); + auto node_arg = src_graph.GetNodeArg(name); ORT_RETURN_IF_NOT(node_arg != nullptr, "Outer scope node arg name '" + name + "'was added but does not exist. "); dst_graph.AddOuterScopeNodeArg(name); } - // Add nodes + for (auto& input : inputs) { + const auto& input_name = input->Name(); + auto it = input_replacement_map.find(input_name); + ORT_RETURN_IF_NOT(it != input_replacement_map.end(), "Missing replacement for input: " + input_name); + + InputReplacement& repl = it->second; + InlinedVector input_args = {repl.graph_input}; + InlinedVector output_args = {repl.identity_output}; + + std::string node_name = "IdentityInsertion_" + input_name; + dst_graph.AddNode(node_name, "Identity", "Inserted identity node", + input_args, output_args, + nullptr, ""); + } + for (auto pnode : src_graph.Nodes()) { if (pnode->NodeType() == Node::Type::Fused) continue; - dst_graph.AddNode(*pnode); + + InlinedVector new_input_args; + for (auto input_arg : pnode->InputDefs()) { + if (!input_arg) { + new_input_args.push_back(nullptr); + continue; + } + + auto it = input_replacement_map.find(input_arg->Name()); + if (it != input_replacement_map.end()) { + new_input_args.push_back(it->second.identity_output); + } else { + auto& new_arg = dst_graph.GetOrCreateNodeArg(input_arg->Name(), input_arg->TypeAsProto()); + new_input_args.push_back(&new_arg); + } + } + InlinedVector new_output_args; + for (auto output_arg : pnode->OutputDefs()) { + if (output_arg == nullptr) { + new_output_args.push_back(nullptr); + continue; + } + + auto it_output = output_replacement_map.find(output_arg->Name()); + if (it_output != output_replacement_map.end()) { + 
new_output_args.push_back(it_output->second.intermediate_arg); + } else { + auto& new_arg = dst_graph.GetOrCreateNodeArg(output_arg->Name(), output_arg->TypeAsProto()); + new_output_args.push_back(&new_arg); + } + } + + dst_graph.AddNode(pnode->Name(), pnode->OpType(), pnode->Description(), + new_input_args, new_output_args, + &pnode->GetAttributes(), pnode->Domain()); + } + + for (auto& output : outputs) { + const std::string& output_name = output->Name(); + auto it = output_replacement_map.find(output_name); + if (it == output_replacement_map.end()) continue; + + OutputReplacement& repl = it->second; + InlinedVector input_args = {repl.intermediate_arg}; + InlinedVector output_args = {repl.original_output}; + + std::string node_name = "IdentityInsertion_" + output_name; + dst_graph.AddNode(node_name, "Identity", "Inserted identitynode", + input_args, output_args, nullptr, ""); } - // Handle constant initializers for (auto& [name, tensor_proto] : src_graph.GetAllInitializedTensors()) { dst_graph.AddInitializedTensor(*tensor_proto); } - for (const auto node_arg : src_graph.GetInputsIncludingInitializers()) { - // Skip non initializer - auto check_inputs = [node_arg](const NodeArg* input_node_arg) { return input_node_arg->Name() == node_arg->Name(); }; - if (std::find_if(dst_graph_inputs.begin(), dst_graph_inputs.end(), check_inputs) != dst_graph_inputs.end()) continue; - // Add initializer - const auto src_tensor_proto = src_graph.GetConstantInitializer(node_arg->Name(), true); - auto dst_tensor_proto = onnx::TensorProto::Create(); - dst_tensor_proto->copy_from(src_tensor_proto); - dst_graph.AddInitializedTensor(*dst_tensor_proto); + for (auto node_arg : src_graph.GetInputsIncludingInitializers()) { + auto check_inputs = [node_arg](auto input_node_arg) { + return input_node_arg->Name() == node_arg->Name(); + }; + if (std::find_if(dst_graph_inputs.begin(), dst_graph_inputs.end(), check_inputs) != dst_graph_inputs.end()) + continue; + + auto src_tensor_proto = 
src_graph.GetConstantInitializer(node_arg->Name(), true); + if (src_tensor_proto) { + auto dst_tensor_proto = onnx::TensorProto::Create(); + dst_tensor_proto->copy_from(src_tensor_proto); + dst_graph.AddInitializedTensor(*dst_tensor_proto); + } } - // Validate graph, remove unnecessary initializers, and run type/shape inference. ORT_RETURN_IF_ERROR(dst_graph.Resolve()); - return Status::OK(); } @@ -789,4 +912,4 @@ Status Transform(const GraphViewer& src_graph_viewer, } } // namespace qdq_scales_fix } // namespace openvino_ep -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime From 304a8d214ef05e0fe806ce9cbd1563083991960b Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Tue, 24 Jun 2025 09:34:41 -0700 Subject: [PATCH 4/6] Fix lack of scales updates and remove stray QDQ nodes in certain models --- .../providers/openvino/ov_protobuf_utils.cpp | 10 ++- .../providers/openvino/ov_protobuf_utils.h | 3 +- .../qdq_transformations/qdq_scales_fix.cpp | 65 +++++++++++++++++-- 3 files changed, 70 insertions(+), 8 deletions(-) diff --git a/onnxruntime/core/providers/openvino/ov_protobuf_utils.cpp b/onnxruntime/core/providers/openvino/ov_protobuf_utils.cpp index 73bba4595f790..e28330e0bd433 100644 --- a/onnxruntime/core/providers/openvino/ov_protobuf_utils.cpp +++ b/onnxruntime/core/providers/openvino/ov_protobuf_utils.cpp @@ -9,10 +9,16 @@ namespace onnxruntime { namespace openvino_ep { float get_float_initializer_data(const void* initializer) { - const auto *tp = reinterpret_cast(initializer); + const auto* tp = reinterpret_cast(initializer); ORT_ENFORCE((tp->has_data_type() && (tp->data_type() == ONNX_NAMESPACE::TensorProto_DataType_FLOAT))); // ORT_ENFORCE(initializer.dims_size() == 1); return tp->float_data(0); } +void set_float_initializer_data(const void* initializer, float data) { + auto* tp = (ONNX_NAMESPACE::TensorProto*)(initializer); + ORT_ENFORCE((tp->has_data_type() && (tp->data_type() == 
ONNX_NAMESPACE::TensorProto_DataType_FLOAT))); + // ORT_ENFORCE(initializer.dims_size() == 1); + tp->set_float_data(0, data); +} } // namespace openvino_ep -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/ov_protobuf_utils.h b/onnxruntime/core/providers/openvino/ov_protobuf_utils.h index 188f19bf09196..2a6d914ee2920 100644 --- a/onnxruntime/core/providers/openvino/ov_protobuf_utils.h +++ b/onnxruntime/core/providers/openvino/ov_protobuf_utils.h @@ -5,5 +5,6 @@ namespace onnxruntime { namespace openvino_ep { float get_float_initializer_data(const void* initializer); +void set_float_initializer_data(const void* initializer, float data); } -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp index fc8336e474399..9c1af1ec3517a 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp @@ -577,6 +577,32 @@ float get_initializer_value(const Graph& graph, const std::string& initializer_n return get_float_initializer_data(p_initializer); } +template +T* get_mutable_initializer_data(Graph& graph, const std::string& name) { + auto initializer = graph.GetConstantInitializer(name, true); + if (!initializer) return nullptr; + + if constexpr (std::is_same_v) { + if (initializer->data_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT) + return nullptr; + } + + return reinterpret_cast(const_cast(initializer->raw_data().data())); +} + +std::size_t get_initializer_size(const Graph& graph, const std::string& name) { + auto initializer = graph.GetConstantInitializer(name, true); + if (!initializer) return 0; + + std::size_t size = 1; + if (!initializer->dims_size()) + return 0; + for (int i = 0; i < 
initializer->dims_size(); ++i) { + size *= initializer->dims()[i]; + } + return size; +} + void update_initializer_value(Graph& graph, const std::string& initializer_name, const float new_value) { const auto p_initializer = graph.GetConstantInitializer(initializer_name, false); @@ -677,18 +703,19 @@ CustomGraph generate_graph_from_onnx(Graph& graph) { return gen_graph; } -void scale_graph(CustomGraph& gen_graph, +bool scale_graph(CustomGraph& gen_graph, float threshold = 1.f, float ratio = 10, bool scale_output = false) { + bool needs_second_run = false; gen_graph.initailize_search(threshold, scale_output); auto q = gen_graph.get_start_nodes(); auto pred = [](const GraphNode* left, const GraphNode* right) -> bool { return left->node_name < right->node_name; }; q.sort(pred); - auto special_node = std::find_if(gen_graph.nodes.begin(), gen_graph.nodes.end(), [](const GraphNode& node) { return node.node_name == "/encoder/down_blocks.3/resnets.0/Add"; }); - while (q.size() > 0) { + + while (!q.empty()) { auto cur_node = q.front(); q.pop_front(); if (cur_node->visited < cur_node->from_node.size()) { @@ -696,7 +723,8 @@ void scale_graph(CustomGraph& gen_graph, } else { if (cur_node->op_type == "QuantizeLinear" && cur_node->to_node[0]->op_type == "DequantizeLinear") { - auto scale_name = *std::next(cur_node->node_input_name.begin()); // Scale + needs_second_run = true; + auto scale_name = *std::next(cur_node->node_input_name.begin()); auto scale_value = get_initializer_value(gen_graph.original_graph, scale_name); // QDQ pair with scale over 1 @@ -745,8 +773,33 @@ void scale_graph(CustomGraph& gen_graph, } } } + + for (auto& node : gen_graph.nodes) { + if (node.op_type == "DequantizeLinear" && node.scale_factor != 1.0f) { + const auto& scale_name = node.node_input_name[1]; + + auto scale_data = get_mutable_initializer_data(gen_graph.original_graph, scale_name); + if (scale_data) { + const auto scale_size = get_initializer_size(gen_graph.original_graph, scale_name); + if 
(!scale_size) { + auto it = gen_graph.original_graph.GetConstantInitializer(scale_name, true); + auto cur_scale = get_float_initializer_data(it); + cur_scale /= node.scale_factor; + set_float_initializer_data(it, cur_scale); + } else { + for (std::size_t i = 0; i < scale_size; ++i) { + scale_data[i] /= node.scale_factor; + } + } + } + + node.scale_factor = 1.0f; + } + } + return needs_second_run; } + Status copy_model(const GraphViewer& src_graph_viewer, const logging::Logger& logger, std::unique_ptr& model) { model = src_graph_viewer.CreateModel(logger); @@ -907,7 +960,9 @@ Status Transform(const GraphViewer& src_graph_viewer, float threshold{1.f}; float ratio{10.f}; bool scale_output{false}; - scale_graph(g, threshold, ratio, scale_output); + auto needs_second_run = scale_graph(g, threshold, ratio, scale_output); + if (needs_second_run) + scale_graph(g, threshold * 100, ratio, scale_output); return status; } } // namespace qdq_scales_fix From ce35466b586836b3b3801df4ceb35724443fcb68 Mon Sep 17 00:00:00 2001 From: "Javier E. 
Martinez" Date: Wed, 25 Jun 2025 07:38:02 -0700 Subject: [PATCH 5/6] Address issues with Linux CI --- .../qdq_transformations/qdq_scales_fix.cpp | 43 +++---------------- 1 file changed, 6 insertions(+), 37 deletions(-) diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp index 9c1af1ec3517a..37b06b607e3c5 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp @@ -22,7 +22,6 @@ namespace fs = std::filesystem; using NodeRef = std::reference_wrapper; struct GraphNode; float get_initializer_value(const Graph& graph, const std::string& initializer_name); -void remove_node_and_reconnect(Graph& graph, const GraphNode& node_to_remove, NodeIndex next_input); template bool contains(V&& begin, V&& end, const T& val) { @@ -226,7 +225,7 @@ struct GraphNode { // queued, // visited, // scale_factor); - auto print_node_vector = [](std::vector nodes) -> std::string { + auto print_node_vector = [](const std::vector& nodes) -> std::string { // auto comp = [](const GraphNode* left, const GraphNode* right) -> bool { // return left->node_name < right->node_name; // }; @@ -245,7 +244,7 @@ struct GraphNode { std::string from_node_str = print_node_vector(from_node); std::string to_node_str = print_node_vector(to_node); - auto print_string_vector = [](std::vector nodes) -> std::string { + auto print_string_vector = [](const std::vector& nodes) -> std::string { // std::sort(nodes.begin(), nodes.end()); std::string ret = "["; for (size_t i = 0, size = nodes.size(); const auto& node : nodes) { @@ -506,8 +505,9 @@ struct CustomGraph { bool is_16_bit = p_initializer->has_data_type() && (p_initializer->data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT16 || p_initializer->data_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT16); - - if (!scale_output && 
node->down_propagate_to_output() && is_16_bit) { + if (!is_16_bit) + continue; + if (!scale_output && node->down_propagate_to_output()) { remove_qdq_pair(*node, removed); continue; } @@ -624,37 +624,6 @@ void update_initializer_value(Graph& graph, const std::string& initializer_name, graph.AddInitializedTensor(*new_tensor); } -void remove_node_and_reconnect(Graph& graph, const GraphNode& node_to_remove, NodeIndex next_input) { - const auto& n2r = *(node_to_remove.node_ptr); - - ORT_ENFORCE(n2r.GetOutputEdgesCount() == 1); - ORT_ENFORCE(n2r.GetInputEdgesCount() == 1); - - auto in_edge = n2r.InputEdgesBegin(); - auto out_edge = n2r.OutputEdgesBegin(); - - // Remove in_edge - { - const auto& src_node = in_edge->GetNode(); - graph.RemoveEdge(src_node.Index(), n2r.Index(), in_edge->GetSrcArgIndex(), in_edge->GetDstArgIndex()); - } - - // Remove out_edge - { - const auto& dst_node = out_edge->GetNode(); - graph.RemoveEdge(n2r.Index(), dst_node.Index(), out_edge->GetSrcArgIndex(), out_edge->GetDstArgIndex()); - } - - // Add next_input->out_edge node - { - auto in_edge_src_index = in_edge->GetNode().Index(); - auto out_edge_dst_index = out_edge->GetNode().Index(); - graph.AddEdge(in_edge_src_index, out_edge_dst_index, in_edge->GetSrcArgIndex(), out_edge->GetDstArgIndex()); - } - - graph.RemoveNode(n2r.Index()); -} - CustomGraph generate_graph_from_onnx(Graph& graph) { CustomGraph gen_graph{graph}; @@ -718,7 +687,7 @@ bool scale_graph(CustomGraph& gen_graph, while (!q.empty()) { auto cur_node = q.front(); q.pop_front(); - if (cur_node->visited < cur_node->from_node.size()) { + if (static_cast(cur_node->visited) < cur_node->from_node.size()) { cur_node->queued = false; } else { if (cur_node->op_type == "QuantizeLinear" && From e0cc75c5b31ff334bf2b1f1130cd872ebec818e3 Mon Sep 17 00:00:00 2001 From: "Javier E. 
Martinez" Date: Wed, 2 Jul 2025 11:00:32 -0700 Subject: [PATCH 6/6] Fix for double QDQ issue --- .../optimizer/double_qdq_pairs_remover.cc | 1 + .../qdq_transformations/qdq_scales_fix.cpp | 21 ++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/onnxruntime/core/optimizer/double_qdq_pairs_remover.cc b/onnxruntime/core/optimizer/double_qdq_pairs_remover.cc index 1841dfa2791e0..7f214e656e0ab 100644 --- a/onnxruntime/core/optimizer/double_qdq_pairs_remover.cc +++ b/onnxruntime/core/optimizer/double_qdq_pairs_remover.cc @@ -52,6 +52,7 @@ static void ApplyNewInputValue(Graph& graph, Node& node, QDQ::InputIndex index, input_init.ToProto(new_input_tensor); auto new_name = graph.GenerateNodeArgName("DoubleQDQRemoved_" + node.InputDefs()[index]->Name()); new_input_tensor.set_name(new_name); + new_input_tensor.add_dims(1); NodeArg& new_input = graph_utils::AddInitializerWithExternalData(graph, new_input_tensor); graph_utils::ReplaceNodeInput(node, index, new_input); } diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp index 37b06b607e3c5..571aa57c99f33 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp @@ -571,14 +571,8 @@ struct CustomGraph { Graph& original_graph; }; -float get_initializer_value(const Graph& graph, const std::string& initializer_name) { - const auto p_initializer = graph.GetConstantInitializer(initializer_name, false); - - return get_float_initializer_data(p_initializer); -} - template -T* get_mutable_initializer_data(Graph& graph, const std::string& name) { +T* get_mutable_initializer_data(const Graph& graph, const std::string& name) { auto initializer = graph.GetConstantInitializer(name, true); if (!initializer) return nullptr; @@ -603,6 +597,19 @@ std::size_t get_initializer_size(const Graph& 
graph, const std::string& name) { return size; } +float get_initializer_value(const Graph& graph, const std::string& initializer_name) { + const auto p_initializer = graph.GetConstantInitializer(initializer_name, false); + + if (p_initializer->has_raw_data()) { + auto raw_data = get_mutable_initializer_data(graph, initializer_name); + auto size = get_initializer_size(graph, initializer_name); + ORT_ENFORCE(size == 1, "Expected an initializer to be of size 1"); + return raw_data[0]; + } + else + return get_float_initializer_data(p_initializer); +} + void update_initializer_value(Graph& graph, const std::string& initializer_name, const float new_value) { const auto p_initializer = graph.GetConstantInitializer(initializer_name, false);