diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc
index cadeab4cbd4cc..2af414bd359bf 100644
--- a/onnxruntime/core/providers/openvino/backend_manager.cc
+++ b/onnxruntime/core/providers/openvino/backend_manager.cc
@@ -387,6 +387,44 @@ static bool IsModelBF16(const onnxruntime::GraphViewer& graph_viewer) {
   return false;
 }
 
+static bool Is16BitTensor(const onnxruntime::NodeArg* node_arg) {
+  const auto* type_proto = node_arg ? node_arg->TypeAsProto() : nullptr;
+  return type_proto && type_proto->has_tensor_type() &&
+         (type_proto->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT16 ||
+          type_proto->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_INT16);
+}
+
+// Check to see if the graph has Q/DQ nodes with int16 or uint16 quantization
+static bool IsQDQGraphWithUint16OrInt16(const onnxruntime::GraphViewer& graph_viewer) {
+  std::unordered_set<std::string> qdq_ops = {"QuantizeLinear", "DequantizeLinear"};
+  const auto& node_indices = graph_viewer.GetNodesInTopologicalOrder();
+
+  for (size_t i = 0; i < node_indices.size(); i++) {
+    gsl::not_null<const onnxruntime::Node*> node(graph_viewer.GetNode(node_indices[i]));
+
+    if (qdq_ops.find(node->OpType()) != qdq_ops.end()) {
+      const auto& input_defs = node->InputDefs();
+
+      if (node->OpType() == "DequantizeLinear") {
+        // DequantizeLinear: [quantized_input, scale, zero_point] -> [float_output]
+        // Check quantized input tensor and optional zero point
+        if (Is16BitTensor(input_defs.empty() ? nullptr : input_defs[0]) ||
+            (input_defs.size() >= 3 && Is16BitTensor(input_defs[2]))) {
+          return true;
+        }
+      } else if (node->OpType() == "QuantizeLinear") {
+        // QuantizeLinear: [float_input, scale, zero_point] -> [quantized_output]
+        const auto& output_defs = node->OutputDefs();
+        if (Is16BitTensor(output_defs.empty() ? nullptr : output_defs[0]) ||
+            (input_defs.size() >= 3 && Is16BitTensor(input_defs[2]))) {
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
 static void DumpOpenVINOEPModel([[maybe_unused]] const std::filesystem::path& onnx_model_path_name,
                                 [[maybe_unused]] ONNX_NAMESPACE::ModelProto* model_proto,
                                 [[maybe_unused]] const onnxruntime::Node& fused_node) {
@@ -445,6 +483,10 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
   }
 #endif
 
+  // Check if the graph is QDQ and has int16 or uint16 quantization
+  // If so, we will apply the QDQ scales fix transformation (for GPU device only)
+  bool is_qdq_graph_uint16_or_int16 = IsQDQGraphWithUint16OrInt16(subgraph);
+
   const auto& onnx_model_path_name = subgraph.ModelPath();
   // QDQ stripping enabled only for the NPU and experimentally on the GPU
   if ((session_context_.device_type.find("NPU") != std::string::npos) &&
@@ -458,7 +500,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
     ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
     return model_proto;
   } else if ((session_context_.device_type.find("GPU") != std::string::npos) &&
-             enable_ovep_qdq_optimizer) {
+             is_qdq_graph_uint16_or_int16) {
     // Create a copy of the model
     std::unique_ptr<onnxruntime::Model> model;
    Status status = qdq_scales_fix::Transform(subgraph, logger, model);
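
For illustration only, not part of the patch: a minimal standalone C++ sketch of the detection rule the patch adds. ElemType, Tensor, and Node below are hypothetical stand-ins for ONNX_NAMESPACE::TensorProto_DataType, NodeArg, and onnxruntime::Node, so the predicate can be compiled and exercised in isolation, without the onnxruntime graph API.

#include <iostream>
#include <string>
#include <vector>

enum class ElemType { FLOAT, INT8, UINT8, INT16, UINT16 };

struct Tensor { ElemType elem_type; };

struct Node {
  std::string op_type;          // "QuantizeLinear" or "DequantizeLinear"
  std::vector<Tensor> inputs;   // Q: [x, scale, zero_point]; DQ: [q, scale, zero_point]
  std::vector<Tensor> outputs;  // Q: [q]; DQ: [x]
};

static bool Is16Bit(const Tensor& t) {
  return t.elem_type == ElemType::INT16 || t.elem_type == ElemType::UINT16;
}

// Same rule as IsQDQGraphWithUint16OrInt16: a DequantizeLinear node is 16-bit
// if its quantized input or optional zero point is 16-bit; a QuantizeLinear
// node is 16-bit if its quantized output or optional zero point is 16-bit.
static bool IsQDQ16Bit(const std::vector<Node>& graph) {
  for (const auto& n : graph) {
    if (n.op_type == "DequantizeLinear") {
      if ((!n.inputs.empty() && Is16Bit(n.inputs[0])) ||
          (n.inputs.size() >= 3 && Is16Bit(n.inputs[2]))) return true;
    } else if (n.op_type == "QuantizeLinear") {
      if ((!n.outputs.empty() && Is16Bit(n.outputs[0])) ||
          (n.inputs.size() >= 3 && Is16Bit(n.inputs[2]))) return true;
    }
  }
  return false;
}

int main() {
  // A single uint16 QuantizeLinear node: float input, float scale, uint16 zero point.
  std::vector<Node> g = {{"QuantizeLinear",
                          {{ElemType::FLOAT}, {ElemType::FLOAT}, {ElemType::UINT16}},
                          {{ElemType::UINT16}}}};
  std::cout << std::boolalpha << IsQDQ16Bit(g) << '\n';  // prints: true
}

Note that the rule checks the optional zero point as well as the quantized data tensor, since a Q/DQ node can carry its 16-bit element type on either of those.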