From 99a6bbfbfa8783f4b84c452f482e33af55ddb706 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Thu, 7 Aug 2025 12:21:11 +0400 Subject: [PATCH] [GPU][Snippets] JIT Kernelemitter - entire kernel --- .../jit/emitters/jit_eltwise_emitters.hpp | 12 +- .../graph/impls/jit/emitters/jit_emitter.hpp | 54 +++++-- .../impls/jit/emitters/jit_kernel_emitter.cpp | 132 ++++++++++++++++++ .../impls/jit/emitters/jit_kernel_emitter.hpp | 43 ++++++ .../jit/emitters/jit_snippets_emitters.hpp | 10 +- .../src/graph/impls/jit/gpu_generator.cpp | 39 +++++- .../src/graph/impls/jit/jit_generator.hpp | 2 + .../src/graph/impls/jit/subgraph.cpp | 2 + .../lowered/set_single_kernel_work_amount.cpp | 6 +- 9 files changed, 279 insertions(+), 21 deletions(-) create mode 100644 src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_kernel_emitter.cpp create mode 100644 src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_kernel_emitter.hpp diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.hpp index d64bdd72062d0d..75b4d19722296c 100644 --- a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.hpp @@ -6,19 +6,29 @@ #include "jit_emitter.hpp" +#include "snippets/lowered/expression.hpp" + namespace ov::intel_gpu::jit { template class jit_add_emitter : public jit_emitter { public: - jit_add_emitter(dnnl::impl::gpu::intel::jit::ngen_code_generator_t* host, + jit_add_emitter(jit_snippet_t* host, + [[maybe_unused]] const ov::snippets::lowered::ExpressionPtr& expr, ov::element::Type exec_prc = ov::element::f32) : jit_emitter(host, exec_prc) {}; static std::set> get_supported_precisions( [[maybe_unused]] const std::shared_ptr& node) { return {{element::f32, element::f32}, {element::f16, element::f16}}; } + + size_t get_inputs_count() const override { return 2; }; + +protected: + void 
emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const override { + OPENVINO_THROW("Unimplemented"); + } }; } // namespace ov::intel_gpu::jit diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_emitter.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_emitter.hpp index 7c3b844bbc8195..de0a462bc180f0 100644 --- a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_emitter.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_emitter.hpp @@ -16,8 +16,7 @@ namespace ov::intel_gpu::jit { template class jit_emitter : public ov::snippets::Emitter { public: - jit_emitter(dnnl::impl::gpu::intel::jit::ngen_code_generator_t* host, - ov::element::Type exec_prc = ov::element::f32) : + jit_emitter(jit_snippet_t* host, ov::element::Type exec_prc = ov::element::f32) : m_h(host), m_exec_prc(exec_prc) {} @@ -31,16 +30,55 @@ class jit_emitter : public ov::snippets::Emitter { return {}; } + virtual size_t get_inputs_count() const = 0; + virtual size_t get_aux_vecs_count() const { return 0; } + virtual size_t get_aux_gprs_count() const { return 0; } + protected: - void emit_code_impl(const std::vector& in, - const std::vector& out, - const std::vector& pool, - const std::vector& gpr) const override { - OPENVINO_THROW("Unimplemented"); + void emit_code_impl(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_aux_vec_idxs, + const std::vector& pool_aux_gpr_idxs) const override { + emitter_preamble(in_idxs, out_idxs, pool_aux_vec_idxs, pool_aux_gpr_idxs); + + emit_impl(in_idxs, out_idxs); + + emitter_postamble(); + } + + virtual void emitter_preamble(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_aux_vec_idxs, + const std::vector& pool_aux_gpr_idxs) const { + aux_vec_idxs = pool_aux_vec_idxs; + aux_gpr_idxs = pool_aux_gpr_idxs; + OPENVINO_ASSERT(aux_vec_idxs.size() >= get_aux_vecs_count(), "Not enough aux vec regs"); + OPENVINO_ASSERT(aux_gpr_idxs.size() >= 
get_aux_gprs_count(), "Not enough aux gpr regs"); } - dnnl::impl::gpu::intel::jit::ngen_code_generator_t* m_h; + virtual void emitter_postamble() const {} + + virtual void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const = 0; + + jit_snippet_t* m_h; ov::element::Type m_exec_prc; + + mutable std::vector aux_vec_idxs; + mutable std::vector aux_gpr_idxs; }; +#define TEMPLATE_INSTANCE(emitter, hw) \ + template class emitter; + +#define TEMPLATE_INSTANCES(emitter) \ + TEMPLATE_INSTANCE(emitter, ngen::HW::Gen9) \ + TEMPLATE_INSTANCE(emitter, ngen::HW::Gen11) \ + TEMPLATE_INSTANCE(emitter, ngen::HW::Gen12LP) \ + TEMPLATE_INSTANCE(emitter, ngen::HW::XeHP) \ + TEMPLATE_INSTANCE(emitter, ngen::HW::XeHPG) \ + TEMPLATE_INSTANCE(emitter, ngen::HW::XeHPC) \ + TEMPLATE_INSTANCE(emitter, ngen::HW::Xe2) \ + TEMPLATE_INSTANCE(emitter, ngen::HW::Xe3) + + } // namespace ov::intel_gpu::jit diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_kernel_emitter.cpp b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_kernel_emitter.cpp new file mode 100644 index 00000000000000..765e82072c1141 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_kernel_emitter.cpp @@ -0,0 +1,132 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "jit_kernel_emitter.hpp" + +#include "snippets/op/kernel.hpp" +#include "snippets/lowered/linear_ir.hpp" + +namespace ov::intel_gpu::jit { + +using namespace ngen; + +template +jit_kernel_emitter::jit_kernel_emitter(jit_snippet_t* host, + const ov::snippets::lowered::ExpressionPtr& expr) + : jit_emitter(host, ov::element::dynamic) { + const auto kernel = ov::as_type_ptr(expr->get_node()); + OPENVINO_ASSERT(kernel != nullptr, "invoked with invalid op argument"); + OPENVINO_ASSERT(!kernel->region->empty(), "invoked with empty body"); + body = kernel->region; + + const auto& parameters = body->get_parameters(); + const auto& results = body->get_results(); + 
std::vector data_ptr_regs; /* NOTE(review): this list is local; the data_ptr_regs_idx member declared in jit_kernel_emitter.hpp is not written by this constructor - confirm that is intended. */ + for (const auto& param : parameters) { + const auto& reg = param->get_output_port_descriptor(0)->get_reg(); + if (!reg.is_address()) { + data_ptr_regs.push_back(reg); + } + } + num_inputs = data_ptr_regs.size(); + for (const auto& result : results) { + data_ptr_regs.push_back(result->get_input_port_descriptor(0)->get_reg()); + } + num_outputs = data_ptr_regs.size() - num_inputs; +}; + +template +void jit_kernel_emitter::validate_arguments(const std::vector& in, const std::vector& out) const { + OPENVINO_ASSERT(in.empty() && out.empty(), "Unexpected number of input/output arguments"); /* Both the input and the output register index lists must be empty for the kernel emitter. */ +} + +template +void jit_kernel_emitter::emit_code_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { + validate_arguments(in, out); + this->aux_vec_idxs = pool_vec_idxs; + this->aux_gpr_idxs = pool_gpr_idxs; + emit_impl(in, out); +} + +template +void jit_kernel_emitter::emit_impl(const std::vector& in, + [[maybe_unused]] const std::vector& out) const { + // Define kernel interface for OpenCL. + for (size_t i = 0; i < num_inputs; ++i) { + this->m_h->newArgument("src" + std::to_string(i), ExternalArgumentType::GlobalPtr); + } + + for (size_t i = 0; i < num_outputs; ++i) { + this->m_h->newArgument("dst" + std::to_string(i), ExternalArgumentType::GlobalPtr); + } + + this->m_h->requireLocalID(1); + this->m_h->requireLocalSize(); + + this->m_h->finalizeInterface(); + + auto src0_ptr = this->m_h->getArgument("src0"); + auto src1_ptr = this->m_h->getArgument("src1"); + auto dst_ptr = this->m_h->getArgument("dst0"); /* NOTE(review): "src0", "src1" and "dst0" are looked up unconditionally although only num_inputs / num_outputs arguments are declared above - presumably assumes exactly two inputs and one output; confirm. */ + + auto local_size = this->m_h->getLocalSize(0).uw(); + auto local_id = this->m_h->getLocalID(0); // Vector of local IDs. + auto group_id = this->m_h->r0.ud(1); // Thread group (a.k.a. workgroup) IDs are in r0.ud(1) (X) r0.ud(6) (Y) r0.ud(7) (Z) + + // Local variables. 
+ auto global_id = this->m_h->r12.ud(0); + auto header = this->m_h->r13; + auto temp = this->m_h->r11; + + auto reg_src0 = this->m_h->r14; + auto reg_src1 = this->m_h->r15; + + // All instructions use W (NoMask) by default. + this->m_h->setDefaultNoMask(); + + // Enable automatic SWSB for Gen12. + this->m_h->setDefaultAutoSWSB(); + + // Prologue for ATS+. + this->m_h->prologue(); + + // Enable IEEE denormals. + this->m_h->or_(1 | this->m_h->Switch, this->m_h->cr0[0], this->m_h->cr0[0], 0x4C0); + + // Calculate global ID = (group ID) * (local size) + (local ID for lane 0). + this->m_h->mul(1, global_id, group_id, local_size); + this->m_h->add(1, global_id, global_id, local_id[0]); + + this->m_h->shl(1, global_id, global_id, 2); + { + this->m_h->addc(1, header.ud(0), src0_ptr.ud(0), global_id); + this->m_h->mov(1, temp.ud(0), this->m_h->acc0.ud(0)); + this->m_h->add(1, header.ud(1), src0_ptr.ud(1), temp.ud(0)); + this->m_h->load(1, reg_src0, this->m_h->D32 | this->m_h->V8T, this->m_h->A64, header); + } + { + this->m_h->addc(1, header.ud(0), src1_ptr.ud(0), global_id); + this->m_h->mov(1, temp.ud(0), this->m_h->acc0.ud(0)); + this->m_h->add(1, header.ud(1), src1_ptr.ud(1), temp.ud(0)); + this->m_h->load(1, reg_src1, this->m_h->D32 | this->m_h->V8T, this->m_h->A64, header); + } + + this->m_h->template add(8, reg_src0, reg_src0, reg_src1); + + { + this->m_h->addc(1, header.ud(0), dst_ptr.ud(0), global_id); + this->m_h->mov(1, temp.ud(0), this->m_h->acc0.ud(0)); + this->m_h->add(1, header.ud(1), dst_ptr.ud(1), temp.ud(0)); + this->m_h->store(1, this->m_h->D32 | this->m_h->V8T, this->m_h->A64, header, reg_src0); + } + + this->m_h->epilogue(); +} + +TEMPLATE_INSTANCES(jit_kernel_emitter) + +} // namespace ov::intel_gpu::jit diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_kernel_emitter.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_kernel_emitter.hpp new file mode 100644 index 00000000000000..14f418d745139d --- /dev/null +++ 
b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_kernel_emitter.hpp @@ -0,0 +1,43 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "jit_emitter.hpp" + +#include "snippets/lowered/expression.hpp" + +namespace ov::intel_gpu::jit { + +template +class jit_kernel_emitter : public jit_emitter { +public: + jit_kernel_emitter(jit_snippet_t* host, + const ov::snippets::lowered::ExpressionPtr& expr); + + static std::set> get_supported_precisions( + [[maybe_unused]] const std::shared_ptr& node) { + return {}; + } + + size_t get_inputs_count() const override { return 0; }; + + void emit_code_impl(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const override; + +protected: + void validate_arguments(const std::vector& in, const std::vector& out) const; + + void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const override; + + std::vector data_ptr_regs_idx; + size_t num_inputs = 0; + size_t num_outputs = 0; + + std::shared_ptr body; +}; + +} // namespace ov::intel_gpu::jit diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_snippets_emitters.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_snippets_emitters.hpp index 2d2b6f4383579a..f92ad399b08eef 100644 --- a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_snippets_emitters.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_snippets_emitters.hpp @@ -6,19 +6,27 @@ #include "jit_emitter.hpp" +#include "snippets/lowered/expression.hpp" + namespace ov::intel_gpu::jit { template class jit_nop_emitter : public jit_emitter { public: - jit_nop_emitter(dnnl::impl::gpu::intel::jit::ngen_code_generator_t* host, + jit_nop_emitter(jit_snippet_t* host, + [[maybe_unused]] const ov::snippets::lowered::ExpressionPtr& expr, ov::element::Type exec_prc = ov::element::f32) : jit_emitter(host, exec_prc) {}; static std::set> 
get_supported_precisions( [[maybe_unused]] const std::shared_ptr& node) { return {}; } + + size_t get_inputs_count() const override { return 0; }; + +protected: + void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const override { } }; } // namespace ov::intel_gpu::jit diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp index e805881e5ceed5..2af318ecb0b945 100644 --- a/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp @@ -5,19 +5,24 @@ #include "gpu_generator.hpp" #include "snippets/runtime_configurator.hpp" +#include "snippets/op/load.hpp" +#include "snippets/op/kernel.hpp" +#include "snippets/op/store.hpp" #include "emitters/jit_eltwise_emitters.hpp" +#include "emitters/jit_kernel_emitter.hpp" #include "emitters/jit_snippets_emitters.hpp" #include "openvino/op/add.hpp" + using namespace dnnl::impl::gpu::intel::jit; namespace ov::intel_gpu::jit { #define CREATE_SNIPPETS_EMITTER(e_type, ...) 
\ {[this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ - return std::make_shared>(m_h.get(), ##__VA_ARGS__); \ + return std::make_shared>(m_h.get(), expr, ##__VA_ARGS__); \ }, \ [](const std::shared_ptr& n) -> std::set> { \ return e_type::get_supported_precisions(n); \ @@ -29,6 +34,11 @@ GPUTargetMachine::GPUTargetMachine() m_h(std::make_unique>()) { jitters[op::v0::Parameter::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); jitters[op::v0::Result::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); + + jitters[ov::snippets::op::KernelStatic::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_kernel_emitter); + jitters[ov::snippets::op::Load::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); + jitters[ov::snippets::op::Store::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); + jitters[op::v1::Add::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_add_emitter); } @@ -47,20 +57,35 @@ size_t GPUTargetMachine::get_lanes() const { template std::vector GPUTargetMachine::get_abi_arg_regs() const { - OPENVINO_THROW("Unimplemented!"); - return {}; + // OPENVINO_THROW("Unimplemented!"); + // TODO: REWRITE THIS PART, THIS IS TEMPORARY SOLUTION + std::vector regs(10); + for (size_t i = 0; i < regs.size(); ++i) { + regs[i] = {ov::snippets::RegType::vec, 10 + i}; + } + return regs; } template std::vector GPUTargetMachine::get_gp_reg_pool() const { - OPENVINO_THROW("Unimplemented!"); - return {}; + // OPENVINO_THROW("Unimplemented!"); + // TODO: REWRITE THIS PART, THIS IS TEMPORARY SOLUTION + std::vector regs(30); + for (size_t i = 0; i < regs.size(); ++i) { + regs[i] = {ov::snippets::RegType::vec, 20 + i}; /* NOTE(review): the GP register pool is filled with RegType::vec entries - confirm this is intended for the temporary solution. */ + } + return regs; } template std::vector GPUTargetMachine::get_vec_reg_pool() const { - OPENVINO_THROW("Unimplemented!"); - return {}; + // OPENVINO_THROW("Unimplemented!"); + // TODO: REWRITE THIS PART, THIS IS TEMPORARY SOLUTION + std::vector regs(30); + for (size_t i = 0; i < 
regs.size(); ++i) { + regs[i] = {ov::snippets::RegType::vec, 50 + i}; + } + return regs; } template diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/jit_generator.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/jit_generator.hpp index 1575a0c468c566..bfadd73ad4218a 100644 --- a/src/plugins/intel_gpu/src/graph/impls/jit/jit_generator.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/jit/jit_generator.hpp @@ -19,6 +19,8 @@ class jit_snippet_t : public ngen::OpenCLCodeGenerator { public: jit_snippet_t() : ngen::OpenCLCodeGenerator(0, {GENERATOR_NAME, GENERATOR_LINE, false}) {}; + + NGEN_FORWARD_OPENCL(hw); }; } // namespace ov::intel_gpu::jit diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/subgraph.cpp b/src/plugins/intel_gpu/src/graph/impls/jit/subgraph.cpp index 1d81d445a98653..c2b9c753d0b13f 100644 --- a/src/plugins/intel_gpu/src/graph/impls/jit/subgraph.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/jit/subgraph.cpp @@ -50,6 +50,8 @@ class SubgraphImpl : public primitive_impl { std::make_shared(), control_flow_config, getControlFlowPasses()); + + auto snippet = m_subgraph->generate(nullptr); } ControlFlowPasses getControlFlowPasses() const { diff --git a/src/plugins/intel_gpu/src/plugin/transformations/snippets/lowered/set_single_kernel_work_amount.cpp b/src/plugins/intel_gpu/src/plugin/transformations/snippets/lowered/set_single_kernel_work_amount.cpp index 0ca914a8c9fa39..45c57e942867b2 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/snippets/lowered/set_single_kernel_work_amount.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/snippets/lowered/set_single_kernel_work_amount.cpp @@ -48,10 +48,8 @@ bool SetSingleKernelWorkAmount::run(snippets::lowered::LinearIR& linear_ir) { auto CollapseDims = [](ov::snippets::VectorDims& dims) { OPENVINO_ASSERT(dims.size() >= 2, "CollapseDims can't process shape with less than two dims"); - const auto full_wa_idx = dims.size() - 2; - dims[full_wa_idx] *= dims[dims.size() - 1]; - 
dims[dims.size() - 1] = 1; - for (size_t i = 0; i < full_wa_idx; i++) { + const auto full_wa_idx = 0; + for (size_t i = full_wa_idx + 1; i < dims.size(); ++i) { dims[full_wa_idx] *= dims[i]; dims[i] = 1; }