sshlyapn · sshlyapn · Aug 7, 2025 · Aug 7, 2025
diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.hpp
@@ -6,19 +6,29 @@
 
 #include "jit_emitter.hpp"
 
+#include "snippets/lowered/expression.hpp"
+
 
 namespace ov::intel_gpu::jit {
 
 template <dnnl::impl::gpu::intel::jit::gpu_gen_t hw>
 class jit_add_emitter : public jit_emitter<hw> {
 public:
-    jit_add_emitter(dnnl::impl::gpu::intel::jit::ngen_code_generator_t<hw>* host,
+    jit_add_emitter(jit_snippet_t<hw>* host,
+                    [[maybe_unused]] const ov::snippets::lowered::ExpressionPtr& expr,  // accepted but not used by this emitter
                     ov::element::Type exec_prc = ov::element::f32) : jit_emitter<hw>(host, exec_prc) {};
 
     static std::set<std::vector<ov::element::Type>> get_supported_precisions(
         [[maybe_unused]] const std::shared_ptr<ov::Node>& node) {
         return {{element::f32, element::f32}, {element::f16, element::f16}};
     }
+    // Reports two inputs for this emitter.
+    size_t get_inputs_count() const override { return 2; };
+
+protected:
+    void emit_impl(const std::vector<size_t>& in_idxs, const std::vector<size_t>& out_idxs) const override { // Not implemented yet: always throws.
+        OPENVINO_THROW("Unimplemented");
+    }
 };
 
 }  // namespace ov::intel_gpu::jit
diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_emitter.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_emitter.hpp
@@ -16,8 +16,7 @@ namespace ov::intel_gpu::jit {
 template <dnnl::impl::gpu::intel::jit::gpu_gen_t hw>
 class jit_emitter : public ov::snippets::Emitter {
 public:
-    jit_emitter(dnnl::impl::gpu::intel::jit::ngen_code_generator_t<hw>* host,
-                ov::element::Type exec_prc = ov::element::f32) :
+    jit_emitter(jit_snippet_t<hw>* host, ov::element::Type exec_prc = ov::element::f32) :
         m_h(host),
         m_exec_prc(exec_prc) {}
 
@@ -31,16 +30,55 @@ class jit_emitter : public ov::snippets::Emitter {
         return {};
     }
 
+    virtual size_t get_inputs_count() const = 0;             // input count reported by the concrete emitter
+    virtual size_t get_aux_vecs_count() const { return 0; }  // aux vec registers required from the pool (checked in emitter_preamble); none by default
+    virtual size_t get_aux_gprs_count() const { return 0; }  // aux gpr registers required from the pool (checked in emitter_preamble); none by default
+
 protected:
-    void emit_code_impl(const std::vector<size_t>& in,
-                        const std::vector<size_t>& out,
-                        const std::vector<size_t>& pool,
-                        const std::vector<size_t>& gpr) const override {
-        OPENVINO_THROW("Unimplemented");
+    void emit_code_impl(const std::vector<size_t>& in_idxs,
+                        const std::vector<size_t>& out_idxs,
+                        const std::vector<size_t>& pool_aux_vec_idxs,
+                        const std::vector<size_t>& pool_aux_gpr_idxs) const override {
+        emitter_preamble(in_idxs, out_idxs, pool_aux_vec_idxs, pool_aux_gpr_idxs);
+        // in_idxs/out_idxs are forwarded unchanged to the concrete emitter.
+        emit_impl(in_idxs, out_idxs);
+
+        emitter_postamble();
+    }
+    // Stores the pool register indices as aux registers and asserts that the pool covers the declared aux counts.
+    virtual void emitter_preamble(const std::vector<size_t>& in_idxs,
+                                  const std::vector<size_t>& out_idxs,
+                                  const std::vector<size_t>& pool_aux_vec_idxs,
+                                  const std::vector<size_t>& pool_aux_gpr_idxs) const {
+        aux_vec_idxs = pool_aux_vec_idxs;
+        aux_gpr_idxs = pool_aux_gpr_idxs;
+        OPENVINO_ASSERT(aux_vec_idxs.size() >= get_aux_vecs_count(), "Not enough aux vec regs");
+        OPENVINO_ASSERT(aux_gpr_idxs.size() >= get_aux_gprs_count(), "Not enough aux gpr regs");
     }
 
-    dnnl::impl::gpu::intel::jit::ngen_code_generator_t<hw>* m_h;
+    virtual void emitter_postamble() const {}  // hook called after emit_impl; no-op by default
+    // Emits the code of the concrete emitter; called by emit_code_impl between preamble and postamble.
+    virtual void emit_impl(const std::vector<size_t>& in_idxs, const std::vector<size_t>& out_idxs) const = 0;
+
+    jit_snippet_t<hw>* m_h;  // code generator host, received as a raw pointer in the constructor
     ov::element::Type m_exec_prc;
+    // Mutable because they are assigned from the const emitter_preamble().
+    mutable std::vector<size_t> aux_vec_idxs;
+    mutable std::vector<size_t> aux_gpr_idxs;
 };
 
+#define TEMPLATE_INSTANCE(emitter, hw) /* explicit instantiation of emitter<hw> */ \
+    template class emitter<hw>;
+// Expands to one explicit instantiation of `emitter` per ngen::HW value listed below.
+#define TEMPLATE_INSTANCES(emitter) \
+    TEMPLATE_INSTANCE(emitter, ngen::HW::Gen9)    \
+    TEMPLATE_INSTANCE(emitter, ngen::HW::Gen11)   \
+    TEMPLATE_INSTANCE(emitter, ngen::HW::Gen12LP) \
+    TEMPLATE_INSTANCE(emitter, ngen::HW::XeHP)    \
+    TEMPLATE_INSTANCE(emitter, ngen::HW::XeHPG)   \
+    TEMPLATE_INSTANCE(emitter, ngen::HW::XeHPC)   \
+    TEMPLATE_INSTANCE(emitter, ngen::HW::Xe2)     \
+    TEMPLATE_INSTANCE(emitter, ngen::HW::Xe3)
+
+
 }  // namespace ov::intel_gpu::jit
diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_kernel_emitter.cpp b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_kernel_emitter.cpp
@@ -0,0 +1,132 @@
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "jit_kernel_emitter.hpp"
+
+#include "snippets/op/kernel.hpp"
+#include "snippets/lowered/linear_ir.hpp"
+
+namespace ov::intel_gpu::jit {
+
+using namespace ngen;
+
+template <dnnl::impl::gpu::intel::jit::gpu_gen_t hw>
+jit_kernel_emitter<hw>::jit_kernel_emitter(jit_snippet_t<hw>* host,
+                                           const ov::snippets::lowered::ExpressionPtr& expr)
+    : jit_emitter<hw>(host, ov::element::dynamic) {
+    // The expression must wrap a KernelStatic op whose body (region) is non-empty; both are asserted.
+    const auto kernel = ov::as_type_ptr<snippets::op::KernelStatic>(expr->get_node());
+    OPENVINO_ASSERT(kernel != nullptr, "invoked with invalid op argument");
+    OPENVINO_ASSERT(!kernel->region->empty(), "invoked with empty body");
+    body = kernel->region;
+
+    // Gather parameter regs (address-typed ones are skipped), then result regs; only the two counts are stored.
+    std::vector<snippets::Reg> data_ptr_regs;
+    for (const auto& param : body->get_parameters()) {
+        const auto& reg = param->get_output_port_descriptor(0)->get_reg();
+        if (!reg.is_address()) {
+            data_ptr_regs.push_back(reg);
+        }
+    }
+    num_inputs = data_ptr_regs.size();
+    for (const auto& result : body->get_results()) {
+        data_ptr_regs.push_back(result->get_input_port_descriptor(0)->get_reg());
+    }
+    num_outputs = data_ptr_regs.size() - num_inputs;
+}
+
+template <dnnl::impl::gpu::intel::jit::gpu_gen_t hw>
+void jit_kernel_emitter<hw>::validate_arguments(const std::vector<size_t>& in, const std::vector<size_t>& out) const {
+    OPENVINO_ASSERT(in.empty() && out.empty(), "Unexpected number of input/output arguments");  // kernel takes no register indices
+}
+
+template <dnnl::impl::gpu::intel::jit::gpu_gen_t hw>
+void jit_kernel_emitter<hw>::emit_code_impl(const std::vector<size_t>& in_idxs, const std::vector<size_t>& out_idxs,
+                                            const std::vector<size_t>& pool_vec_idxs,
+                                            const std::vector<size_t>& pool_gpr_idxs) const {
+    // Kernel-specific flow: validate the arguments, remember the pool indices as aux registers, emit the kernel.
+    validate_arguments(in_idxs, out_idxs);
+    this->aux_gpr_idxs = pool_gpr_idxs;
+    this->aux_vec_idxs = pool_vec_idxs;
+    emit_impl(in_idxs, out_idxs);
+}
+
+template <dnnl::impl::gpu::intel::jit::gpu_gen_t hw>
+void jit_kernel_emitter<hw>::emit_impl(const std::vector<size_t>& in,
+                                      [[maybe_unused]] const std::vector<size_t>& out) const {
+    // Define the OpenCL kernel interface: one global-pointer argument per input (src<i>) and per output (dst<i>).
+    for (size_t i = 0; i < num_inputs; ++i) {
+        this->m_h->newArgument("src" + std::to_string(i), ExternalArgumentType::GlobalPtr);
+    }
+
+    for (size_t i = 0; i < num_outputs; ++i) {
+        this->m_h->newArgument("dst" + std::to_string(i), ExternalArgumentType::GlobalPtr);
+    }
+
+    this->m_h->requireLocalID(1);
+    this->m_h->requireLocalSize();
+
+    this->m_h->finalizeInterface();
+    // NOTE(review): only src0, src1 and dst0 are used below, i.e. the body assumes 2 inputs and 1 output — confirm.
+    auto src0_ptr = this->m_h->getArgument("src0");
+    auto src1_ptr = this->m_h->getArgument("src1");
+    auto dst_ptr = this->m_h->getArgument("dst0");
+
+    auto local_size = this->m_h->getLocalSize(0).uw();
+    auto local_id = this->m_h->getLocalID(0);               // Vector of local IDs.
+    auto group_id = this->m_h->r0.ud(1);                    // Thread group (a.k.a. workgroup) IDs are in r0.ud(1) (X) r0.ud(6) (Y) r0.ud(7) (Z)
+
+    // Local variables: registers r11-r15 are referenced directly; the aux register pools are not consulted here.
+    auto global_id = this->m_h->r12.ud(0);
+    auto header = this->m_h->r13;
+    auto temp = this->m_h->r11;
+
+    auto reg_src0 = this->m_h->r14;
+    auto reg_src1 = this->m_h->r15;
+
+    // All instructions use W (NoMask) by default.
+    this->m_h->setDefaultNoMask();
+
+    // Enable automatic SWSB for Gen12.
+    this->m_h->setDefaultAutoSWSB();
+
+    // Prologue for ATS+.
+    this->m_h->prologue();
+
+    // Enable IEEE denormals.
+    this->m_h->or_(1 | this->m_h->Switch, this->m_h->cr0[0], this->m_h->cr0[0], 0x4C0);
+
+    // Calculate global ID = (group ID) * (local size) + (local ID for lane 0).
+    this->m_h->mul(1, global_id, group_id, local_size);
+    this->m_h->add(1, global_id, global_id, local_id[0]);
+    // Shift the ID left by 2 (x4) — presumably the byte offset of a 4-byte element; TODO confirm.
+    this->m_h->shl(1, global_id, global_id, 2);
+    {  // Load src0: header = src0_ptr + offset as two dwords (addc on the low dword; acc0 — presumably the carry — added to the high dword).
+        this->m_h->addc(1, header.ud(0), src0_ptr.ud(0), global_id);
+        this->m_h->mov(1, temp.ud(0), this->m_h->acc0.ud(0));
+        this->m_h->add(1, header.ud(1), src0_ptr.ud(1), temp.ud(0));
+        this->m_h->load(1, reg_src0, this->m_h->D32 | this->m_h->V8T, this->m_h->A64, header);
+    }
+    {  // Load src1: same address computation, based on src1_ptr.
+        this->m_h->addc(1, header.ud(0), src1_ptr.ud(0), global_id);
+        this->m_h->mov(1, temp.ud(0), this->m_h->acc0.ud(0));
+        this->m_h->add(1, header.ud(1), src1_ptr.ud(1), temp.ud(0));
+        this->m_h->load(1, reg_src1, this->m_h->D32 | this->m_h->V8T, this->m_h->A64, header);
+    }
+    // reg_src0 = reg_src0 + reg_src1 as float (8 is presumably the SIMD width — TODO confirm).
+    this->m_h->template add<float>(8, reg_src0, reg_src0, reg_src1);
+
+    {  // Store the sum held in reg_src0 to dst0: same address computation, based on dst_ptr.
+        this->m_h->addc(1, header.ud(0), dst_ptr.ud(0), global_id);
+        this->m_h->mov(1, temp.ud(0), this->m_h->acc0.ud(0));
+        this->m_h->add(1, header.ud(1), dst_ptr.ud(1), temp.ud(0));
+        this->m_h->store(1, this->m_h->D32 | this->m_h->V8T, this->m_h->A64, header, reg_src0);
+    }
+
+    this->m_h->epilogue();
+}
+
+TEMPLATE_INSTANCES(jit_kernel_emitter)
+
+}  // namespace ov::intel_gpu::jit
diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_kernel_emitter.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_kernel_emitter.hpp
@@ -0,0 +1,43 @@
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "jit_emitter.hpp"
+
+#include "snippets/lowered/expression.hpp"
+
+namespace ov::intel_gpu::jit {
+
+template <dnnl::impl::gpu::intel::jit::gpu_gen_t hw>
+class jit_kernel_emitter : public jit_emitter<hw> {
+public:
+    jit_kernel_emitter(jit_snippet_t<hw>* host,
+                       const ov::snippets::lowered::ExpressionPtr& expr);
+    // Reports no precision combinations for the kernel op (always an empty set).
+    static std::set<std::vector<ov::element::Type>> get_supported_precisions(
+        [[maybe_unused]] const std::shared_ptr<ov::Node>& node) {
+        return {};
+    }
+    // The kernel emitter reports zero inputs.
+    size_t get_inputs_count() const override { return 0; }
+    // Kernel-specific override of the base emission flow.
+    void emit_code_impl(const std::vector<size_t>& in_idxs,
+                        const std::vector<size_t>& out_idxs,
+                        const std::vector<size_t>& pool_vec_idxs,
+                        const std::vector<size_t>& pool_gpr_idxs) const override;
+
+protected:
+    void validate_arguments(const std::vector<size_t>& in, const std::vector<size_t>& out) const;
+    // Emits the kernel interface and body through the host code generator.
+    void emit_impl(const std::vector<size_t>& in_idxs, const std::vector<size_t>& out_idxs) const override;
+
+    std::vector<size_t> data_ptr_regs_idx;  // NOTE(review): never written or read by the visible implementation — confirm it is needed
+    size_t num_inputs = 0;                  // count of "src<i>" kernel arguments to declare
+    size_t num_outputs = 0;                 // count of "dst<i>" kernel arguments to declare
+    // Linear IR body of the kernel.
+    std::shared_ptr<snippets::lowered::LinearIR> body;
+};
+
+}  // namespace ov::intel_gpu::jit
diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_snippets_emitters.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_snippets_emitters.hpp
@@ -6,19 +6,27 @@
 
 #include "jit_emitter.hpp"
 
+#include "snippets/lowered/expression.hpp"
+
 
 namespace ov::intel_gpu::jit {
 
 template <dnnl::impl::gpu::intel::jit::gpu_gen_t hw>
 class jit_nop_emitter : public jit_emitter<hw> {
 public:
-    jit_nop_emitter(dnnl::impl::gpu::intel::jit::ngen_code_generator_t<hw>* host,
+    jit_nop_emitter(jit_snippet_t<hw>* host,
+                    [[maybe_unused]] const ov::snippets::lowered::ExpressionPtr& expr,  // accepted but not used by this emitter
                     ov::element::Type exec_prc = ov::element::f32) : jit_emitter<hw>(host, exec_prc) {};
 
     static std::set<std::vector<ov::element::Type>> get_supported_precisions(
         [[maybe_unused]] const std::shared_ptr<ov::Node>& node) {
         return {};
     }
+    // Reports zero inputs for this emitter.
+    size_t get_inputs_count() const override { return 0; };
+
+protected:
+    void emit_impl(const std::vector<size_t>& in_idxs, const std::vector<size_t>& out_idxs) const override { }  // emits nothing
 };
 
 }  // namespace ov::intel_gpu::jit
diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp
@@ -5,19 +5,24 @@
 #include "gpu_generator.hpp"
 
 #include "snippets/runtime_configurator.hpp"
+#include "snippets/op/load.hpp"
+#include "snippets/op/kernel.hpp"
+#include "snippets/op/store.hpp"
 #include "emitters/jit_eltwise_emitters.hpp"
+#include "emitters/jit_kernel_emitter.hpp"
 #include "emitters/jit_snippets_emitters.hpp"
 
 #include "openvino/op/add.hpp"
 
 
+
 using namespace dnnl::impl::gpu::intel::jit;
 
 namespace ov::intel_gpu::jit {
 
 #define CREATE_SNIPPETS_EMITTER(e_type, ...)                                                          \
         {[this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr<snippets::Emitter> { \
-             return std::make_shared<e_type<hw>>(m_h.get(), ##__VA_ARGS__);                           \
+             return std::make_shared<e_type<hw>>(m_h.get(), expr, ##__VA_ARGS__);                     \
          },                                                                                           \
          [](const std::shared_ptr<ov::Node>& n) -> std::set<std::vector<element::Type>> {             \
              return e_type<hw>::get_supported_precisions(n);                                          \
@@ -29,6 +34,11 @@ GPUTargetMachine<hw>::GPUTargetMachine()
       m_h(std::make_unique<jit_snippet_t<hw>>()) {
     jitters[op::v0::Parameter::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter);
     jitters[op::v0::Result::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter);
+
+    jitters[ov::snippets::op::KernelStatic::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_kernel_emitter);
+    jitters[ov::snippets::op::Load::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter);
+    jitters[ov::snippets::op::Store::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter);
+
     jitters[op::v1::Add::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_add_emitter);
 }
 
@@ -47,20 +57,35 @@ size_t GPUTargetMachine<hw>::get_lanes() const {
 
 template <ngen::HW hw>
 std::vector<snippets::Reg> GPUTargetMachine<hw>::get_abi_arg_regs() const {
-    OPENVINO_THROW("Unimplemented!");
-    return {};
+    // OPENVINO_THROW("Unimplemented!");
+    // TODO: rewrite this part; temporary solution that returns 10 registers of type vec with indices 10..19.
+    std::vector<snippets::Reg> regs(10);
+    for (size_t i = 0; i < regs.size(); ++i) {
+        regs[i] = {ov::snippets::RegType::vec, 10 + i};
+    }
+    return regs;
 }
 
 template <ngen::HW hw>
 std::vector<snippets::Reg> GPUTargetMachine<hw>::get_gp_reg_pool() const {
-    OPENVINO_THROW("Unimplemented!");
-    return {};
+    // OPENVINO_THROW("Unimplemented!");
+    // TODO: rewrite this part; temporary solution returning 30 registers with indices 20..49. NOTE(review): they are typed RegType::vec although this is the GP pool — confirm.
+    std::vector<snippets::Reg> regs(30);
+    for (size_t i = 0; i < regs.size(); ++i) {
+        regs[i] = {ov::snippets::RegType::vec, 20 + i};
+    }
+    return regs;
 }
 
 template <ngen::HW hw>
 std::vector<snippets::Reg> GPUTargetMachine<hw>::get_vec_reg_pool() const {
-    OPENVINO_THROW("Unimplemented!");
-    return {};
+    // OPENVINO_THROW("Unimplemented!");
+    // TODO: rewrite this part; temporary solution that returns 30 registers of type vec with indices 50..79.
+    std::vector<snippets::Reg> regs(30);
+    for (size_t i = 0; i < regs.size(); ++i) {
+        regs[i] = {ov::snippets::RegType::vec, 50 + i};
+    }
+    return regs;
 }
 
 template <ngen::HW hw>

diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/jit_generator.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/jit_generator.hpp
@@ -19,6 +19,8 @@ class jit_snippet_t : public ngen::OpenCLCodeGenerator<hw> {
 public:
     jit_snippet_t()
         : ngen::OpenCLCodeGenerator<hw>(0, {GENERATOR_NAME, GENERATOR_LINE, false}) {};
+
+    NGEN_FORWARD_OPENCL(hw);
 };
 
 }  // namespace ov::intel_gpu::jit