From 5ff2078de139710f53a54325574956b9dea75d2a Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Wed, 30 Jul 2025 12:00:11 +0400 Subject: [PATCH 1/6] [Snippets][GPU] Template generator --- .../include/intel_gpu/primitives/subgraph.hpp | 25 +++- .../src/graph/impls/jit/CMakeLists.txt | 2 +- .../src/graph/impls/jit/gpu_generator.cpp | 117 ++++++++++++++++++ .../src/graph/impls/jit/gpu_generator.hpp | 61 +++++++++ .../src/graph/impls/jit/jit_generator.hpp | 42 +++++++ .../intel_gpu/src/plugin/ops/subgraph.cpp | 2 +- 6 files changed, 245 insertions(+), 4 deletions(-) create mode 100644 src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp create mode 100644 src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp create mode 100644 src/plugins/intel_gpu/src/graph/impls/jit/jit_generator.hpp diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/subgraph.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/subgraph.hpp index d4de1678709a35..c625984b20ce46 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/subgraph.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/subgraph.hpp @@ -6,6 +6,9 @@ #include "snippets/op/subgraph.hpp" #include "primitive.hpp" +#include "ocl/ocl_engine.hpp" +#include "jit/gpu_generator.hpp" + namespace cldnn { /// @brief Subgraph primitive @@ -19,8 +22,26 @@ struct subgraph : public primitive_base { /// @param id This primitive id /// @param inputs Input primitive ids /// @param subgraph Original subgraph node - subgraph(const primitive_id& id, const std::vector& inputs, const std::shared_ptr& subgraph) - : primitive_base(id, inputs), ov_subgraph(subgraph->clone()) {} + /// @param eng ocl engine + subgraph(const primitive_id& id, const std::vector& inputs, + const std::shared_ptr& subgraph, const cldnn::engine& eng) + : primitive_base(id, inputs), ov_subgraph(subgraph->clone()) { + ngen::HW hw; + switch (eng.get_device()->get_info().arch) { + case gpu_arch::gen9: hw = ngen::HW::Gen9; break; + 
case gpu_arch::gen11: hw = ngen::HW::Gen11; break; + case gpu_arch::xe_lp: hw = ngen::HW::XeLP; break; + case gpu_arch::xe_hp: hw = ngen::HW::XeHP; break; + case gpu_arch::xe_hpg: hw = ngen::HW::XeHPG; break; + case gpu_arch::xe_hpc: hw = ngen::HW::XeHPC; break; + case gpu_arch::xe2: hw = ngen::HW::Xe2; break; + case gpu_arch::xe3: hw = ngen::HW::Xe3; break; + case gpu_arch::unknown: hw = ngen::HW::Unknown; break; + default: + OPENVINO_THROW("Unexpected arch"); + } + ov_subgraph->set_generator(std::make_shared(hw)); + } std::shared_ptr ov_subgraph; diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/CMakeLists.txt b/src/plugins/intel_gpu/src/graph/impls/jit/CMakeLists.txt index db4ccd4c89c858..9f80ea10aafa1e 100644 --- a/src/plugins/intel_gpu/src/graph/impls/jit/CMakeLists.txt +++ b/src/plugins/intel_gpu/src/graph/impls/jit/CMakeLists.txt @@ -10,7 +10,7 @@ set(TARGET_NAME "openvino_intel_gpu_jit_obj") ov_gpu_add_backend_target( NAME ${TARGET_NAME} - LINK_LIBRARIES onednn_gpu_tgt + LINK_LIBRARIES onednn_gpu_tgt openvino::snippets ) ov_build_target_faster(${TARGET_NAME} PCH PCH_EXCLUDE detection_output.cpp) diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp new file mode 100644 index 00000000000000..a5a0c99f0fda7f --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp @@ -0,0 +1,117 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "gpu_generator.hpp" + +#include "snippets/runtime_configurator.hpp" + + +using namespace dnnl::impl::gpu::intel::jit; + +namespace ov::intel_gpu::jit { + +#define CREATE_SNIPPETS_EMITTER(e_type, ...) 
\ + {[this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ + return std::make_shared(h.get(), isa, expr, ##__VA_ARGS__); \ + }, \ + [](const std::shared_ptr& n) -> std::set> { \ + return e_type::get_supported_precisions(n); \ + }} + +GPUTargetMachine::GPUTargetMachine(dnnl::impl::gpu::intel::jit::gpu_gen_t hw) + : TargetMachine(std::make_shared(std::make_shared())), + m_hw(hw) { + // init generator by hw + switch (hw) { + case ngen::HW::Gen9: m_h = std::make_unique>(); break; + case ngen::HW::Gen11: m_h = std::make_unique>(); break; + case ngen::HW::Gen12LP: m_h = std::make_unique>(); break; + case ngen::HW::XeHP: m_h = std::make_unique>(); break; + case ngen::HW::XeHPG: m_h = std::make_unique>(); break; + case ngen::HW::XeHPC: m_h = std::make_unique>(); break; + case ngen::HW::Xe2: m_h = std::make_unique>(); break; + case ngen::HW::Xe3: m_h = std::make_unique>(); break; + default: + OPENVINO_THROW("Unknown GPU hardware!"); + } + OPENVINO_ASSERT(m_h, "Unitialized generator"); + + // data movement + //jitters[op::v0::Parameter::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); + //jitters[op::v0::Result::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); +} + +std::shared_ptr GPUTargetMachine::clone() const { + const auto cloned = std::make_shared(m_hw); + cloned->configurator = std::make_shared(*configurator); + return cloned; +} + +size_t GPUTargetMachine::get_lanes() const { + assert(m_h); + return m_h->getSIMD(); +} + +std::vector GPUTargetMachine::get_abi_arg_regs() const { + OPENVINO_THROW("Unimplemented!"); + return {}; +} + +std::vector GPUTargetMachine::get_gp_reg_pool() const { + OPENVINO_THROW("Unimplemented!"); + return {}; +} + +std::vector GPUTargetMachine::get_vec_reg_pool() const { + OPENVINO_THROW("Unimplemented!"); + return {}; +} + +dnnl::impl::gpu::intel::jit::gpu_gen_t GPUTargetMachine::get_hw() const { + return m_hw; +} + + +snippets::CompiledSnippetPtr GPUTargetMachine::get_snippet() { + 
// OPENVINO_ASSERT(h->create_kernel() == dnnl::impl::status::success, "Failed to create jit_kernel in get_snippet()"); + // const auto& result = + // std::make_shared(std::unique_ptr(h.release())); + // // Note that we reset all the generated code, since it was copied into CompiledSnippetGPU + // h = std::make_unique(); + // return result; + OPENVINO_THROW("Unimplemented!"); + return nullptr; +} + +CompiledSnippetGPU::CompiledSnippetGPU(std::unique_ptr h) + : h_compiled(std::move(h)) { + //OPENVINO_ASSERT(h_compiled && h_compiled->jit_ker(), "Got invalid jit generator or kernel was not compiled"); +} + +const uint8_t* CompiledSnippetGPU::get_code() const { + //return h_compiled->jit_ker(); + OPENVINO_THROW("Unimplemented!"); + return nullptr; +} + +size_t CompiledSnippetGPU::get_code_size() const { + OPENVINO_THROW("Unimplemented!"); +} + +bool CompiledSnippetGPU::empty() const { + return get_code_size() == 0; +} + +GPUGenerator::GPUGenerator(dnnl::impl::gpu::intel::jit::gpu_gen_t hw) + : Generator(std::make_shared(hw)) {} +GPUGenerator::GPUGenerator(const std::shared_ptr& target) : Generator(target) {} + +std::shared_ptr GPUGenerator::clone() const { + const auto& cpu_target_machine = std::dynamic_pointer_cast(target->clone()); + OPENVINO_ASSERT(cpu_target_machine, + "Failed to clone GPUGenerator: the instance contains incompatible TargetMachine type"); + return std::make_shared(cpu_target_machine); +} + +} // namespace ov::intel_gpu::jit diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp new file mode 100644 index 00000000000000..3576c38ef38457 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp @@ -0,0 +1,61 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include + +#include "jit_generator.hpp" +#include "gpu/intel/jit/generator.hpp" +#include
"openvino/core/node.hpp" +#include "openvino/core/node_output.hpp" +#include "snippets/emitter.hpp" +#include "snippets/generator.hpp" +#include "snippets/target_machine.hpp" + +namespace ov::intel_gpu::jit { + +class CompiledSnippetGPU : public snippets::CompiledSnippet { + const std::unique_ptr h_compiled; + +public: + [[nodiscard]] const uint8_t* get_code() const override; + [[nodiscard]] size_t get_code_size() const override; + [[nodiscard]] bool empty() const override; + explicit CompiledSnippetGPU(std::unique_ptr h); +}; + +class GPUTargetMachine : public ov::snippets::TargetMachine { +public: + explicit GPUTargetMachine(dnnl::impl::gpu::intel::jit::gpu_gen_t hw); + + [[nodiscard]] bool is_supported() const override { return true; } + [[nodiscard]] std::shared_ptr clone() const override; + + [[nodiscard]] size_t get_lanes() const override; + + [[nodiscard]] std::vector get_abi_arg_regs() const override; + [[nodiscard]] std::vector get_gp_reg_pool() const override; + [[nodiscard]] std::vector get_vec_reg_pool() const override; + + [[nodiscard]] dnnl::impl::gpu::intel::jit::gpu_gen_t get_hw() const; + + snippets::CompiledSnippetPtr get_snippet() override; + +private: + dnnl::impl::gpu::intel::jit::gpu_gen_t m_hw; + std::unique_ptr m_h; +}; + +class GPUGenerator : public ov::snippets::Generator { +public: + GPUGenerator(dnnl::impl::gpu::intel::jit::gpu_gen_t hw); + GPUGenerator(const std::shared_ptr& target); + std::shared_ptr clone() const override; +}; + +} // namespace ov::intel_gpu::jit diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/jit_generator.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/jit_generator.hpp new file mode 100644 index 00000000000000..1d179186d13b28 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/jit/jit_generator.hpp @@ -0,0 +1,42 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include + +#include "gpu/intel/jit/generator.hpp" + + 
+namespace ov::intel_gpu::jit { + +struct jit_snippet_base_t { + virtual ~jit_snippet_base_t() = default; + virtual const char *kernel_name() const = 0; + + virtual int getSIMD() const = 0; + virtual int getGRFCount() const = 0; +}; + +template +class jit_snippet_t : public dnnl::impl::gpu::intel::jit::ngen_code_generator_t, public jit_snippet_base_t { +public: + jit_snippet_t() + : dnnl::impl::gpu::intel::jit::ngen_code_generator_t(0, {GENERATOR_NAME, GENERATOR_LINE, false}) {}; + + const char *kernel_name() const override { + return dnnl::impl::gpu::intel::jit::ngen_code_generator_t::getExternalName().c_str(); + } + int getSIMD() const override { + return dnnl::impl::gpu::intel::jit::ngen_code_generator_t::getSIMD(); + }; + int getGRFCount() const override { + return dnnl::impl::gpu::intel::jit::ngen_code_generator_t::getGRFCount(); + } +}; + +} // namespace ov::intel_gpu::jit diff --git a/src/plugins/intel_gpu/src/plugin/ops/subgraph.cpp b/src/plugins/intel_gpu/src/plugin/ops/subgraph.cpp index 3cae322f8aedf9..3c66ec8834cc0b 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/subgraph.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/subgraph.cpp @@ -18,7 +18,7 @@ static void CreateSubgraphOp(ProgramBuilder& p, const std::shared_ptr Date: Thu, 31 Jul 2025 10:35:40 +0400 Subject: [PATCH 2/6] wip --- .../include/intel_gpu/primitives/subgraph.hpp | 22 +---- .../jit/emitters/jit_eltwise_emitters.hpp | 22 +++++ .../graph/impls/jit/emitters/jit_emitter.hpp | 37 +++++++ .../jit/emitters/jit_snippets_emitters.hpp | 22 +++++ .../src/graph/impls/jit/gpu_generator.cpp | 17 +++- .../src/graph/impls/jit/gpu_generator.hpp | 2 + .../src/graph/impls/jit/subgraph.cpp | 98 ++++++++++++------- .../intel_gpu/src/plugin/ops/subgraph.cpp | 2 +- 8 files changed, 163 insertions(+), 59 deletions(-) create mode 100644 src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.hpp create mode 100644 src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_emitter.hpp create mode 
100644 src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_snippets_emitters.hpp diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/subgraph.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/subgraph.hpp index c625984b20ce46..c17f10f86cc91e 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/subgraph.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/subgraph.hpp @@ -7,7 +7,6 @@ #include "primitive.hpp" #include "ocl/ocl_engine.hpp" -#include "jit/gpu_generator.hpp" namespace cldnn { @@ -22,26 +21,9 @@ struct subgraph : public primitive_base { /// @param id This primitive id /// @param inputs Input primitive ids /// @param subgraph Original subgraph node - /// @param eng ocl engine subgraph(const primitive_id& id, const std::vector& inputs, - const std::shared_ptr& subgraph, const cldnn::engine& eng) - : primitive_base(id, inputs), ov_subgraph(subgraph->clone()) { - ngen::HW hw; - switch (eng.get_device()->get_info().arch) { - case gpu_arch::gen9: hw = ngen::HW::Gen9; break; - case gpu_arch::gen11: hw = ngen::HW::Gen11; break; - case gpu_arch::xe_lp: hw = ngen::HW::XeLP; break; - case gpu_arch::xe_hp: hw = ngen::HW::XeHP; break; - case gpu_arch::xe_hpg: hw = ngen::HW::XeHPG; break; - case gpu_arch::xe_hpc: hw = ngen::HW::XeHPC; break; - case gpu_arch::xe2: hw = ngen::HW::Xe2; break; - case gpu_arch::xe3: hw = ngen::HW::Xe3; break; - case gpu_arch::unknown: hw = ngen::HW::Unknown; break; - default: - OPENVINO_THROW("Unexpected arch"); - } - ov_subgraph->set_generator(std::make_shared(hw)); - } + const std::shared_ptr& subgraph) + : primitive_base(id, inputs), ov_subgraph(subgraph->clone()) {} std::shared_ptr ov_subgraph; diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.hpp new file mode 100644 index 00000000000000..ab2b5729300737 --- /dev/null +++ 
b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.hpp @@ -0,0 +1,22 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "jit_emitter.hpp" + + +namespace ov::intel_gpu::jit { + +class jit_add_emitter : public jit_emitter { +public: + jit_add_emitter() = default; + + static std::set> get_supported_precisions( + [[maybe_unused]] const std::shared_ptr& node) { + return {{element::f32, element::f32}, {element::f16, element::f16}}; + } +}; + +} // namespace ov::intel_gpu::jit diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_emitter.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_emitter.hpp new file mode 100644 index 00000000000000..971ed5e540e52f --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_emitter.hpp @@ -0,0 +1,37 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "snippets/emitter.hpp" + +#include "openvino/core/type/element_type.hpp" +#include "openvino/core/node.hpp" + +namespace ov::intel_gpu::jit { + +class jit_emitter : public ov::snippets::Emitter { +public: + jit_emitter() = default; + + /** + * @brief Returns supported precisions. + * Precisions are ordered, the first bigger bitness precision with the same type will be selected. + * Empty collection means the emitter supports any input precisions. 
+ */ + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr) { + return {}; + } + +protected: + void emit_code_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr) const override { + OPENVINO_THROW("Unimplemented"); + } +}; + +} // namespace ov::intel_gpu::jit diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_snippets_emitters.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_snippets_emitters.hpp new file mode 100644 index 00000000000000..739afec517daad --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_snippets_emitters.hpp @@ -0,0 +1,22 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "jit_emitter.hpp" + + +namespace ov::intel_gpu::jit { + +class jit_nop_emitter : public jit_emitter { +public: + jit_nop_emitter() = default; + + static std::set> get_supported_precisions( + [[maybe_unused]] const std::shared_ptr& node) { + return {}; + } +}; + +} // namespace ov::intel_gpu::jit diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp index a5a0c99f0fda7f..6beb9d8d55bd54 100644 --- a/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp @@ -5,6 +5,10 @@ #include "gpu_generator.hpp" #include "snippets/runtime_configurator.hpp" +#include "emitters/jit_eltwise_emitters.hpp" +#include "emitters/jit_snippets_emitters.hpp" + +#include "openvino/op/add.hpp" using namespace dnnl::impl::gpu::intel::jit; @@ -13,7 +17,8 @@ namespace ov::intel_gpu::jit { #define CREATE_SNIPPETS_EMITTER(e_type, ...) 
\ {[this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ - return std::make_shared(h.get(), isa, expr, ##__VA_ARGS__); \ + /*return std::make_shared(h.get(), isa, expr, ##__VA_ARGS__);*/ \ + return std::make_shared(##__VA_ARGS__); \ }, \ [](const std::shared_ptr& n) -> std::set> { \ return e_type::get_supported_precisions(n); \ @@ -37,9 +42,9 @@ GPUTargetMachine::GPUTargetMachine(dnnl::impl::gpu::intel::jit::gpu_gen_t hw) } OPENVINO_ASSERT(m_h, "Unitialized generator"); - // data movement - //jitters[op::v0::Parameter::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); - //jitters[op::v0::Result::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); + jitters[op::v0::Parameter::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); + jitters[op::v0::Result::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); + jitters[op::v1::Add::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_add_emitter); } std::shared_ptr GPUTargetMachine::clone() const { @@ -114,4 +119,8 @@ std::shared_ptr GPUGenerator::clone() const { return std::make_shared(cpu_target_machine); } +ov::snippets::RegType GPUGenerator::get_specific_op_out_reg_type(const ov::Output& out) const { + return ov::snippets::RegType::undefined; +} + } // namespace ov::intel_gpu::jit diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp index 3576c38ef38457..4dc896117c9e7b 100644 --- a/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp @@ -56,6 +56,8 @@ class GPUGenerator : public ov::snippets::Generator { GPUGenerator(dnnl::impl::gpu::intel::jit::gpu_gen_t hw); GPUGenerator(const std::shared_ptr& target); std::shared_ptr clone() const override; + + ov::snippets::RegType get_specific_op_out_reg_type(const ov::Output& out) const override; }; } // namespace ov::intel_gpu::jit diff 
--git a/src/plugins/intel_gpu/src/graph/impls/jit/subgraph.cpp b/src/plugins/intel_gpu/src/graph/impls/jit/subgraph.cpp index f20749c255c5be..1c6854a470953c 100644 --- a/src/plugins/intel_gpu/src/graph/impls/jit/subgraph.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/jit/subgraph.cpp @@ -13,6 +13,10 @@ #include "intel_gpu/graph/serialization/binary_buffer.hpp" +#include "snippets/lowered/pass/optimize_domain.hpp" +#include "snippets/utils/utils.hpp" +#include "gpu_generator.hpp" + #include namespace ov::intel_gpu::jit { @@ -102,41 +106,26 @@ class VectorScaleKernelGenerator : public OpenCLCodeGenerator class SubgraphImpl : public primitive_impl { using primitive_impl::primitive_impl; -public: - explicit SubgraphImpl(const program_node& /*node*/, const kernel_impl_params& impl_params) - : primitive_impl("jit::subgraph") { - const auto& engine = downcast(impl_params.get_program().get_engine()); - - HW hw = VectorScaleKernelGenerator::detectHW(engine.get_cl_context().get(), engine.get_cl_device().get()); - const char *gpuString = "unknown"; - - switch (hw) { - case HW::Gen9: gpuString = "Gen9"; break; - case HW::Gen11: gpuString = "Gen11"; break; - case HW::Gen12LP: gpuString = "Gen12LP"; break; - case HW::XeHP: gpuString = "XeHP"; break; - case HW::XeHPG: gpuString = "XeHPG"; break; - case HW::XeHPC: gpuString = "XeHPC"; break; - case HW::Xe2: gpuString = "Xe2"; break; - case HW::Xe3: gpuString = "Xe3"; break; - default: OPENVINO_THROW("[GPU] Unsupported architecture"); - } - - std::cout << "GPU arch: " << gpuString << "\n"; - - // Create appropriate kernel generator object for the detected HW, and get a cl_kernel. 
- // switch (hw) { - // case HW::Gen9: VectorScaleKernelGenerator().getKernel(engine.get_cl_context().get(), engine.get_cl_device().get()); - // case HW::Gen11: VectorScaleKernelGenerator().getKernel(engine.get_cl_context().get(), engine.get_cl_device().get()); - // case HW::Gen12LP: VectorScaleKernelGenerator().getKernel(engine.get_cl_context().get(), engine.get_cl_device().get()); - // case HW::XeHP: VectorScaleKernelGenerator().getKernel(engine.get_cl_context().get(), engine.get_cl_device().get()); - // case HW::XeHPG: VectorScaleKernelGenerator().getKernel(engine.get_cl_context().get(), engine.get_cl_device().get()); - // case HW::XeHPC: VectorScaleKernelGenerator().getKernel(engine.get_cl_context().get(), engine.get_cl_device().get()); - // case HW::Xe2: VectorScaleKernelGenerator().getKernel(engine.get_cl_context().get(), engine.get_cl_device().get()); - // case HW::Xe3: VectorScaleKernelGenerator().getKernel(engine.get_cl_context().get(), engine.get_cl_device().get()); - // default: OPENVINO_THROW("[GPU] Unsupported architecture");; - // } + std::shared_ptr m_subgraph {nullptr}; +public: + explicit SubgraphImpl(const program_node& node, const kernel_impl_params& impl_params) + : primitive_impl("jit::subgraph"), m_subgraph(node.as().get_primitive()->ov_subgraph->clone()) { + m_subgraph->set_generator( + std::make_shared(ngenHW2pluginHW(impl_params.get_device_info().arch))); + + const auto in_blocked_shapes = getSnippetsBlockedShapes(impl_params); + const auto precisions = getIOPrecisions(impl_params); + m_subgraph->data_flow_transformations(in_blocked_shapes, precisions.first, precisions.second, {}); + + const auto control_flow_config = std::make_shared(); + control_flow_config->disable(); + m_subgraph->set_tile_rank(1UL); + + m_subgraph->control_flow_transformations(0, // unused + 256, // unused + std::make_shared(), + control_flow_config); } SubgraphImpl() : primitive_impl() {} @@ -159,6 +148,47 @@ class SubgraphImpl : public primitive_impl { } void 
update(primitive_inst& inst, const kernel_impl_params& impl_param) override { } + +private: + static ngen::HW ngenHW2pluginHW(gpu_arch arch) { + switch (arch) { + case gpu_arch::gen9: return ngen::HW::Gen9; + case gpu_arch::gen11: return ngen::HW::Gen11; + case gpu_arch::xe_lp: return ngen::HW::XeLP; + case gpu_arch::xe_hp: return ngen::HW::XeHP; + case gpu_arch::xe_hpg: return ngen::HW::XeHPG; + case gpu_arch::xe_hpc: return ngen::HW::XeHPC; + case gpu_arch::xe2: return ngen::HW::Xe2; + case gpu_arch::xe3: return ngen::HW::Xe3; + case gpu_arch::unknown: return ngen::HW::Unknown; + default: + OPENVINO_THROW("Unexpected arch"); + } + } + + static ov::snippets::op::Subgraph::BlockedShapeVector getSnippetsBlockedShapes(const kernel_impl_params& impl_params) { + ov::snippets::op::Subgraph::BlockedShapeVector in_blocked_shapes(impl_params.input_layouts.size()); + for (size_t i = 0; i < in_blocked_shapes.size(); i++) { + // support only planar shapes + const auto blocked_dims = ov::snippets::utils::pshape_to_vdims(impl_params.input_layouts[i].get_partial_shape()); + const auto blocked_layout = ov::snippets::utils::get_planar_layout(blocked_dims.size()); + in_blocked_shapes[i] = {blocked_dims, blocked_layout}; + } + return in_blocked_shapes; + } + + static std::pair, std::vector> getIOPrecisions(const kernel_impl_params& impl_params) { + std::pair, std::vector> prc; + prc.first.reserve(impl_params.input_layouts.size()); + prc.second.reserve(impl_params.output_layouts.size()); + for (const auto& in : impl_params.input_layouts) { + prc.first.push_back(in.data_type); + } + for (const auto& out : impl_params.output_layouts) { + prc.second.push_back(out.data_type); + } + return prc; + } }; std::unique_ptr Subgraph::create_impl(const program_node& node, const RuntimeParams& params) const { diff --git a/src/plugins/intel_gpu/src/plugin/ops/subgraph.cpp b/src/plugins/intel_gpu/src/plugin/ops/subgraph.cpp index 3c66ec8834cc0b..3cae322f8aedf9 100644 --- 
a/src/plugins/intel_gpu/src/plugin/ops/subgraph.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/subgraph.cpp @@ -18,7 +18,7 @@ static void CreateSubgraphOp(ProgramBuilder& p, const std::shared_ptr Date: Fri, 1 Aug 2025 18:55:55 +0400 Subject: [PATCH 3/6] [Snippets][GPU] SetSingleKernelWorkAmount --- .../src/graph/impls/jit/jit_generator.hpp | 2 +- .../src/graph/impls/jit/subgraph.cpp | 105 ++++-------------- .../lowered/set_single_kernel_work_amount.cpp | 90 +++++++++++++++ .../lowered/set_single_kernel_work_amount.hpp | 29 +++++ 4 files changed, 142 insertions(+), 84 deletions(-) create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/snippets/lowered/set_single_kernel_work_amount.cpp create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/snippets/lowered/set_single_kernel_work_amount.hpp diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/jit_generator.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/jit_generator.hpp index 1d179186d13b28..ae8c034ab9d57a 100644 --- a/src/plugins/intel_gpu/src/graph/impls/jit/jit_generator.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/jit/jit_generator.hpp @@ -33,7 +33,7 @@ class jit_snippet_t : public dnnl::impl::gpu::intel::jit::ngen_code_generator_t< } int getSIMD() const override { return dnnl::impl::gpu::intel::jit::ngen_code_generator_t::getSIMD(); - }; + } int getGRFCount() const override { return dnnl::impl::gpu::intel::jit::ngen_code_generator_t::getGRFCount(); } diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/subgraph.cpp b/src/plugins/intel_gpu/src/graph/impls/jit/subgraph.cpp index 1c6854a470953c..1d81d445a98653 100644 --- a/src/plugins/intel_gpu/src/graph/impls/jit/subgraph.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/jit/subgraph.cpp @@ -12,6 +12,7 @@ #include "runtime/ocl/ocl_engine.hpp" #include "intel_gpu/graph/serialization/binary_buffer.hpp" +#include "plugin/transformations/snippets/lowered/set_single_kernel_work_amount.hpp" #include 
"snippets/lowered/pass/optimize_domain.hpp" #include "snippets/utils/utils.hpp" @@ -22,90 +23,12 @@ namespace ov::intel_gpu::jit { using namespace dnnl::impl::gpu::intel::jit; using namespace ngen; - -template -class VectorScaleKernelGenerator : public OpenCLCodeGenerator -{ -protected: - NGEN_FORWARD_OPENCL(hw); - -public: - VectorScaleKernelGenerator() : OpenCLCodeGenerator() - { - // Define kernel interface for OpenCL. - newArgument("buffer", ExternalArgumentType::GlobalPtr); - newArgument("alpha", DataType::f); - requireLocalID(1); - requireLocalSize(); - requireSIMD((GRF::bytes(hw) == 64) ? 16 : 8); - externalName("vector_scale"); - - finalizeInterface(); - - auto bufferSurface = Surface(getArgumentSurfaceIfExists("buffer")); // Surface # for buffer. - auto bufferPtr = getArgument("buffer"); // A64 pointer for buffer. - auto alpha = getArgument("alpha"); - - auto localSize = getLocalSize(0).uw(); - auto localID = getLocalID(0); // Vector of local IDs. - auto groupID = r0.ud(1); // Thread group (a.k.a. workgroup) IDs are in r0.ud(1) (X) r0.ud(6) (Y) r0.ud(7) (Z) - - // Local variables. - auto globalID = r12.ud(0); - auto header = r13; - auto data = r14; - auto temp = r15; - - // Decide on load/store messages. - bool useLSC = (hw >= HW::XeHPC); - - // All instructions use W (NoMask) by default. - setDefaultNoMask(); - - // Enable automatic SWSB for Gen12. - setDefaultAutoSWSB(); - - // Prologue for ATS+. - prologue(); - - // Enable IEEE denormals. - or_(1 | Switch, cr0[0], cr0[0], 0x4C0); - - // Calculate global ID = (group ID) * (local size) + (local ID for lane 0). - mul(1, globalID, groupID, localSize); - add(1, globalID, globalID, localID[0]); - - // Do 32 byte (2 OWord) block read at offset (global ID) * sizeof(float). 
- if (!useLSC) { - shr(1, header[2], globalID, 2); - load(8, data, block_oword(2), bufferSurface, header); - } else { - shl(1, globalID, globalID, 2); - addc(1, header.ud(0), bufferPtr.ud(0), globalID); - mov(1, temp.ud(0), acc0.ud(0)); - add(1, header.ud(1), bufferPtr.ud(1), temp.ud(0)); - load(1, data, D32 | V8T, A64, header); - } - - // Scale data. - mul(8, data, data, alpha); - - // Store updated data. - if (!useLSC) - store(8, block_oword(2), bufferSurface, header, data); - else - store(1, D32 | V8T, A64, header, data); - - // End thread. Must move r0 to one of r112-r127, then call threadend. - mov(8, r127, r0); - threadend(r127); - } -}; - - class SubgraphImpl : public primitive_impl { using primitive_impl::primitive_impl; + using DataFlowPasses = std::vector; + using ControlFlowPasses = std::vector; + std::shared_ptr m_subgraph {nullptr}; public: @@ -116,7 +39,7 @@ class SubgraphImpl : public primitive_impl { const auto in_blocked_shapes = getSnippetsBlockedShapes(impl_params); const auto precisions = getIOPrecisions(impl_params); - m_subgraph->data_flow_transformations(in_blocked_shapes, precisions.first, precisions.second, {}); + m_subgraph->data_flow_transformations(in_blocked_shapes, precisions.first, precisions.second); const auto control_flow_config = std::make_shared(); control_flow_config->disable(); @@ -125,8 +48,24 @@ class SubgraphImpl : public primitive_impl { m_subgraph->control_flow_transformations(0, // unused 256, // unused std::make_shared(), - control_flow_config); + control_flow_config, + getControlFlowPasses()); } + + ControlFlowPasses getControlFlowPasses() const { + using PassPosition = ov::snippets::pass::PassPosition; + using Place = PassPosition::Place; + + ControlFlowPasses backend_passes; +#define SNIPPETS_REGISTER_PASS_ABSOLUTE(PASS_PLACE, PASS, ...) 
\ + backend_passes.emplace_back(PassPosition(PASS_PLACE), std::make_shared(__VA_ARGS__)) + + + SNIPPETS_REGISTER_PASS_ABSOLUTE(Place::PipelineStart, + ov::intel_gpu::pass::SetSingleKernelWorkAmount); +#undef SNIPPETS_REGISTER_PASS_ABSOLUTE + return backend_passes; + } SubgraphImpl() : primitive_impl() {} diff --git a/src/plugins/intel_gpu/src/plugin/transformations/snippets/lowered/set_single_kernel_work_amount.cpp b/src/plugins/intel_gpu/src/plugin/transformations/snippets/lowered/set_single_kernel_work_amount.cpp new file mode 100644 index 00000000000000..0ca914a8c9fa39 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/snippets/lowered/set_single_kernel_work_amount.cpp @@ -0,0 +1,90 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "set_single_kernel_work_amount.hpp" + +#include +#include +#include +#include +#include + +#include "openvino/core/except.hpp" +#include "openvino/core/type.hpp" +#include "snippets/lowered/expression.hpp" +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/pass/pass.hpp" +#include "snippets/op/rank_normalization.hpp" +#include "snippets/shape_types.hpp" +#include "snippets/utils/utils.hpp" + +namespace ov::intel_gpu::pass { + +bool SetSingleKernelWorkAmount::run(snippets::lowered::LinearIR& linear_ir) { + // GPU Plugin requires 1D tile + linear_ir.set_loop_depth(1); + + const auto& config = linear_ir.get_config(); + if (linear_ir.empty()) { + return false; + } + + if (!config.m_enable_domain_optimization) { + // Unsupported + return false; + } + + if (linear_ir.is_dynamic()) { + // [134873] In dynamic case we need to implement own shape inference in runtime configurator + return false; + } + + auto master_shape = linear_ir.get_master_shape(); + if (master_shape.back() == 1) { + // Already single work amount + return false; + } + + auto CollapseDims = [](ov::snippets::VectorDims& dims) { + OPENVINO_ASSERT(dims.size() >= 2, "CollapseDims can't 
process shape with less than two dims"); + const auto full_wa_idx = dims.size() - 2; + dims[full_wa_idx] *= dims[dims.size() - 1]; + dims[dims.size() - 1] = 1; + for (size_t i = 0; i < full_wa_idx; i++) { + dims[full_wa_idx] *= dims[i]; + dims[i] = 1; + } + }; + + const auto& params = linear_ir.get_parameters(); + std::vector input_shapes; + for (const auto& param : params) { + const auto& desc = param->get_output_port_descriptor(0); + OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(desc->get_layout()), + "SetSingleKernelWorkAmount supports only planar layout on inputs"); + auto shape = desc->get_shape(); + OPENVINO_ASSERT(std::none_of(shape.begin(), + shape.end(), + [](size_t d) { + return ov::snippets::utils::is_dynamic_value(d); + }), + "SetSingleKernelWorkAmount pass does not support dynamic shapes"); + OPENVINO_ASSERT(shape == params.front()->get_output_port_descriptor(0)->get_shape(), + "SetSingleKernelWorkAmount pass supports only similar shapes on input"); + CollapseDims(shape); + input_shapes.emplace_back(shape); + } + + std::vector infer_shapes; + infer_shapes.reserve(input_shapes.size()); + for (const auto& is : input_shapes) { + infer_shapes.emplace_back(is); + } + // Need to propagate updated shapes through LIR + linear_ir.shape_infer(infer_shapes); + + return true; +} + +} // namespace ov::intel_gpu::pass diff --git a/src/plugins/intel_gpu/src/plugin/transformations/snippets/lowered/set_single_kernel_work_amount.hpp b/src/plugins/intel_gpu/src/plugin/transformations/snippets/lowered/set_single_kernel_work_amount.hpp new file mode 100644 index 00000000000000..2ae80a9db2971a --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/snippets/lowered/set_single_kernel_work_amount.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/pass/pass.hpp" +#include 
"snippets/shape_types.hpp" + +namespace ov::intel_gpu::pass { + +/** + * @interface SetSingleKernelWorkAmount + * @brief TODO + * @ingroup snippets + */ + +class SetSingleKernelWorkAmount : public ov::snippets::lowered::pass::Pass { +public: + OPENVINO_RTTI("SetSingleKernelWorkAmount", "", Pass) + explicit SetSingleKernelWorkAmount() = default; + bool run(ov::snippets::lowered::LinearIR& linear_ir) override; +}; + +} // namespace ov::intel_gpu::pass From 025503f0823660e4fff9e5c9aeec725966b2387f Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Mon, 4 Aug 2025 12:25:01 +0400 Subject: [PATCH 4/6] [GPU][Snippets] Added snippets Add tests and snippets_mode (doesn't work) --- .../intel_gpu/runtime/internal_properties.hpp | 38 +++++++++++++++++++ .../include/intel_gpu/runtime/options.inl | 1 + .../src/plugin/transformations_pipeline.cpp | 4 +- .../intel_gpu/tests/functional/CMakeLists.txt | 2 + .../shared_tests_instances/snippets/add.cpp | 33 ++++++++++++++++ .../plugin/shared/src/snippets/add.cpp | 2 +- 6 files changed, 76 insertions(+), 4 deletions(-) create mode 100644 src/plugins/intel_gpu/tests/functional/shared_tests_instances/snippets/add.cpp diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp index 13c892b69814b0..1e8a0ea542034e 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp @@ -113,6 +113,43 @@ inline std::istream& operator>>(std::istream& is, DumpTensors& val) { return is; } +/** + * @brief Enum to define possible snippets mode hints. 
+ */ +enum class SnippetsMode : uint8_t { + ENABLE = 0, //!< Enable + IGNORE_CALLBACK = 1, //!< Ignore callback + DISABLE = 2, //!< Disable +}; + +inline std::ostream& operator<<(std::ostream& os, const SnippetsMode& mode) { + switch (mode) { + case SnippetsMode::ENABLE: + return os << "ENABLE"; + case SnippetsMode::IGNORE_CALLBACK: + return os << "IGNORE_CALLBACK"; + case SnippetsMode::DISABLE: + return os << "DISABLE"; + default: + OPENVINO_THROW("Unsupported snippets mode value"); + } +} + +inline std::istream& operator>>(std::istream& is, SnippetsMode& mode) { + std::string str; + is >> str; + if (str == "ENABLE") { + mode = SnippetsMode::ENABLE; + } else if (str == "IGNORE_CALLBACK") { + mode = SnippetsMode::IGNORE_CALLBACK; + } else if (str == "DISABLE") { + mode = SnippetsMode::DISABLE; + } else { + OPENVINO_THROW("Unsupported snippets mode: ", str); + } + return is; +} + /** * @brief Defines queue type that must be used for model execution */ @@ -168,6 +205,7 @@ static constexpr Property static constexpr Property, ov::PropertyMutability::RW> load_dump_raw_binary{"GPU_LOAD_DUMP_RAW_BINARY"}; static constexpr Property could_use_flashattn_v2{"GPU_COULD_USE_FLASHATTN_V2"}; static constexpr Property dynamic_quantization_group_size_max{"GPU_DYNAMIC_QUANTIZATION_GROUP_SIZE_MAX"}; +static constexpr Property snippets_mode{"SNIPPETS_MODE"}; } // namespace ov::intel_gpu namespace cldnn { diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl b/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl index e2eeef0569a009..24a2151ff83de1 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl @@ -52,6 +52,7 @@ OV_CONFIG_RELEASE_INTERNAL_OPTION(ov::intel_gpu, impls_cache_capacity, 300, "Con OV_CONFIG_RELEASE_INTERNAL_OPTION(ov::intel_gpu, asym_dynamic_quantization, false, "Enforce asymmetric mode for dynamically quantized activations") 
OV_CONFIG_RELEASE_INTERNAL_OPTION(ov::intel_gpu, could_use_flashattn_v2, true, "Enable/Disable SDPA primitive executing with FlashAttenV2 online softmax tricks.") OV_CONFIG_RELEASE_INTERNAL_OPTION(ov::intel_gpu, dynamic_quantization_threshold, 64, "Apply dynamic quantization only when batch size is larger than this value in OneDNN") +OV_CONFIG_RELEASE_INTERNAL_OPTION(ov::intel_gpu, snippets_mode, ov::intel_gpu::SnippetsMode::DISABLE, "Define tokenization mode for Snippets.") OV_CONFIG_DEBUG_GLOBAL_OPTION(ov::intel_gpu, help, false, "Print help message for all config options") OV_CONFIG_DEBUG_GLOBAL_OPTION(ov::intel_gpu, verbose, 0, "Enable logging for debugging purposes. The higher value the more verbose output. 0 - Disabled, 4 - Maximum verbosity") diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 1bedda8fccfd16..670cba9dd0511b 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -1296,8 +1296,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { } // TODO: Move to the right place - { - //ov::serialize(func, "pre_snippets.xml"); + if (config.get_snippets_mode() != ov::intel_gpu::SnippetsMode::DISABLE) { ov::pass::Manager manager("GPU:Snippets"); manager.set_per_pass_validation(false); @@ -1333,7 +1332,6 @@ void TransformationsPipeline::apply(std::shared_ptr func) { }); manager.run_passes(func); - //ov::serialize(func, "post_snippets.xml"); } } } // namespace ov::intel_gpu diff --git a/src/plugins/intel_gpu/tests/functional/CMakeLists.txt b/src/plugins/intel_gpu/tests/functional/CMakeLists.txt index c89083a026aed6..33d2c64fc34ba6 100644 --- a/src/plugins/intel_gpu/tests/functional/CMakeLists.txt +++ b/src/plugins/intel_gpu/tests/functional/CMakeLists.txt @@ -34,6 +34,8 @@ ov_add_test_target( funcSharedTests OpenCL::NewHeaders # should come before OpenCL::OpenCL 
OpenCL::OpenCL + openvino::snippets + ov_snippets_models ADD_CPPLINT LABELS OV GPU diff --git a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/snippets/add.cpp b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/snippets/add.cpp new file mode 100644 index 00000000000000..2f0d791770a27c --- /dev/null +++ b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/snippets/add.cpp @@ -0,0 +1,33 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/add.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { + + +namespace { +// ===================================Add=========================================================// +// These inputs are needed to test static Loop optimizations (emit the whole tile, body with increments, set WA etc) +std::vector inShapesStatic1{{{}, {{128, 256, 512}}}}; +std::vector inShapesStatic2{{{}, {{128, 256, 512}}}}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, Add, + ::testing::Combine( + ::testing::ValuesIn(inShapesStatic1), + ::testing::ValuesIn(inShapesStatic2), + ::testing::ValuesIn({ov::element::f16}), + ::testing::Values(1), // Add + ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts + ::testing::Values(ov::test::utils::DEVICE_GPU)), + Add::getTestCaseName); + + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/src/snippets/add.cpp b/src/tests/functional/plugin/shared/src/snippets/add.cpp index 4c10b6cb2ad07b..e72cb5e4b4ffdc 100644 --- a/src/tests/functional/plugin/shared/src/snippets/add.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/add.cpp @@ -44,7 +44,7 @@ void Add::SetUp() { auto f = ov::test::snippets::AddFunction(inputDynamicShapes); function = f.getOriginal(); setInferenceType(type); - setIgnoreCallbackMode(); + 
//setIgnoreCallbackMode(); } std::string AddConst::getTestCaseName(testing::TestParamInfo obj) { From cb0e8ae08f1ee6992e6d4287806f36e2231f7089 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Mon, 4 Aug 2025 15:13:04 +0400 Subject: [PATCH 5/6] [GPU][Snippets] Template target machine and emitters --- .../jit/emitters/jit_eltwise_emitters.hpp | 6 +- .../graph/impls/jit/emitters/jit_emitter.hpp | 11 ++- .../jit/emitters/jit_snippets_emitters.hpp | 6 +- .../src/graph/impls/jit/gpu_generator.cpp | 88 ++++++++++--------- .../src/graph/impls/jit/gpu_generator.hpp | 17 ++-- .../src/graph/impls/jit/jit_generator.hpp | 24 +---- 6 files changed, 76 insertions(+), 76 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.hpp index ab2b5729300737..d64bdd72062d0d 100644 --- a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.hpp @@ -9,9 +9,11 @@ namespace ov::intel_gpu::jit { -class jit_add_emitter : public jit_emitter { +template +class jit_add_emitter : public jit_emitter { public: - jit_add_emitter() = default; + jit_add_emitter(dnnl::impl::gpu::intel::jit::ngen_code_generator_t* host, + ov::element::Type exec_prc = ov::element::f32) : jit_emitter(host, exec_prc) {}; static std::set> get_supported_precisions( [[maybe_unused]] const std::shared_ptr& node) { diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_emitter.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_emitter.hpp index 971ed5e540e52f..7c3b844bbc8195 100644 --- a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_emitter.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_emitter.hpp @@ -6,14 +6,20 @@ #include "snippets/emitter.hpp" +#include "graph/impls/jit/jit_generator.hpp" + #include "openvino/core/type/element_type.hpp" #include 
"openvino/core/node.hpp" namespace ov::intel_gpu::jit { +template class jit_emitter : public ov::snippets::Emitter { public: - jit_emitter() = default; + jit_emitter(dnnl::impl::gpu::intel::jit::ngen_code_generator_t* host, + ov::element::Type exec_prc = ov::element::f32) : + m_h(host), + m_exec_prc(exec_prc) {} /** * @brief Returns supported precisions. @@ -32,6 +38,9 @@ class jit_emitter : public ov::snippets::Emitter { const std::vector& gpr) const override { OPENVINO_THROW("Unimplemented"); } + + dnnl::impl::gpu::intel::jit::ngen_code_generator_t* m_h; + ov::element::Type m_exec_prc; }; } // namespace ov::intel_gpu::jit diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_snippets_emitters.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_snippets_emitters.hpp index 739afec517daad..2d2b6f4383579a 100644 --- a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_snippets_emitters.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_snippets_emitters.hpp @@ -9,9 +9,11 @@ namespace ov::intel_gpu::jit { -class jit_nop_emitter : public jit_emitter { +template +class jit_nop_emitter : public jit_emitter { public: - jit_nop_emitter() = default; + jit_nop_emitter(dnnl::impl::gpu::intel::jit::ngen_code_generator_t* host, + ov::element::Type exec_prc = ov::element::f32) : jit_emitter(host, exec_prc) {}; static std::set> get_supported_precisions( [[maybe_unused]] const std::shared_ptr& node) { diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp index 6beb9d8d55bd54..c1dec415d47a05 100644 --- a/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp @@ -15,70 +15,61 @@ using namespace dnnl::impl::gpu::intel::jit; namespace ov::intel_gpu::jit { -#define CREATE_SNIPPETS_EMITTER(e_type, ...) \ +#define CREATE_SNIPPETS_EMITTER(e_type, ...) 
\ {[this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ - /*return std::make_shared(h.get(), isa, expr, ##__VA_ARGS__);*/ \ - return std::make_shared(##__VA_ARGS__); \ + return std::make_shared>(m_h.get(), ##__VA_ARGS__); \ }, \ [](const std::shared_ptr& n) -> std::set> { \ - return e_type::get_supported_precisions(n); \ + return e_type::get_supported_precisions(n); \ }} -GPUTargetMachine::GPUTargetMachine(dnnl::impl::gpu::intel::jit::gpu_gen_t hw) +template +GPUTargetMachine::GPUTargetMachine() : TargetMachine(std::make_shared(std::make_shared())), - m_hw(hw) { - // init generator by hw - switch (hw) { - case ngen::HW::Gen9: m_h = std::make_unique>(); break; - case ngen::HW::Gen11: m_h = std::make_unique>(); break; - case ngen::HW::Gen12LP: m_h = std::make_unique>(); break; - case ngen::HW::XeHP: m_h = std::make_unique>(); break; - case ngen::HW::XeHPG: m_h = std::make_unique>(); break; - case ngen::HW::XeHPC: m_h = std::make_unique>(); break; - case ngen::HW::Xe2: m_h = std::make_unique>(); break; - case ngen::HW::Xe3: m_h = std::make_unique>(); break; - default: - OPENVINO_THROW("Unknown GPU hardware!"); - } - OPENVINO_ASSERT(m_h, "Unitialized generator"); - + m_h(std::make_unique>()) { jitters[op::v0::Parameter::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); jitters[op::v0::Result::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); jitters[op::v1::Add::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_add_emitter); } -std::shared_ptr GPUTargetMachine::clone() const { - const auto cloned = std::make_shared(m_hw); +template +std::shared_ptr GPUTargetMachine::clone() const { + const auto cloned = std::make_shared>(); cloned->configurator = std::make_shared(*configurator); return cloned; } -size_t GPUTargetMachine::get_lanes() const { +template +size_t GPUTargetMachine::get_lanes() const { assert(m_h); return m_h->getSIMD(); } -std::vector GPUTargetMachine::get_abi_arg_regs() const { +template 
+std::vector GPUTargetMachine::get_abi_arg_regs() const { OPENVINO_THROW("Unimplemented!"); return {}; } -std::vector GPUTargetMachine::get_gp_reg_pool() const { +template +std::vector GPUTargetMachine::get_gp_reg_pool() const { OPENVINO_THROW("Unimplemented!"); return {}; } -std::vector GPUTargetMachine::get_vec_reg_pool() const { +template +std::vector GPUTargetMachine::get_vec_reg_pool() const { OPENVINO_THROW("Unimplemented!"); return {}; } -dnnl::impl::gpu::intel::jit::gpu_gen_t GPUTargetMachine::get_hw() const { - return m_hw; +template +ngen::HW GPUTargetMachine::get_hw() const { + return hw; } - -snippets::CompiledSnippetPtr GPUTargetMachine::get_snippet() { +template +snippets::CompiledSnippetPtr GPUTargetMachine::get_snippet() { // OPENVINO_ASSERT(h->create_kernel() == dnnl::impl::status::success, "Failed to create jit_kernel in get_snippet()"); // const auto& result = // std::make_shared(std::unique_ptr(h.release())); @@ -89,11 +80,6 @@ snippets::CompiledSnippetPtr GPUTargetMachine::get_snippet() { return nullptr; } -CompiledSnippetGPU::CompiledSnippetGPU(std::unique_ptr h) - : h_compiled(std::move(h)) { - //OPENVINO_ASSERT(h_compiled && h_compiled->jit_ker(), "Got invalid jit generator or kernel was nopt compiled"); -} - const uint8_t* CompiledSnippetGPU::get_code() const { //return h_compiled->jit_ker(); OPENVINO_THROW("Unimplemented!"); @@ -108,19 +94,35 @@ bool CompiledSnippetGPU::empty() const { return get_code_size() == 0; } -GPUGenerator::GPUGenerator(dnnl::impl::gpu::intel::jit::gpu_gen_t hw) - : Generator(std::make_shared(hw)) {} -GPUGenerator::GPUGenerator(const std::shared_ptr& target) : Generator(target) {} +GPUGenerator::GPUGenerator(ngen::HW hw) + : Generator(create_target_machine(hw)) {} + +template +GPUGenerator::GPUGenerator(const std::shared_ptr>& target) : Generator(target) {} std::shared_ptr GPUGenerator::clone() const { - const auto& cpu_target_machine = std::dynamic_pointer_cast(target->clone()); - 
OPENVINO_ASSERT(cpu_target_machine, - "Failed to clone GPUGenerator: the instance contains incompatible TargetMachine type"); - return std::make_shared(cpu_target_machine); + //const auto hw = target->get_hw(); + //return std::make_shared(target->clone()); + OPENVINO_THROW("Unimplemented!"); } ov::snippets::RegType GPUGenerator::get_specific_op_out_reg_type(const ov::Output& out) const { return ov::snippets::RegType::undefined; } +std::shared_ptr GPUGenerator::create_target_machine(ngen::HW hw) { + switch (hw) { + case ngen::HW::Gen9: return std::make_unique>(); + case ngen::HW::Gen11: return std::make_unique>(); + case ngen::HW::Gen12LP: return std::make_unique>(); + case ngen::HW::XeHP: return std::make_unique>(); + case ngen::HW::XeHPG: return std::make_unique>(); + case ngen::HW::XeHPC: return std::make_unique>(); + case ngen::HW::Xe2: return std::make_unique>(); + case ngen::HW::Xe3: return std::make_unique>(); + default: + OPENVINO_THROW("Unknown GPU hardware!"); + } +} + } // namespace ov::intel_gpu::jit diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp index 4dc896117c9e7b..377c76183fa910 100644 --- a/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp @@ -20,18 +20,17 @@ namespace ov::intel_gpu::jit { class CompiledSnippetGPU : public snippets::CompiledSnippet { - const std::unique_ptr h_compiled; - public: [[nodiscard]] const uint8_t* get_code() const override; [[nodiscard]] size_t get_code_size() const override; [[nodiscard]] bool empty() const override; - explicit CompiledSnippetGPU(std::unique_ptr h); + explicit CompiledSnippetGPU() = default; }; +template class GPUTargetMachine : public ov::snippets::TargetMachine { public: - explicit GPUTargetMachine(dnnl::impl::gpu::intel::jit::gpu_gen_t hw); + explicit GPUTargetMachine(); [[nodiscard]] bool is_supported() const override { return true; } 
[[nodiscard]] std::shared_ptr clone() const override; @@ -47,17 +46,21 @@ class GPUTargetMachine : public ov::snippets::TargetMachine { snippets::CompiledSnippetPtr get_snippet() override; private: - dnnl::impl::gpu::intel::jit::gpu_gen_t m_hw; - std::unique_ptr m_h; + std::unique_ptr> m_h; }; class GPUGenerator : public ov::snippets::Generator { public: GPUGenerator(dnnl::impl::gpu::intel::jit::gpu_gen_t hw); - GPUGenerator(const std::shared_ptr& target); std::shared_ptr clone() const override; ov::snippets::RegType get_specific_op_out_reg_type(const ov::Output& out) const override; + +private: + template + GPUGenerator(const std::shared_ptr>& target); + + static std::shared_ptr create_target_machine(ngen::HW hw); }; } // namespace ov::intel_gpu::jit diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/jit_generator.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/jit_generator.hpp index ae8c034ab9d57a..1575a0c468c566 100644 --- a/src/plugins/intel_gpu/src/graph/impls/jit/jit_generator.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/jit/jit_generator.hpp @@ -14,29 +14,11 @@ namespace ov::intel_gpu::jit { -struct jit_snippet_base_t { - virtual ~jit_snippet_base_t() = default; - virtual const char *kernel_name() const = 0; - - virtual int getSIMD() const = 0; - virtual int getGRFCount() const = 0; -}; - -template -class jit_snippet_t : public dnnl::impl::gpu::intel::jit::ngen_code_generator_t, public jit_snippet_base_t { +template +class jit_snippet_t : public ngen::OpenCLCodeGenerator { public: jit_snippet_t() - : dnnl::impl::gpu::intel::jit::ngen_code_generator_t(0, {GENERATOR_NAME, GENERATOR_LINE, false}) {}; - - const char *kernel_name() const override { - return dnnl::impl::gpu::intel::jit::ngen_code_generator_t::getExternalName().c_str(); - } - int getSIMD() const override { - return dnnl::impl::gpu::intel::jit::ngen_code_generator_t::getSIMD(); - } - int getGRFCount() const override { - return 
dnnl::impl::gpu::intel::jit::ngen_code_generator_t::getGRFCount(); - } + : ngen::OpenCLCodeGenerator(0, {GENERATOR_NAME, GENERATOR_LINE, false}) {}; }; } // namespace ov::intel_gpu::jit From 31f9bcb2047e8fcbe13b6e3b2536fb1f8d936794 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Mon, 4 Aug 2025 16:54:35 +0400 Subject: [PATCH 6/6] [GPU][Snippets] smth work --- .../include/intel_gpu/runtime/options.inl | 2 +- .../src/graph/impls/jit/gpu_generator.cpp | 17 ++++++++++++----- .../src/graph/impls/jit/gpu_generator.hpp | 3 +-- .../plugin/shared/src/snippets/add.cpp | 2 +- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl b/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl index 24a2151ff83de1..9cf533580bf717 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl @@ -35,6 +35,7 @@ OV_CONFIG_RELEASE_OPTION(ov::hint, activations_scale_factor, -1.0f, "Scalar floa OV_CONFIG_RELEASE_OPTION(ov::internal, enable_lp_transformations, false, "Enable/Disable Low precision transformations set") OV_CONFIG_RELEASE_OPTION(ov::intel_gpu, config_file, "", "Path to custom layers config file") OV_CONFIG_RELEASE_OPTION(ov::hint, model, nullptr, "Shared pointer to the ov::Model") +OV_CONFIG_RELEASE_OPTION(ov::intel_gpu, snippets_mode, ov::intel_gpu::SnippetsMode::DISABLE, "Define tokenization mode for Snippets.") OV_CONFIG_RELEASE_INTERNAL_OPTION(ov::intel_gpu, shape_predictor_settings, {10, 16 * 1024, 2, 1.1f}, "Preallocation settings") OV_CONFIG_RELEASE_INTERNAL_OPTION(ov::intel_gpu, queue_type, QueueTypes::out_of_order, "Type of the queue that must be used for model execution. 
May be in-order or out-of-order") @@ -52,7 +53,6 @@ OV_CONFIG_RELEASE_INTERNAL_OPTION(ov::intel_gpu, impls_cache_capacity, 300, "Con OV_CONFIG_RELEASE_INTERNAL_OPTION(ov::intel_gpu, asym_dynamic_quantization, false, "Enforce asymmetric mode for dynamically quantized activations") OV_CONFIG_RELEASE_INTERNAL_OPTION(ov::intel_gpu, could_use_flashattn_v2, true, "Enable/Disable SDPA primitive executing with FlashAttenV2 online softmax tricks.") OV_CONFIG_RELEASE_INTERNAL_OPTION(ov::intel_gpu, dynamic_quantization_threshold, 64, "Apply dynamic quantization only when batch size is larger than this value in OneDNN") -OV_CONFIG_RELEASE_INTERNAL_OPTION(ov::intel_gpu, snippets_mode, ov::intel_gpu::SnippetsMode::DISABLE, "Define tokenization mode for Snippets.") OV_CONFIG_DEBUG_GLOBAL_OPTION(ov::intel_gpu, help, false, "Print help message for all config options") OV_CONFIG_DEBUG_GLOBAL_OPTION(ov::intel_gpu, verbose, 0, "Enable logging for debugging purposes. The higher value the more verbose output. 
0 - Disabled, 4 - Maximum verbosity") diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp index c1dec415d47a05..e805881e5ceed5 100644 --- a/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp @@ -97,13 +97,20 @@ bool CompiledSnippetGPU::empty() const { GPUGenerator::GPUGenerator(ngen::HW hw) : Generator(create_target_machine(hw)) {} -template -GPUGenerator::GPUGenerator(const std::shared_ptr>& target) : Generator(target) {} +GPUGenerator::GPUGenerator(const std::shared_ptr& target) + : Generator(target) { + OPENVINO_ASSERT(typeid(*target) == typeid(GPUTargetMachine) || + typeid(*target) == typeid(GPUTargetMachine) || + typeid(*target) == typeid(GPUTargetMachine) || + typeid(*target) == typeid(GPUTargetMachine) || + typeid(*target) == typeid(GPUTargetMachine) || + typeid(*target) == typeid(GPUTargetMachine) || + typeid(*target) == typeid(GPUTargetMachine) || + typeid(*target) == typeid(GPUTargetMachine)); +} std::shared_ptr GPUGenerator::clone() const { - //const auto hw = target->get_hw(); - //return std::make_shared(target->clone()); - OPENVINO_THROW("Unimplemented!"); + return std::shared_ptr(new GPUGenerator(target->clone())); } ov::snippets::RegType GPUGenerator::get_specific_op_out_reg_type(const ov::Output& out) const { diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp index 377c76183fa910..3e245c175781b4 100644 --- a/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp @@ -57,8 +57,7 @@ class GPUGenerator : public ov::snippets::Generator { ov::snippets::RegType get_specific_op_out_reg_type(const ov::Output& out) const override; private: - template - GPUGenerator(const std::shared_ptr>& target); + GPUGenerator(const std::shared_ptr& target); static 
std::shared_ptr create_target_machine(ngen::HW hw); }; diff --git a/src/tests/functional/plugin/shared/src/snippets/add.cpp b/src/tests/functional/plugin/shared/src/snippets/add.cpp index e72cb5e4b4ffdc..4c10b6cb2ad07b 100644 --- a/src/tests/functional/plugin/shared/src/snippets/add.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/add.cpp @@ -44,7 +44,7 @@ void Add::SetUp() { auto f = ov::test::snippets::AddFunction(inputDynamicShapes); function = f.getOriginal(); setInferenceType(type); - //setIgnoreCallbackMode(); + setIgnoreCallbackMode(); } std::string AddConst::getTestCaseName(testing::TestParamInfo obj) {