diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/subgraph.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/subgraph.hpp index d4de1678709a35..c17f10f86cc91e 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/subgraph.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/subgraph.hpp @@ -6,6 +6,8 @@ #include "snippets/op/subgraph.hpp" #include "primitive.hpp" +#include "ocl/ocl_engine.hpp" + namespace cldnn { /// @brief Subgraph primitive @@ -19,7 +21,8 @@ struct subgraph : public primitive_base { /// @param id This primitive id /// @param inputs Input primitive ids /// @param subgraph Original subgraph node - subgraph(const primitive_id& id, const std::vector& inputs, const std::shared_ptr& subgraph) + subgraph(const primitive_id& id, const std::vector& inputs, + const std::shared_ptr& subgraph) : primitive_base(id, inputs), ov_subgraph(subgraph->clone()) {} std::shared_ptr ov_subgraph; diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp index 13c892b69814b0..1e8a0ea542034e 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp @@ -113,6 +113,43 @@ inline std::istream& operator>>(std::istream& is, DumpTensors& val) { return is; } +/** + * @brief Enum to define possible snippets mode hints. 
+ */ +enum class SnippetsMode : uint8_t { + ENABLE = 0, //!< Enable + IGNORE_CALLBACK = 1, //!< Ignore callback + DISABLE = 2, //!< Disable +}; + +inline std::ostream& operator<<(std::ostream& os, const SnippetsMode& mode) { + switch (mode) { + case SnippetsMode::ENABLE: + return os << "ENABLE"; + case SnippetsMode::IGNORE_CALLBACK: + return os << "IGNORE_CALLBACK"; + case SnippetsMode::DISABLE: + return os << "DISABLE"; + default: + OPENVINO_THROW("Unsupported snippets mode value"); + } +} + +inline std::istream& operator>>(std::istream& is, SnippetsMode& mode) { + std::string str; + is >> str; + if (str == "ENABLE") { + mode = SnippetsMode::ENABLE; + } else if (str == "IGNORE_CALLBACK") { + mode = SnippetsMode::IGNORE_CALLBACK; + } else if (str == "DISABLE") { + mode = SnippetsMode::DISABLE; + } else { + OPENVINO_THROW("Unsupported snippets mode: ", str); + } + return is; +} + /** * @brief Defines queue type that must be used for model execution */ @@ -168,6 +205,7 @@ static constexpr Property static constexpr Property, ov::PropertyMutability::RW> load_dump_raw_binary{"GPU_LOAD_DUMP_RAW_BINARY"}; static constexpr Property could_use_flashattn_v2{"GPU_COULD_USE_FLASHATTN_V2"}; static constexpr Property dynamic_quantization_group_size_max{"GPU_DYNAMIC_QUANTIZATION_GROUP_SIZE_MAX"}; +static constexpr Property snippets_mode{"SNIPPETS_MODE"}; } // namespace ov::intel_gpu namespace cldnn { diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl b/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl index e2eeef0569a009..9cf533580bf717 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl @@ -35,6 +35,7 @@ OV_CONFIG_RELEASE_OPTION(ov::hint, activations_scale_factor, -1.0f, "Scalar floa OV_CONFIG_RELEASE_OPTION(ov::internal, enable_lp_transformations, false, "Enable/Disable Low precision transformations set") OV_CONFIG_RELEASE_OPTION(ov::intel_gpu, config_file, "", 
"Path to custom layers config file") OV_CONFIG_RELEASE_OPTION(ov::hint, model, nullptr, "Shared pointer to the ov::Model") +OV_CONFIG_RELEASE_OPTION(ov::intel_gpu, snippets_mode, ov::intel_gpu::SnippetsMode::DISABLE, "Define tokenization mode for Snippets.") OV_CONFIG_RELEASE_INTERNAL_OPTION(ov::intel_gpu, shape_predictor_settings, {10, 16 * 1024, 2, 1.1f}, "Preallocation settings") OV_CONFIG_RELEASE_INTERNAL_OPTION(ov::intel_gpu, queue_type, QueueTypes::out_of_order, "Type of the queue that must be used for model execution. May be in-order or out-of-order") diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/CMakeLists.txt b/src/plugins/intel_gpu/src/graph/impls/jit/CMakeLists.txt index db4ccd4c89c858..9f80ea10aafa1e 100644 --- a/src/plugins/intel_gpu/src/graph/impls/jit/CMakeLists.txt +++ b/src/plugins/intel_gpu/src/graph/impls/jit/CMakeLists.txt @@ -10,7 +10,7 @@ set(TARGET_NAME "openvino_intel_gpu_jit_obj") ov_gpu_add_backend_target( NAME ${TARGET_NAME} - LINK_LIBRARIES onednn_gpu_tgt + LINK_LIBRARIES onednn_gpu_tgt openvino::snippets ) ov_build_target_faster(${TARGET_NAME} PCH PCH_EXCLUDE detection_output.cpp) diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.hpp new file mode 100644 index 00000000000000..d64bdd72062d0d --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.hpp @@ -0,0 +1,24 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "jit_emitter.hpp" + + +namespace ov::intel_gpu::jit { + +template +class jit_add_emitter : public jit_emitter { +public: + jit_add_emitter(dnnl::impl::gpu::intel::jit::ngen_code_generator_t* host, + ov::element::Type exec_prc = ov::element::f32) : jit_emitter(host, exec_prc) {}; + + static std::set> get_supported_precisions( + [[maybe_unused]] const std::shared_ptr& node) { + return {{element::f32, 
element::f32}, {element::f16, element::f16}}; + } +}; + +} // namespace ov::intel_gpu::jit diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_emitter.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_emitter.hpp new file mode 100644 index 00000000000000..7c3b844bbc8195 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_emitter.hpp @@ -0,0 +1,46 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "snippets/emitter.hpp" + +#include "graph/impls/jit/jit_generator.hpp" + +#include "openvino/core/type/element_type.hpp" +#include "openvino/core/node.hpp" + +namespace ov::intel_gpu::jit { + +template +class jit_emitter : public ov::snippets::Emitter { +public: + jit_emitter(dnnl::impl::gpu::intel::jit::ngen_code_generator_t* host, + ov::element::Type exec_prc = ov::element::f32) : + m_h(host), + m_exec_prc(exec_prc) {} + + /** + * @brief Returns supported precisions. + * Precisions are ordered, the first bigger bitness precision with the same type will be selected. + * Empty collection means the emitter supports any input precisions. 
+ */ + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr) { + return {}; + } + +protected: + void emit_code_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr) const override { + OPENVINO_THROW("Unimplemented"); + } + + dnnl::impl::gpu::intel::jit::ngen_code_generator_t* m_h; + ov::element::Type m_exec_prc; +}; + +} // namespace ov::intel_gpu::jit diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_snippets_emitters.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_snippets_emitters.hpp new file mode 100644 index 00000000000000..2d2b6f4383579a --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_snippets_emitters.hpp @@ -0,0 +1,24 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "jit_emitter.hpp" + + +namespace ov::intel_gpu::jit { + +template +class jit_nop_emitter : public jit_emitter { +public: + jit_nop_emitter(dnnl::impl::gpu::intel::jit::ngen_code_generator_t* host, + ov::element::Type exec_prc = ov::element::f32) : jit_emitter(host, exec_prc) {}; + + static std::set> get_supported_precisions( + [[maybe_unused]] const std::shared_ptr& node) { + return {}; + } +}; + +} // namespace ov::intel_gpu::jit diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp new file mode 100644 index 00000000000000..e805881e5ceed5 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp @@ -0,0 +1,135 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "gpu_generator.hpp" + +#include "snippets/runtime_configurator.hpp" +#include "emitters/jit_eltwise_emitters.hpp" +#include "emitters/jit_snippets_emitters.hpp" + +#include "openvino/op/add.hpp" + + +using namespace dnnl::impl::gpu::intel::jit; + +namespace ov::intel_gpu::jit { + 
+#define CREATE_SNIPPETS_EMITTER(e_type, ...) \ + {[this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ + return std::make_shared>(m_h.get(), ##__VA_ARGS__); \ + }, \ + [](const std::shared_ptr& n) -> std::set> { \ + return e_type::get_supported_precisions(n); \ + }} + +template +GPUTargetMachine::GPUTargetMachine() + : TargetMachine(std::make_shared(std::make_shared())), + m_h(std::make_unique>()) { + jitters[op::v0::Parameter::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); + jitters[op::v0::Result::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); + jitters[op::v1::Add::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_add_emitter); +} + +template +std::shared_ptr GPUTargetMachine::clone() const { + const auto cloned = std::make_shared>(); + cloned->configurator = std::make_shared(*configurator); + return cloned; +} + +template +size_t GPUTargetMachine::get_lanes() const { + assert(m_h); + return m_h->getSIMD(); +} + +template +std::vector GPUTargetMachine::get_abi_arg_regs() const { + OPENVINO_THROW("Unimplemented!"); + return {}; +} + +template +std::vector GPUTargetMachine::get_gp_reg_pool() const { + OPENVINO_THROW("Unimplemented!"); + return {}; +} + +template +std::vector GPUTargetMachine::get_vec_reg_pool() const { + OPENVINO_THROW("Unimplemented!"); + return {}; +} + +template +ngen::HW GPUTargetMachine::get_hw() const { + return hw; +} + +template +snippets::CompiledSnippetPtr GPUTargetMachine::get_snippet() { + // OPENVINO_ASSERT(h->create_kernel() == dnnl::impl::status::success, "Failed to create jit_kernel in get_snippet()"); + // const auto& result = + // std::make_shared(std::unique_ptr(h.release())); + // // Note that we reset all the generated code, since it was copied into CompiledSnippetGPU + // h = std::make_unique(); + // return result; + OPENVINO_THROW("Unimplemented!"); + return nullptr; +} + +const uint8_t* CompiledSnippetGPU::get_code() const { + //return 
h_compiled->jit_ker(); + OPENVINO_THROW("Unimplemented!"); + return nullptr; +} + +size_t CompiledSnippetGPU::get_code_size() const { + OPENVINO_THROW("Unimplemented!"); +} + +bool CompiledSnippetGPU::empty() const { + return get_code_size() == 0; +} + +GPUGenerator::GPUGenerator(ngen::HW hw) + : Generator(create_target_machine(hw)) {} + +GPUGenerator::GPUGenerator(const std::shared_ptr& target) + : Generator(target) { + OPENVINO_ASSERT(typeid(*target) == typeid(GPUTargetMachine) || + typeid(*target) == typeid(GPUTargetMachine) || + typeid(*target) == typeid(GPUTargetMachine) || + typeid(*target) == typeid(GPUTargetMachine) || + typeid(*target) == typeid(GPUTargetMachine) || + typeid(*target) == typeid(GPUTargetMachine) || + typeid(*target) == typeid(GPUTargetMachine) || + typeid(*target) == typeid(GPUTargetMachine)); +} + +std::shared_ptr GPUGenerator::clone() const { + return std::shared_ptr(new GPUGenerator(target->clone())); +} + +ov::snippets::RegType GPUGenerator::get_specific_op_out_reg_type(const ov::Output& out) const { + return ov::snippets::RegType::undefined; +} + +std::shared_ptr GPUGenerator::create_target_machine(ngen::HW hw) { + switch (hw) { + case ngen::HW::Gen9: return std::make_unique>(); + case ngen::HW::Gen11: return std::make_unique>(); + case ngen::HW::Gen12LP: return std::make_unique>(); + case ngen::HW::XeHP: return std::make_unique>(); + case ngen::HW::XeHPG: return std::make_unique>(); + case ngen::HW::XeHPC: return std::make_unique>(); + case ngen::HW::Xe2: return std::make_unique>(); + case ngen::HW::Xe3: return std::make_unique>(); + default: + OPENVINO_THROW("Unknown GPU hardware!"); + } +} + +} // namespace ov::intel_gpu::jit diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp new file mode 100644 index 00000000000000..3e245c175781b4 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp @@ -0,0 +1,65 @@ +// Copyright (C) 2025 
Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include + +#include "jit_generator.hpp" +#include "gpu/intel/jit/generator.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/node_output.hpp" +#include "snippets/emitter.hpp" +#include "snippets/generator.hpp" +#include "snippets/target_machine.hpp" + +namespace ov::intel_gpu::jit { + +class CompiledSnippetGPU : public snippets::CompiledSnippet { +public: + [[nodiscard]] const uint8_t* get_code() const override; + [[nodiscard]] size_t get_code_size() const override; + [[nodiscard]] bool empty() const override; + explicit CompiledSnippetGPU() = default; +}; + +template +class GPUTargetMachine : public ov::snippets::TargetMachine { +public: + explicit GPUTargetMachine(); + + [[nodiscard]] bool is_supported() const override { return true; } + [[nodiscard]] std::shared_ptr clone() const override; + + [[nodiscard]] size_t get_lanes() const override; + + [[nodiscard]] std::vector get_abi_arg_regs() const override; + [[nodiscard]] std::vector get_gp_reg_pool() const override; + [[nodiscard]] std::vector get_vec_reg_pool() const override; + + [[nodiscard]] dnnl::impl::gpu::intel::jit::gpu_gen_t get_hw() const; + + snippets::CompiledSnippetPtr get_snippet() override; + +private: + std::unique_ptr> m_h; +}; + +class GPUGenerator : public ov::snippets::Generator { +public: + explicit GPUGenerator(dnnl::impl::gpu::intel::jit::gpu_gen_t hw);  // explicit: no implicit HW -> generator conversion + std::shared_ptr clone() const override; + + ov::snippets::RegType get_specific_op_out_reg_type(const ov::Output& out) const override; + +private: + explicit GPUGenerator(const std::shared_ptr& target);  // used by clone() to wrap an already built target machine + + static std::shared_ptr create_target_machine(ngen::HW hw); +}; + +} // namespace ov::intel_gpu::jit diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/jit_generator.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/jit_generator.hpp new file mode 100644 index 00000000000000..1575a0c468c566 --- /dev/null +++ 
b/src/plugins/intel_gpu/src/graph/impls/jit/jit_generator.hpp @@ -0,0 +1,24 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include + +#include "gpu/intel/jit/generator.hpp" + + +namespace ov::intel_gpu::jit { + +template +class jit_snippet_t : public ngen::OpenCLCodeGenerator { +public: + jit_snippet_t() + : ngen::OpenCLCodeGenerator(0, {GENERATOR_NAME, GENERATOR_LINE, false}) {}; +}; + +} // namespace ov::intel_gpu::jit diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/subgraph.cpp b/src/plugins/intel_gpu/src/graph/impls/jit/subgraph.cpp index f20749c255c5be..1d81d445a98653 100644 --- a/src/plugins/intel_gpu/src/graph/impls/jit/subgraph.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/jit/subgraph.cpp @@ -12,132 +12,60 @@ #include "runtime/ocl/ocl_engine.hpp" #include "intel_gpu/graph/serialization/binary_buffer.hpp" +#include "plugin/transformations/snippets/lowered/set_single_kernel_work_amount.hpp" + +#include "snippets/lowered/pass/optimize_domain.hpp" +#include "snippets/utils/utils.hpp" +#include "gpu_generator.hpp" #include namespace ov::intel_gpu::jit { using namespace dnnl::impl::gpu::intel::jit; using namespace ngen; +class SubgraphImpl : public primitive_impl { + using primitive_impl::primitive_impl; -template -class VectorScaleKernelGenerator : public OpenCLCodeGenerator -{ -protected: - NGEN_FORWARD_OPENCL(hw); + using DataFlowPasses = std::vector; + using ControlFlowPasses = std::vector; + + std::shared_ptr m_subgraph {nullptr}; public: - VectorScaleKernelGenerator() : OpenCLCodeGenerator() - { - // Define kernel interface for OpenCL. - newArgument("buffer", ExternalArgumentType::GlobalPtr); - newArgument("alpha", DataType::f); - requireLocalID(1); - requireLocalSize(); - requireSIMD((GRF::bytes(hw) == 64) ? 
16 : 8); - externalName("vector_scale"); - - finalizeInterface(); - - auto bufferSurface = Surface(getArgumentSurfaceIfExists("buffer")); // Surface # for buffer. - auto bufferPtr = getArgument("buffer"); // A64 pointer for buffer. - auto alpha = getArgument("alpha"); - - auto localSize = getLocalSize(0).uw(); - auto localID = getLocalID(0); // Vector of local IDs. - auto groupID = r0.ud(1); // Thread group (a.k.a. workgroup) IDs are in r0.ud(1) (X) r0.ud(6) (Y) r0.ud(7) (Z) - - // Local variables. - auto globalID = r12.ud(0); - auto header = r13; - auto data = r14; - auto temp = r15; - - // Decide on load/store messages. - bool useLSC = (hw >= HW::XeHPC); - - // All instructions use W (NoMask) by default. - setDefaultNoMask(); - - // Enable automatic SWSB for Gen12. - setDefaultAutoSWSB(); - - // Prologue for ATS+. - prologue(); - - // Enable IEEE denormals. - or_(1 | Switch, cr0[0], cr0[0], 0x4C0); - - // Calculate global ID = (group ID) * (local size) + (local ID for lane 0). - mul(1, globalID, groupID, localSize); - add(1, globalID, globalID, localID[0]); - - // Do 32 byte (2 OWord) block read at offset (global ID) * sizeof(float). 
- if (!useLSC) { - shr(1, header[2], globalID, 2); - load(8, data, block_oword(2), bufferSurface, header); - } else { - shl(1, globalID, globalID, 2); - addc(1, header.ud(0), bufferPtr.ud(0), globalID); - mov(1, temp.ud(0), acc0.ud(0)); - add(1, header.ud(1), bufferPtr.ud(1), temp.ud(0)); - load(1, data, D32 | V8T, A64, header); + explicit SubgraphImpl(const program_node& node, const kernel_impl_params& impl_params) + : primitive_impl("jit::subgraph"), m_subgraph(node.as().get_primitive()->ov_subgraph->clone()) { + m_subgraph->set_generator( + std::make_shared(ngenHW2pluginHW(impl_params.get_device_info().arch))); + + const auto in_blocked_shapes = getSnippetsBlockedShapes(impl_params); + const auto precisions = getIOPrecisions(impl_params); + m_subgraph->data_flow_transformations(in_blocked_shapes, precisions.first, precisions.second); + + const auto control_flow_config = std::make_shared(); + control_flow_config->disable(); + m_subgraph->set_tile_rank(1UL); + + m_subgraph->control_flow_transformations(0, // unused + 256, // unused + std::make_shared(), + control_flow_config, + getControlFlowPasses()); } + + ControlFlowPasses getControlFlowPasses() const { + using PassPosition = ov::snippets::pass::PassPosition; + using Place = PassPosition::Place; - // Scale data. - mul(8, data, data, alpha); + ControlFlowPasses backend_passes; +#define SNIPPETS_REGISTER_PASS_ABSOLUTE(PASS_PLACE, PASS, ...) \ + backend_passes.emplace_back(PassPosition(PASS_PLACE), std::make_shared(__VA_ARGS__)) - // Store updated data. - if (!useLSC) - store(8, block_oword(2), bufferSurface, header, data); - else - store(1, D32 | V8T, A64, header, data); - // End thread. Must move r0 to one of r112-r127, then call threadend. 
- mov(8, r127, r0); - threadend(r127); + SNIPPETS_REGISTER_PASS_ABSOLUTE(Place::PipelineStart, + ov::intel_gpu::pass::SetSingleKernelWorkAmount); +#undef SNIPPETS_REGISTER_PASS_ABSOLUTE + return backend_passes; } -}; - - -class SubgraphImpl : public primitive_impl { - using primitive_impl::primitive_impl; - -public: - explicit SubgraphImpl(const program_node& /*node*/, const kernel_impl_params& impl_params) - : primitive_impl("jit::subgraph") { - const auto& engine = downcast(impl_params.get_program().get_engine()); - - HW hw = VectorScaleKernelGenerator::detectHW(engine.get_cl_context().get(), engine.get_cl_device().get()); - const char *gpuString = "unknown"; - - switch (hw) { - case HW::Gen9: gpuString = "Gen9"; break; - case HW::Gen11: gpuString = "Gen11"; break; - case HW::Gen12LP: gpuString = "Gen12LP"; break; - case HW::XeHP: gpuString = "XeHP"; break; - case HW::XeHPG: gpuString = "XeHPG"; break; - case HW::XeHPC: gpuString = "XeHPC"; break; - case HW::Xe2: gpuString = "Xe2"; break; - case HW::Xe3: gpuString = "Xe3"; break; - default: OPENVINO_THROW("[GPU] Unsupported architecture"); - } - - std::cout << "GPU arch: " << gpuString << "\n"; - - // Create appropriate kernel generator object for the detected HW, and get a cl_kernel. 
- // switch (hw) { - // case HW::Gen9: VectorScaleKernelGenerator().getKernel(engine.get_cl_context().get(), engine.get_cl_device().get()); - // case HW::Gen11: VectorScaleKernelGenerator().getKernel(engine.get_cl_context().get(), engine.get_cl_device().get()); - // case HW::Gen12LP: VectorScaleKernelGenerator().getKernel(engine.get_cl_context().get(), engine.get_cl_device().get()); - // case HW::XeHP: VectorScaleKernelGenerator().getKernel(engine.get_cl_context().get(), engine.get_cl_device().get()); - // case HW::XeHPG: VectorScaleKernelGenerator().getKernel(engine.get_cl_context().get(), engine.get_cl_device().get()); - // case HW::XeHPC: VectorScaleKernelGenerator().getKernel(engine.get_cl_context().get(), engine.get_cl_device().get()); - // case HW::Xe2: VectorScaleKernelGenerator().getKernel(engine.get_cl_context().get(), engine.get_cl_device().get()); - // case HW::Xe3: VectorScaleKernelGenerator().getKernel(engine.get_cl_context().get(), engine.get_cl_device().get()); - // default: OPENVINO_THROW("[GPU] Unsupported architecture");; - // } - - } SubgraphImpl() : primitive_impl() {} @@ -159,6 +87,47 @@ class SubgraphImpl : public primitive_impl { } void update(primitive_inst& inst, const kernel_impl_params& impl_param) override { } + +private: + static ngen::HW ngenHW2pluginHW(gpu_arch arch) { + switch (arch) { + case gpu_arch::gen9: return ngen::HW::Gen9; + case gpu_arch::gen11: return ngen::HW::Gen11; + case gpu_arch::xe_lp: return ngen::HW::XeLP; + case gpu_arch::xe_hp: return ngen::HW::XeHP; + case gpu_arch::xe_hpg: return ngen::HW::XeHPG; + case gpu_arch::xe_hpc: return ngen::HW::XeHPC; + case gpu_arch::xe2: return ngen::HW::Xe2; + case gpu_arch::xe3: return ngen::HW::Xe3; + case gpu_arch::unknown: return ngen::HW::Unknown; + default: + OPENVINO_THROW("Unexpected arch"); + } + } + + static ov::snippets::op::Subgraph::BlockedShapeVector getSnippetsBlockedShapes(const kernel_impl_params& impl_params) { + ov::snippets::op::Subgraph::BlockedShapeVector 
in_blocked_shapes(impl_params.input_layouts.size()); + for (size_t i = 0; i < in_blocked_shapes.size(); i++) { + // support only planar shapes + const auto blocked_dims = ov::snippets::utils::pshape_to_vdims(impl_params.input_layouts[i].get_partial_shape()); + const auto blocked_layout = ov::snippets::utils::get_planar_layout(blocked_dims.size()); + in_blocked_shapes[i] = {blocked_dims, blocked_layout}; + } + return in_blocked_shapes; + } + + static std::pair, std::vector> getIOPrecisions(const kernel_impl_params& impl_params) { + std::pair, std::vector> prc; + prc.first.reserve(impl_params.input_layouts.size()); + prc.second.reserve(impl_params.output_layouts.size()); + for (const auto& in : impl_params.input_layouts) { + prc.first.push_back(in.data_type); + } + for (const auto& out : impl_params.output_layouts) { + prc.second.push_back(out.data_type); + } + return prc; + } }; std::unique_ptr Subgraph::create_impl(const program_node& node, const RuntimeParams& params) const { diff --git a/src/plugins/intel_gpu/src/plugin/transformations/snippets/lowered/set_single_kernel_work_amount.cpp b/src/plugins/intel_gpu/src/plugin/transformations/snippets/lowered/set_single_kernel_work_amount.cpp new file mode 100644 index 00000000000000..0ca914a8c9fa39 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/snippets/lowered/set_single_kernel_work_amount.cpp @@ -0,0 +1,90 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "set_single_kernel_work_amount.hpp" + +#include +#include +#include +#include +#include + +#include "openvino/core/except.hpp" +#include "openvino/core/type.hpp" +#include "snippets/lowered/expression.hpp" +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/pass/pass.hpp" +#include "snippets/op/rank_normalization.hpp" +#include "snippets/shape_types.hpp" +#include "snippets/utils/utils.hpp" + +namespace ov::intel_gpu::pass { + +bool 
SetSingleKernelWorkAmount::run(snippets::lowered::LinearIR& linear_ir) { + // GPU Plugin requires 1D tile + linear_ir.set_loop_depth(1); + + const auto& config = linear_ir.get_config(); + if (linear_ir.empty()) { + return false; + } + + if (!config.m_enable_domain_optimization) { + // Unsupported + return false; + } + + if (linear_ir.is_dynamic()) { + // [134873] In dynamic case we need to implement own shape inference in runtime configurator + return false; + } + + const auto master_shape = linear_ir.get_master_shape(); + if (master_shape.empty() || master_shape.back() == 1) { + // Nothing to collapse: back() on an empty shape is undefined, and a trailing 1 means the work amount is already single + return false; + } + + auto CollapseDims = [](ov::snippets::VectorDims& dims) { + OPENVINO_ASSERT(dims.size() >= 2, "CollapseDims can't process shape with less than two dims"); + const auto full_wa_idx = dims.size() - 2; + dims[full_wa_idx] *= dims[dims.size() - 1]; + dims[dims.size() - 1] = 1; + for (size_t i = 0; i < full_wa_idx; i++) { + dims[full_wa_idx] *= dims[i]; + dims[i] = 1; + } + }; + + const auto& params = linear_ir.get_parameters(); + std::vector input_shapes; + for (const auto& param : params) { + const auto& desc = param->get_output_port_descriptor(0); + OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(desc->get_layout()), + "SetSingleKernelWorkAmount supports only planar layout on inputs"); + auto shape = desc->get_shape(); + OPENVINO_ASSERT(std::none_of(shape.begin(), + shape.end(), + [](size_t d) { + return ov::snippets::utils::is_dynamic_value(d); + }), + "SetSingleKernelWorkAmount pass does not support dynamic shapes"); + OPENVINO_ASSERT(shape == params.front()->get_output_port_descriptor(0)->get_shape(), + "SetSingleKernelWorkAmount pass supports only similar shapes on input"); + CollapseDims(shape); + input_shapes.emplace_back(shape); + } + + std::vector infer_shapes; + infer_shapes.reserve(input_shapes.size()); + for (const auto& is : input_shapes) { + infer_shapes.emplace_back(is); + } + // Need to propagate updated shapes through LIR + 
linear_ir.shape_infer(infer_shapes); + + return true; +} + +} // namespace ov::intel_gpu::pass diff --git a/src/plugins/intel_gpu/src/plugin/transformations/snippets/lowered/set_single_kernel_work_amount.hpp b/src/plugins/intel_gpu/src/plugin/transformations/snippets/lowered/set_single_kernel_work_amount.hpp new file mode 100644 index 00000000000000..2ae80a9db2971a --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/snippets/lowered/set_single_kernel_work_amount.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/pass/pass.hpp" +#include "snippets/shape_types.hpp" + +namespace ov::intel_gpu::pass { + +/** + * @interface SetSingleKernelWorkAmount + * @brief Sets loop depth to 1 and folds every dim of the static, planar, equally-shaped inputs into the second-to-last dim (others become 1) + * @ingroup snippets + */ + +class SetSingleKernelWorkAmount : public ov::snippets::lowered::pass::Pass { +public: + OPENVINO_RTTI("SetSingleKernelWorkAmount", "", Pass) + explicit SetSingleKernelWorkAmount() = default; + bool run(ov::snippets::lowered::LinearIR& linear_ir) override; +}; + +} // namespace ov::intel_gpu::pass diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 1bedda8fccfd16..670cba9dd0511b 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -1296,8 +1296,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { } // TODO: Move to the right place - { - //ov::serialize(func, "pre_snippets.xml"); + if (config.get_snippets_mode() != ov::intel_gpu::SnippetsMode::DISABLE) { ov::pass::Manager manager("GPU:Snippets"); manager.set_per_pass_validation(false); @@ -1333,7 +1332,6 @@ void TransformationsPipeline::apply(std::shared_ptr func) { }); manager.run_passes(func); - //ov::serialize(func, "post_snippets.xml"); } } } // 
namespace ov::intel_gpu diff --git a/src/plugins/intel_gpu/tests/functional/CMakeLists.txt b/src/plugins/intel_gpu/tests/functional/CMakeLists.txt index c89083a026aed6..33d2c64fc34ba6 100644 --- a/src/plugins/intel_gpu/tests/functional/CMakeLists.txt +++ b/src/plugins/intel_gpu/tests/functional/CMakeLists.txt @@ -34,6 +34,8 @@ ov_add_test_target( funcSharedTests OpenCL::NewHeaders # should come before OpenCL::OpenCL OpenCL::OpenCL + openvino::snippets + ov_snippets_models ADD_CPPLINT LABELS OV GPU diff --git a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/snippets/add.cpp b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/snippets/add.cpp new file mode 100644 index 00000000000000..2f0d791770a27c --- /dev/null +++ b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/snippets/add.cpp @@ -0,0 +1,33 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/add.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { + + +namespace { +// ===================================Add=========================================================// +// These inputs are needed to test static Loop optimizations (emit the whole tile, body with increments, set WA etc) +const std::vector inShapesStatic1{{{}, {{128, 256, 512}}}}; +const std::vector inShapesStatic2{{{}, {{128, 256, 512}}}}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, Add, + ::testing::Combine( + ::testing::ValuesIn(inShapesStatic1), + ::testing::ValuesIn(inShapesStatic2), + ::testing::ValuesIn({ov::element::f16}), + ::testing::Values(1), // Add + ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts + ::testing::Values(ov::test::utils::DEVICE_GPU)), + Add::getTestCaseName); + + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov