diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.cpp b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.cpp
new file mode 100644
index 00000000000000..880c421feedeb7
--- /dev/null
+++ b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.cpp
@@ -0,0 +1,35 @@
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "jit_eltwise_emitters.hpp"
+
+
+namespace ov::intel_gpu::jit {
+
+
+template <ngen::HW hw>
+void jit_add_emitter<hw>::emit_impl(const std::vector<size_t>& in_idxs, const std::vector<size_t>& out_idxs) const {
+    switch (this->m_exec_prc) {
+    case ov::element::Type_t::f16:
+        this->m_h->template add<ngen::half>(this->m_h->getSIMD(), ngen::GRF(out_idxs[0]), ngen::GRF(in_idxs[0]), ngen::GRF(in_idxs[1]));
+        break;
+    case ov::element::Type_t::f32:
+        this->m_h->template add<float>(this->m_h->getSIMD(), ngen::GRF(out_idxs[0]), ngen::GRF(in_idxs[0]), ngen::GRF(in_idxs[1]));
+        break;
+    default:
+        OPENVINO_THROW("[GPU] Unsupported add emitter data type:", this->m_exec_prc);
+        break;
+    }
+}
+
+template class jit_add_emitter<ngen::HW::Gen9>;
+template class jit_add_emitter<ngen::HW::Gen11>;
+template class jit_add_emitter<ngen::HW::XeLP>;
+template class jit_add_emitter<ngen::HW::XeHP>;
+template class jit_add_emitter<ngen::HW::XeHPG>;
+template class jit_add_emitter<ngen::HW::XeHPC>;
+template class jit_add_emitter<ngen::HW::Xe2>;
+template class jit_add_emitter<ngen::HW::Xe3>;
+
+}  // namespace ov::intel_gpu::jit
diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.hpp
index 75b4d19722296c..90856071a85e03 100644
--- a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.hpp
@@ -26,9 +26,7 @@ class jit_add_emitter : public jit_emitter {
     size_t get_inputs_count() const override { return 2; };
 
 protected:
-    void emit_impl(const std::vector<size_t>& in_idxs, const std::vector<size_t>& out_idxs) const override {
-        OPENVINO_THROW("Unimplemented");
-    }
+    void emit_impl(const std::vector<size_t>& in_idxs, const std::vector<size_t>& out_idxs) const override;
 };
 
 }  // namespace ov::intel_gpu::jit
diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_emitter.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_emitter.hpp
index de0a462bc180f0..fe54b2d6425f1b 100644
--- a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_emitter.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_emitter.hpp
@@ -79,6 +79,6 @@ class jit_emitter : public ov::snippets::Emitter {
     TEMPLATE_INSTANCE(emitter, ngen::HW::XeHPC) \
     TEMPLATE_INSTANCE(emitter, ngen::HW::Xe2) \
     TEMPLATE_INSTANCE(emitter, ngen::HW::Xe3)
-
+
 }  // namespace ov::intel_gpu::jit
diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp
index 2af318ecb0b945..bd8154a760aecf 100644
--- a/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp
@@ -4,6 +4,8 @@
 
 #include "gpu_generator.hpp"
 
+#include "snippets/lowered/port_connector.hpp"
+#include "snippets/lowered/reg_manager.hpp"
 #include "snippets/runtime_configurator.hpp"
 #include "snippets/op/load.hpp"
 #include "snippets/op/kernel.hpp"
@@ -20,6 +22,22 @@ using namespace dnnl::impl::gpu::intel::jit;
 
 namespace ov::intel_gpu::jit {
 
+static ngen::HW pluginHW2ngen(cldnn::gpu_arch arch) {
+    switch (arch) {
+    case cldnn::gpu_arch::gen9: return ngen::HW::Gen9;
+    case cldnn::gpu_arch::gen11: return ngen::HW::Gen11;
+    case cldnn::gpu_arch::xe_lp: return ngen::HW::XeLP;
+    case cldnn::gpu_arch::xe_hp: return ngen::HW::XeHP;
+    case cldnn::gpu_arch::xe_hpg: return ngen::HW::XeHPG;
+    case cldnn::gpu_arch::xe_hpc: return ngen::HW::XeHPC;
+    case cldnn::gpu_arch::xe2: return ngen::HW::Xe2;
+    case cldnn::gpu_arch::xe3: return ngen::HW::Xe3;
+    case cldnn::gpu_arch::unknown: return ngen::HW::Unknown;
+    default:
+        OPENVINO_THROW("[GPU] Unexpected GPU arch");
+    }
+}
+
 #define CREATE_SNIPPETS_EMITTER(e_type, ...) \
     {[this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr<snippets::Emitter> { \
          return std::make_shared<e_type<hw>>(m_h.get(), expr, ##__VA_ARGS__); \
@@ -29,9 +47,10 @@ namespace ov::intel_gpu::jit {
     }}
 
 template <ngen::HW hw>
-GPUTargetMachine<hw>::GPUTargetMachine()
+GPUTargetMachine<hw>::GPUTargetMachine(cldnn::engine& engine)
     : TargetMachine(std::make_shared(std::make_shared())),
-      m_h(std::make_unique>()) {
+      m_h(std::make_unique>()),
+      engine(engine) {
     jitters[op::v0::Parameter::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter);
     jitters[op::v0::Result::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter);
@@ -44,7 +63,7 @@ GPUTargetMachine<hw>::GPUTargetMachine()
 
 template <ngen::HW hw>
 std::shared_ptr<snippets::TargetMachine> GPUTargetMachine<hw>::clone() const {
-    const auto cloned = std::make_shared<GPUTargetMachine<hw>>();
+    const auto cloned = std::make_shared<GPUTargetMachine<hw>>(engine);
     cloned->configurator = std::make_shared(*configurator);
     return cloned;
 }
@@ -95,14 +114,17 @@ ngen::HW GPUTargetMachine<hw>::get_hw() const {
 
 template <ngen::HW hw>
 snippets::CompiledSnippetPtr GPUTargetMachine<hw>::get_snippet() {
-    // OPENVINO_ASSERT(h->create_kernel() == dnnl::impl::status::success, "Failed to create jit_kernel in get_snippet()");
-    // const auto& result =
-    //     std::make_shared(std::unique_ptr(h.release()));
-    // // Note that we reset all the generated code, since it was copied into CompiledSnippetGPU
-    // h = std::make_unique();
-    // return result;
-    OPENVINO_THROW("Unimplemented!");
-    return nullptr;
+    auto compiled_snippets = std::make_shared<CompiledSnippetGPU>();
+
+    const auto& ocl_engine = cldnn::downcast(engine);
+    const auto& ocl_device = cldnn::downcast(*engine.get_device());
+
+    auto cl_kernel = cl::Kernel(m_h->getKernel(ocl_engine.get_cl_context().get(), ocl_device.get_device().get()));
+
+    compiled_snippets->kernel = std::make_shared<cldnn::ocl::ocl_kernel>(cldnn::ocl::ocl_kernel_type(cl_kernel, ocl_device.get_usm_helper()),
+                                                                         cl_kernel.getInfo<CL_KERNEL_FUNCTION_NAME>());
+
+    return compiled_snippets;
 }
 
 const uint8_t* CompiledSnippetGPU::get_code() const {
@@ -119,8 +141,8 @@ bool CompiledSnippetGPU::empty() const {
     return get_code_size() == 0;
 }
 
-GPUGenerator::GPUGenerator(ngen::HW hw)
-    : Generator(create_target_machine(hw)) {}
+GPUGenerator::GPUGenerator(cldnn::engine& engine)
+    : Generator(create_target_machine(engine)) {}
 
 GPUGenerator::GPUGenerator(const std::shared_ptr<ov::snippets::TargetMachine>& target)
     : Generator(target) {
@@ -142,16 +164,17 @@ ov::snippets::RegType GPUGenerator::get_specific_op_out_reg_type(const ov::Output<ov::Node>& out) const {
     return ov::snippets::RegType::undefined;
 }
 
-std::shared_ptr<snippets::TargetMachine> GPUGenerator::create_target_machine(ngen::HW hw) {
+std::shared_ptr<snippets::TargetMachine> GPUGenerator::create_target_machine(cldnn::engine& engine) {
+    auto hw = pluginHW2ngen(engine.get_device_info().arch);
     switch (hw) {
-    case ngen::HW::Gen9: return std::make_unique<GPUTargetMachine<ngen::HW::Gen9>>();
-    case ngen::HW::Gen11: return std::make_unique<GPUTargetMachine<ngen::HW::Gen11>>();
-    case ngen::HW::Gen12LP: return std::make_unique<GPUTargetMachine<ngen::HW::Gen12LP>>();
-    case ngen::HW::XeHP: return std::make_unique<GPUTargetMachine<ngen::HW::XeHP>>();
-    case ngen::HW::XeHPG: return std::make_unique<GPUTargetMachine<ngen::HW::XeHPG>>();
-    case ngen::HW::XeHPC: return std::make_unique<GPUTargetMachine<ngen::HW::XeHPC>>();
-    case ngen::HW::Xe2: return std::make_unique<GPUTargetMachine<ngen::HW::Xe2>>();
-    case ngen::HW::Xe3: return std::make_unique<GPUTargetMachine<ngen::HW::Xe3>>();
+    case ngen::HW::Gen9: return std::make_unique<GPUTargetMachine<ngen::HW::Gen9>>(engine);
+    case ngen::HW::Gen11: return std::make_unique<GPUTargetMachine<ngen::HW::Gen11>>(engine);
+    case ngen::HW::Gen12LP: return std::make_unique<GPUTargetMachine<ngen::HW::Gen12LP>>(engine);
+    case ngen::HW::XeHP: return std::make_unique<GPUTargetMachine<ngen::HW::XeHP>>(engine);
+    case ngen::HW::XeHPG: return std::make_unique<GPUTargetMachine<ngen::HW::XeHPG>>(engine);
+    case ngen::HW::XeHPC: return std::make_unique<GPUTargetMachine<ngen::HW::XeHPC>>(engine);
+    case ngen::HW::Xe2: return std::make_unique<GPUTargetMachine<ngen::HW::Xe2>>(engine);
+    case ngen::HW::Xe3: return std::make_unique<GPUTargetMachine<ngen::HW::Xe3>>(engine);
     default:
         OPENVINO_THROW("Unknown GPU hardware!");
     }
diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp
index 3e245c175781b4..804b687968d100 100644
--- a/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp
@@ -17,6 +17,13 @@
 #include "snippets/generator.hpp"
 #include "snippets/target_machine.hpp"
 
+#include "runtime/ocl/ocl_device.hpp"
+#include "runtime/ocl/ocl_kernel.hpp"
+#include "common_utils/kernel_generator_base.hpp"
+
+#include "intel_gpu/runtime/device_info.hpp"
+#include "intel_gpu/runtime/utils.hpp"
+
 namespace ov::intel_gpu::jit {
 
 class CompiledSnippetGPU : public snippets::CompiledSnippet {
@@ -25,12 +32,15 @@ class CompiledSnippetGPU : public snippets::CompiledSnippet {
     [[nodiscard]] size_t get_code_size() const override;
     [[nodiscard]] bool empty() const override;
     explicit CompiledSnippetGPU() = default;
+
+    std::shared_ptr<cldnn::ocl::ocl_kernel> kernel{nullptr};
+    ov::intel_gpu::KernelData kernels_data{};
 };
 
 template <ngen::HW hw>
 class GPUTargetMachine : public ov::snippets::TargetMachine {
 public:
-    explicit GPUTargetMachine();
+    explicit GPUTargetMachine(cldnn::engine& engine);
 
     [[nodiscard]] bool is_supported() const override { return true; }
     [[nodiscard]] std::shared_ptr<snippets::TargetMachine> clone() const override;
@@ -47,11 +57,12 @@ class GPUTargetMachine : public ov::snippets::TargetMachine {
 private:
     std::unique_ptr> m_h;
+    cldnn::engine& engine;
 };
 
 class GPUGenerator : public ov::snippets::Generator {
 public:
-    GPUGenerator(dnnl::impl::gpu::intel::jit::gpu_gen_t hw);
+    GPUGenerator(cldnn::engine& engine);
 
     std::shared_ptr<ov::snippets::Generator> clone() const override;
 
     ov::snippets::RegType get_specific_op_out_reg_type(const ov::Output<ov::Node>& out) const override;
@@ -59,7 +70,7 @@ class GPUGenerator : public ov::snippets::Generator {
 private:
     GPUGenerator(const std::shared_ptr<ov::snippets::TargetMachine>& target);
 
-    static std::shared_ptr<snippets::TargetMachine> create_target_machine(ngen::HW hw);
+    static std::shared_ptr<snippets::TargetMachine> create_target_machine(cldnn::engine& engine);
 };
 
 }  // namespace ov::intel_gpu::jit
diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/subgraph.cpp b/src/plugins/intel_gpu/src/graph/impls/jit/subgraph.cpp
index c2b9c753d0b13f..a51189c0e0977b 100644
--- a/src/plugins/intel_gpu/src/graph/impls/jit/subgraph.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/jit/subgraph.cpp
@@ -35,7 +35,7 @@ class SubgraphImpl : public primitive_impl {
     explicit SubgraphImpl(const program_node& node, const kernel_impl_params& impl_params)
         : primitive_impl("jit::subgraph"), m_subgraph(node.as<subgraph>().get_primitive()->ov_subgraph->clone()) {
         m_subgraph->set_generator(
-            std::make_shared<GPUGenerator>(ngenHW2pluginHW(impl_params.get_device_info().arch)));
+            std::make_shared<GPUGenerator>(impl_params.get_program().get_engine()));
 
         const auto in_blocked_shapes = getSnippetsBlockedShapes(impl_params);
         const auto precisions = getIOPrecisions(impl_params);
@@ -51,9 +51,36 @@ class SubgraphImpl : public primitive_impl {
                                            control_flow_config,
                                            getControlFlowPasses());
 
-        auto snippet = m_subgraph->generate(nullptr);
+        auto result = m_subgraph->generate(nullptr);
+
+        auto gpu_snippets = std::dynamic_pointer_cast<CompiledSnippetGPU>(result.lowering_result.compiled_snippet);
+        ocl_kernel = gpu_snippets->kernel;
+        kd = gpu_snippets->kernels_data;
+
+        update_dispatch_data(impl_params);
+        configure_arguments(impl_params);
+    }
+
+    void update_dispatch_data(const kernel_impl_params& impl_params) {
+        const auto& runtime_config = m_subgraph->update_runtime_config();
+        const auto& master_shape = runtime_config->master_shape;
+        const auto total_elements_num = std::accumulate(master_shape.begin(), master_shape.end(), static_cast<size_t>(1), std::multiplies<size_t>());
+        const auto simd = 8;
+
+        kd.params.workGroups.global = {total_elements_num, 1, 1};
+        kd.params.workGroups.local = {simd, 1, 1};
+    }
+
+    void configure_arguments(const kernel_impl_params& impl_params) {
+        for (uint32_t i = 0; i < impl_params.input_layouts.size(); i++) {
+            kd.params.arguments.push_back({cldnn::argument_desc::Types::INPUT, i});
         }
-
+
+        for (uint32_t i = 0; i < impl_params.output_layouts.size(); i++) {
+            kd.params.arguments.push_back({cldnn::argument_desc::Types::OUTPUT, i});
+        }
+    }
+
     ControlFlowPasses getControlFlowPasses() const {
         using PassPosition = ov::snippets::pass::PassPosition;
         using Place = PassPosition::Place;
@@ -77,35 +104,77 @@ class SubgraphImpl : public primitive_impl {
         return std::make_unique<SubgraphImpl>(*this);
     }
 
+    [[nodiscard]] virtual cldnn::kernel_arguments_data get_arguments(const cldnn::primitive_inst& instance) const {
+        cldnn::kernel_arguments_data args;
+
+        for (size_t i = 0; i < instance.inputs_memory_count(); i++) {
+            args.inputs.push_back(instance.input_memory_ptr(i));
+        }
+
+        if (instance.has_fused_primitives()) {
+            size_t count = instance.get_fused_mem_count();
+            for (size_t i = 0; i < count; i++) {
+                args.fused_op_inputs.push_back(instance.fused_memory(i));
+            }
+        }
+
+        for (size_t i = 0; i < instance.outputs_memory_count(); i++) {
+            args.outputs.push_back(instance.output_memory_ptr(i));
+        }
+
+        args.shape_info = instance.shape_info_memory_ptr();
+
+        auto intermediates = instance.get_intermediates_memories();
+        args.intermediates = {intermediates.begin(), intermediates.end()};
+
+        return args;
+    }
+
     void init_kernels(const kernels_cache&, const kernel_impl_params&) override {}
-    void set_arguments(primitive_inst& /*instance*/) override {}
+
+    void set_arguments(primitive_inst& instance) override {
+        auto& stream = instance.get_network().get_stream();
+
+        auto args_data = get_arguments(instance);
+
+        // Update scalars pointer
+        args_data.scalars = &kd.params.scalars;
+
+        for (const auto arg : kd.params.arguments) {
+            GPU_DEBUG_TRACE_DETAIL << "Argument: type=" << static_cast<int>(arg.t) << " idx=" << arg.index << "\n";
+        }
+
+        stream.set_arguments(*ocl_kernel, kd.params, args_data);
+    }
+
     void set_arguments(primitive_inst& /*instance*/, kernel_arguments_data& /*args*/) override {}
+
     std::vector<BufferDescriptor> get_internal_buffer_descs(const kernel_impl_params&) const override { return {}; }
 
     event::ptr execute(const std::vector<event::ptr>& events, primitive_inst& instance) override {
         auto& stream = instance.get_network().get_stream();
 
+        if (instance.can_be_optimized()) {
+            return stream.aggregate_events(events, false, instance.is_output());
+        }
+
+        // If any user of the desc's users is CPU implementation or network's output, set desc as a output event (event
+        // won't be nullptr)
+        bool needs_completion_event = instance.needs_completion_event();
+
+        auto& params = kd.params;
 
-        return stream.aggregate_events(events);
+        const auto& gws = params.workGroups.global;
+        const auto& lws = params.workGroups.local;
+
+        GPU_DEBUG_TRACE_DETAIL << "Enqueue jit kernel : gws=[" << gws[0] << ", " << gws[1] << ", " << gws[2] << "] " << "lws=["
+                               << lws[0] << ", " << lws[1] << ", " << lws[2] << "]" << (needs_completion_event ? " has_completion_event=true" : "") << '\n';
+
+        return stream.enqueue_kernel(*ocl_kernel, params, {}, events, needs_completion_event);
     }
 
     void update(primitive_inst& inst, const kernel_impl_params& impl_param) override { }
 
 private:
-    static ngen::HW ngenHW2pluginHW(gpu_arch arch) {
-        switch (arch) {
-        case gpu_arch::gen9: return ngen::HW::Gen9;
-        case gpu_arch::gen11: return ngen::HW::Gen11;
-        case gpu_arch::xe_lp: return ngen::HW::XeLP;
-        case gpu_arch::xe_hp: return ngen::HW::XeHP;
-        case gpu_arch::xe_hpg: return ngen::HW::XeHPG;
-        case gpu_arch::xe_hpc: return ngen::HW::XeHPC;
-        case gpu_arch::xe2: return ngen::HW::Xe2;
-        case gpu_arch::xe3: return ngen::HW::Xe3;
-        case gpu_arch::unknown: return ngen::HW::Unknown;
-        default:
-            OPENVINO_THROW("Unexpected arch");
-        }
-    }
     static ov::snippets::op::Subgraph::BlockedShapeVector getSnippetsBlockedShapes(const kernel_impl_params& impl_params) {
         ov::snippets::op::Subgraph::BlockedShapeVector in_blocked_shapes(impl_params.input_layouts.size());
@@ -130,6 +199,9 @@ class SubgraphImpl : public primitive_impl {
         }
         return prc;
     }
+
+    KernelData kd{};
+    ocl::ocl_kernel::ptr ocl_kernel{nullptr};
 };
 
 std::unique_ptr<primitive_impl> Subgraph::create_impl(const program_node& node, const RuntimeParams& params) const {
diff --git a/src/plugins/intel_gpu/src/plugin/graph.cpp b/src/plugins/intel_gpu/src/plugin/graph.cpp
index c482b67cb4062f..6d7f43571d5847 100644
--- a/src/plugins/intel_gpu/src/plugin/graph.cpp
+++ b/src/plugins/intel_gpu/src/plugin/graph.cpp
@@ -285,6 +285,7 @@ std::shared_ptr<ov::Model> Graph::get_runtime_model(std::vector<cldnn::primitive_info>&
         auto layer_type = op->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
         // todo: Ignore reorders only after (Const or Inputs) or before outputs.
         // Alternatively, force plain layouts for convolutions, matmuls, FCs, etc., so reorders won't be inserted.
-        if (layer_type == "Const" ||
-            layer_type == "Input" ||
-            layer_type == "Output")
+        if (layer_type == "Const" ||
+            layer_type == "Input" ||
+            layer_type == "Output" ||
+            layer_type == "Result")
             continue;
         auto &rt = op->get_rt_info();
         const auto rinfo = rt.find("layerType");