Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "jit_eltwise_emitters.hpp"


namespace ov::intel_gpu::jit {


// Emits a hardware `add` for one SIMD group: out = in0 + in1.
// in_idxs  - GRF register indices of the two source operands
// out_idxs - GRF register index of the destination
// Only f16 and f32 execution precisions are supported; anything else throws.
template <dnnl::impl::gpu::intel::jit::gpu_gen_t hw>
void jit_add_emitter<hw>::emit_impl(const std::vector<size_t>& in_idxs, const std::vector<size_t>& out_idxs) const {
    switch (this->m_exec_prc) {
    case ov::element::Type_t::f16:
        // The `template` keyword is required: m_h has a dependent type, so `add<...>` must be disambiguated.
        this->m_h->template add<ngen::half>(this->m_h->getSIMD(), ngen::GRF(out_idxs[0]), ngen::GRF(in_idxs[0]), ngen::GRF(in_idxs[1]));
        break;
    case ov::element::Type_t::f32:
        this->m_h->template add<float>(this->m_h->getSIMD(), ngen::GRF(out_idxs[0]), ngen::GRF(in_idxs[0]), ngen::GRF(in_idxs[1]));
        break;
    default:
        // Space added after the colon so the precision value doesn't fuse with the message text;
        // OPENVINO_THROW does not return, so the trailing `break` was unreachable and is removed.
        OPENVINO_THROW("[GPU] Unsupported add emitter data type: ", this->m_exec_prc);
    }
}

// Explicit instantiations for every GPU generation the plugin can target.
// NOTE(review): the arch-mapping helper returns ngen::HW::XeLP for xe_lp while
// this list instantiates Gen12LP instead — presumably XeLP aliases Gen12LP in
// ngen, so the instantiation covers it; confirm against the ngen headers.
template class jit_add_emitter<ngen::HW::Gen9>;
template class jit_add_emitter<ngen::HW::Gen11>;
template class jit_add_emitter<ngen::HW::Gen12LP>;
template class jit_add_emitter<ngen::HW::XeHP>;
template class jit_add_emitter<ngen::HW::XeHPG>;
template class jit_add_emitter<ngen::HW::XeHPC>;
template class jit_add_emitter<ngen::HW::Xe2>;
template class jit_add_emitter<ngen::HW::Xe3>;

} // namespace ov::intel_gpu::jit
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,7 @@ class jit_add_emitter : public jit_emitter<hw> {
size_t get_inputs_count() const override { return 2; };

protected:
void emit_impl(const std::vector<size_t>& in_idxs, const std::vector<size_t>& out_idxs) const override {
OPENVINO_THROW("Unimplemented");
}
void emit_impl(const std::vector<size_t>& in_idxs, const std::vector<size_t>& out_idxs) const override;
};

} // namespace ov::intel_gpu::jit
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,6 @@ class jit_emitter : public ov::snippets::Emitter {
TEMPLATE_INSTANCE(emitter, ngen::HW::XeHPC) \
TEMPLATE_INSTANCE(emitter, ngen::HW::Xe2) \
TEMPLATE_INSTANCE(emitter, ngen::HW::Xe3)


} // namespace ov::intel_gpu::jit
67 changes: 45 additions & 22 deletions src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

#include "gpu_generator.hpp"

#include "snippets/lowered/port_connector.hpp"
#include "snippets/lowered/reg_manager.hpp"
#include "snippets/runtime_configurator.hpp"
#include "snippets/op/load.hpp"
#include "snippets/op/kernel.hpp"
Expand All @@ -20,6 +22,22 @@ using namespace dnnl::impl::gpu::intel::jit;

namespace ov::intel_gpu::jit {

// Translates the plugin's device-architecture enum into the ngen hardware
// generation consumed by the JIT code generator.
// `unknown` maps to ngen::HW::Unknown; any unrecognized value throws.
static ngen::HW pluginHW2ngen(cldnn::gpu_arch arch) {
    if (arch == cldnn::gpu_arch::gen9)
        return ngen::HW::Gen9;
    if (arch == cldnn::gpu_arch::gen11)
        return ngen::HW::Gen11;
    if (arch == cldnn::gpu_arch::xe_lp)
        return ngen::HW::XeLP;
    if (arch == cldnn::gpu_arch::xe_hp)
        return ngen::HW::XeHP;
    if (arch == cldnn::gpu_arch::xe_hpg)
        return ngen::HW::XeHPG;
    if (arch == cldnn::gpu_arch::xe_hpc)
        return ngen::HW::XeHPC;
    if (arch == cldnn::gpu_arch::xe2)
        return ngen::HW::Xe2;
    if (arch == cldnn::gpu_arch::xe3)
        return ngen::HW::Xe3;
    if (arch == cldnn::gpu_arch::unknown)
        return ngen::HW::Unknown;
    OPENVINO_THROW("[GPU] Unexpected GPU arch");
}

#define CREATE_SNIPPETS_EMITTER(e_type, ...) \
{[this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr<snippets::Emitter> { \
return std::make_shared<e_type<hw>>(m_h.get(), expr, ##__VA_ARGS__); \
Expand All @@ -29,9 +47,10 @@ namespace ov::intel_gpu::jit {
}}

template <ngen::HW hw>
GPUTargetMachine<hw>::GPUTargetMachine()
GPUTargetMachine<hw>::GPUTargetMachine(cldnn::engine& engine)
: TargetMachine(std::make_shared<ov::snippets::RuntimeConfigurator>(std::make_shared<ov::snippets::RuntimeConfig>())),
m_h(std::make_unique<jit_snippet_t<hw>>()) {
m_h(std::make_unique<jit_snippet_t<hw>>()),
engine(engine) {
jitters[op::v0::Parameter::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter);
jitters[op::v0::Result::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter);

Expand All @@ -44,7 +63,7 @@ GPUTargetMachine<hw>::GPUTargetMachine()

template <ngen::HW hw>
std::shared_ptr<snippets::TargetMachine> GPUTargetMachine<hw>::clone() const {
const auto cloned = std::make_shared<GPUTargetMachine<hw>>();
const auto cloned = std::make_shared<GPUTargetMachine<hw>>(engine);
cloned->configurator = std::make_shared<ov::snippets::RuntimeConfigurator>(*configurator);
return cloned;
}
Expand Down Expand Up @@ -95,14 +114,17 @@ ngen::HW GPUTargetMachine<hw>::get_hw() const {

template <ngen::HW hw>
snippets::CompiledSnippetPtr GPUTargetMachine<hw>::get_snippet() {
// OPENVINO_ASSERT(h->create_kernel() == dnnl::impl::status::success, "Failed to create jit_kernel in get_snippet()");
// const auto& result =
// std::make_shared<CompiledSnippetGPU>(std::unique_ptr<dnnl::impl::cpu::x64::jit_generator_t>(h.release()));
// // Note that we reset all the generated code, since it was copied into CompiledSnippetGPU
// h = std::make_unique<jit_snippet>();
// return result;
OPENVINO_THROW("Unimplemented!");
return nullptr;
auto compiled_snippets = std::make_shared<CompiledSnippetGPU>();

const auto& ocl_engine = cldnn::downcast<cldnn::ocl::ocl_engine>(engine);
const auto& ocl_device = cldnn::downcast<cldnn::ocl::ocl_device>(*engine.get_device());

auto cl_kernel = cl::Kernel(m_h->getKernel(ocl_engine.get_cl_context().get(), ocl_device.get_device().get()));

compiled_snippets->kernel = std::make_shared<cldnn::ocl::ocl_kernel>(cldnn::ocl::ocl_kernel_type(cl_kernel, ocl_device.get_usm_helper()),
cl_kernel.getInfo<CL_KERNEL_FUNCTION_NAME>());

return compiled_snippets;
}

const uint8_t* CompiledSnippetGPU::get_code() const {
Expand All @@ -119,8 +141,8 @@ bool CompiledSnippetGPU::empty() const {
return get_code_size() == 0;
}

GPUGenerator::GPUGenerator(ngen::HW hw)
: Generator(create_target_machine(hw)) {}
GPUGenerator::GPUGenerator(cldnn::engine& engine)
: Generator(create_target_machine(engine)) {}

GPUGenerator::GPUGenerator(const std::shared_ptr<ov::snippets::TargetMachine>& target)
: Generator(target) {
Expand All @@ -142,16 +164,17 @@ ov::snippets::RegType GPUGenerator::get_specific_op_out_reg_type(const ov::Outpu
return ov::snippets::RegType::undefined;
}

std::shared_ptr<ov::snippets::TargetMachine> GPUGenerator::create_target_machine(ngen::HW hw) {
std::shared_ptr<ov::snippets::TargetMachine> GPUGenerator::create_target_machine(cldnn::engine& engine) {
auto hw = pluginHW2ngen(engine.get_device_info().arch);
switch (hw) {
case ngen::HW::Gen9: return std::make_unique<GPUTargetMachine<ngen::HW::Gen9>>();
case ngen::HW::Gen11: return std::make_unique<GPUTargetMachine<ngen::HW::Gen11>>();
case ngen::HW::Gen12LP: return std::make_unique<GPUTargetMachine<ngen::HW::Gen12LP>>();
case ngen::HW::XeHP: return std::make_unique<GPUTargetMachine<ngen::HW::XeHP>>();
case ngen::HW::XeHPG: return std::make_unique<GPUTargetMachine<ngen::HW::XeHPG>>();
case ngen::HW::XeHPC: return std::make_unique<GPUTargetMachine<ngen::HW::XeHPC>>();
case ngen::HW::Xe2: return std::make_unique<GPUTargetMachine<ngen::HW::Xe2>>();
case ngen::HW::Xe3: return std::make_unique<GPUTargetMachine<ngen::HW::Xe3>>();
case ngen::HW::Gen9: return std::make_unique<GPUTargetMachine<ngen::HW::Gen9>>(engine);
case ngen::HW::Gen11: return std::make_unique<GPUTargetMachine<ngen::HW::Gen11>>(engine);
case ngen::HW::Gen12LP: return std::make_unique<GPUTargetMachine<ngen::HW::Gen12LP>>(engine);
case ngen::HW::XeHP: return std::make_unique<GPUTargetMachine<ngen::HW::XeHP>>(engine);
case ngen::HW::XeHPG: return std::make_unique<GPUTargetMachine<ngen::HW::XeHPG>>(engine);
case ngen::HW::XeHPC: return std::make_unique<GPUTargetMachine<ngen::HW::XeHPC>>(engine);
case ngen::HW::Xe2: return std::make_unique<GPUTargetMachine<ngen::HW::Xe2>>(engine);
case ngen::HW::Xe3: return std::make_unique<GPUTargetMachine<ngen::HW::Xe3>>(engine);
default:
OPENVINO_THROW("Unknown GPU hardware!");
}
Expand Down
17 changes: 14 additions & 3 deletions src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,13 @@
#include "snippets/generator.hpp"
#include "snippets/target_machine.hpp"

#include "runtime/ocl/ocl_device.hpp"
#include "runtime/ocl/ocl_kernel.hpp"
#include "common_utils/kernel_generator_base.hpp"

#include "intel_gpu/runtime/device_info.hpp"
#include "intel_gpu/runtime/utils.hpp"

namespace ov::intel_gpu::jit {

class CompiledSnippetGPU : public snippets::CompiledSnippet {
Expand All @@ -25,12 +32,15 @@ class CompiledSnippetGPU : public snippets::CompiledSnippet {
[[nodiscard]] size_t get_code_size() const override;
[[nodiscard]] bool empty() const override;
explicit CompiledSnippetGPU() = default;

std::shared_ptr<cldnn::ocl::ocl_kernel> kernel{nullptr};
ov::intel_gpu::KernelData kernels_data{};
};

template <ngen::HW hw>
class GPUTargetMachine : public ov::snippets::TargetMachine {
public:
explicit GPUTargetMachine();
explicit GPUTargetMachine(cldnn::engine& engine);

[[nodiscard]] bool is_supported() const override { return true; }
[[nodiscard]] std::shared_ptr<snippets::TargetMachine> clone() const override;
Expand All @@ -47,19 +57,20 @@ class GPUTargetMachine : public ov::snippets::TargetMachine {

private:
std::unique_ptr<jit_snippet_t<hw>> m_h;
cldnn::engine& engine;
};

class GPUGenerator : public ov::snippets::Generator {
public:
GPUGenerator(dnnl::impl::gpu::intel::jit::gpu_gen_t hw);
GPUGenerator(cldnn::engine& engine);
std::shared_ptr<Generator> clone() const override;

ov::snippets::RegType get_specific_op_out_reg_type(const ov::Output<ov::Node>& out) const override;

private:
GPUGenerator(const std::shared_ptr<ov::snippets::TargetMachine>& target);

static std::shared_ptr<ov::snippets::TargetMachine> create_target_machine(ngen::HW hw);
static std::shared_ptr<ov::snippets::TargetMachine> create_target_machine(cldnn::engine& engine);
};

} // namespace ov::intel_gpu::jit
112 changes: 92 additions & 20 deletions src/plugins/intel_gpu/src/graph/impls/jit/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class SubgraphImpl : public primitive_impl {
explicit SubgraphImpl(const program_node& node, const kernel_impl_params& impl_params)
: primitive_impl("jit::subgraph"), m_subgraph(node.as<subgraph>().get_primitive()->ov_subgraph->clone()) {
m_subgraph->set_generator(
std::make_shared<ov::intel_gpu::jit::GPUGenerator>(ngenHW2pluginHW(impl_params.get_device_info().arch)));
std::make_shared<ov::intel_gpu::jit::GPUGenerator>(impl_params.get_program().get_engine()));

const auto in_blocked_shapes = getSnippetsBlockedShapes(impl_params);
const auto precisions = getIOPrecisions(impl_params);
Expand All @@ -51,9 +51,36 @@ class SubgraphImpl : public primitive_impl {
control_flow_config,
getControlFlowPasses());

auto snippet = m_subgraph->generate(nullptr);
auto result = m_subgraph->generate(nullptr);

auto gpu_snippets = std::dynamic_pointer_cast<CompiledSnippetGPU>(result.lowering_result.compiled_snippet);
ocl_kernel = gpu_snippets->kernel;
kd = gpu_snippets->kernels_data;

update_dispatch_data(impl_params);
configure_arguments(impl_params);
}

// Recomputes the kernel's ND-range from the subgraph's runtime configuration:
// one work-item per element of the master shape, SIMD-wide work-groups.
// `impl_params` is currently unused but kept for interface symmetry with update().
void update_dispatch_data(const kernel_impl_params& impl_params) {
    const auto& runtime_config = m_subgraph->update_runtime_config();
    const auto& master_shape = runtime_config->master_shape;
    // Total element count = product of all master-shape dimensions.
    const auto total_elements_num =
        std::accumulate(master_shape.begin(), master_shape.end(), static_cast<size_t>(1), std::multiplies<size_t>());
    // Was `const auto simd = 8` (deduced int): give the constant an explicit
    // unsigned type matching the work-group size fields.
    // NOTE(review): SIMD width is hard-coded; presumably it must match the
    // width the nGEN emitters were generated with — confirm.
    constexpr size_t simd = 8;

    // NOTE(review): global size is not rounded up to a multiple of `simd` —
    // confirm the runtime pads it or that shapes are guaranteed divisible.
    kd.params.workGroups.global = {total_elements_num, 1, 1};
    kd.params.workGroups.local = {simd, 1, 1};
}

// Registers one kernel argument per primitive input, followed by one per output,
// in the order the generated kernel expects them.
void configure_arguments(const kernel_impl_params& impl_params) {
    auto& args = kd.params.arguments;

    const auto num_inputs = static_cast<uint32_t>(impl_params.input_layouts.size());
    for (uint32_t idx = 0; idx < num_inputs; ++idx) {
        args.push_back({cldnn::argument_desc::Types::INPUT, idx});
    }

    const auto num_outputs = static_cast<uint32_t>(impl_params.output_layouts.size());
    for (uint32_t idx = 0; idx < num_outputs; ++idx) {
        args.push_back({cldnn::argument_desc::Types::OUTPUT, idx});
    }
}

ControlFlowPasses getControlFlowPasses() const {
using PassPosition = ov::snippets::pass::PassPosition;
using Place = PassPosition::Place;
Expand All @@ -77,35 +104,77 @@ class SubgraphImpl : public primitive_impl {
return std::make_unique<SubgraphImpl>(*this);
}

// Collects the runtime memory handles needed to launch the kernel for
// `instance`: inputs, fused-op inputs, outputs, shape-info buffer and
// intermediate buffers.
[[nodiscard]] virtual cldnn::kernel_arguments_data get_arguments(const cldnn::primitive_inst& instance) const {
    cldnn::kernel_arguments_data args;

    const size_t num_inputs = instance.inputs_memory_count();
    for (size_t idx = 0; idx < num_inputs; ++idx)
        args.inputs.push_back(instance.input_memory_ptr(idx));

    if (instance.has_fused_primitives()) {
        const size_t fused_count = instance.get_fused_mem_count();
        for (size_t idx = 0; idx < fused_count; ++idx)
            args.fused_op_inputs.push_back(instance.fused_memory(idx));
    }

    const size_t num_outputs = instance.outputs_memory_count();
    for (size_t idx = 0; idx < num_outputs; ++idx)
        args.outputs.push_back(instance.output_memory_ptr(idx));

    args.shape_info = instance.shape_info_memory_ptr();

    const auto intermediates = instance.get_intermediates_memories();
    args.intermediates.assign(intermediates.begin(), intermediates.end());

    return args;
}

void init_kernels(const kernels_cache&, const kernel_impl_params&) override {}
void set_arguments(primitive_inst& /*instance*/) override {}

// Binds the instance's memory buffers (and the scalar parameters stored in
// the kernel data) to the previously compiled OpenCL kernel.
void set_arguments(primitive_inst& instance) override {
    auto& stream = instance.get_network().get_stream();

    auto args_data = get_arguments(instance);

    // Update scalars pointer
    args_data.scalars = &kd.params.scalars;

    // Take the descriptor by const reference: `const auto arg` copied every
    // element on each iteration of this trace loop for no reason.
    for (const auto& arg : kd.params.arguments) {
        GPU_DEBUG_TRACE_DETAIL << "Argument: type=" << static_cast<int>(arg.t) << " idx=" << arg.index << "\n";
    }

    stream.set_arguments(*ocl_kernel, kd.params, args_data);
}

void set_arguments(primitive_inst& /*instance*/, kernel_arguments_data& /*args*/) override {}

std::vector<BufferDescriptor> get_internal_buffer_descs(const kernel_impl_params&) const override { return {}; }

// Enqueues the compiled JIT kernel with the dispatch parameters prepared in
// update_dispatch_data(); returns the completion event (or an aggregate of
// the incoming events when the instance is optimized out).
event::ptr execute(const std::vector<event::ptr>& events, primitive_inst& instance) override {
    auto& stream = instance.get_network().get_stream();
    // Optimized-out instances perform no work; just merge the incoming events.
    if (instance.can_be_optimized()) {
        return stream.aggregate_events(events, false, instance.is_output());
    }

    // If any user of the desc's users is CPU implementation or network's output, set desc as a output event (event
    // won't be nullptr)
    bool needs_completion_event = instance.needs_completion_event();

    auto& params = kd.params;

    // Removed a stray `return stream.aggregate_events(events);` left over from
    // the previous stub implementation — it returned before the kernel was
    // enqueued, making everything below unreachable.
    const auto& gws = params.workGroups.global;
    const auto& lws = params.workGroups.local;

    GPU_DEBUG_TRACE_DETAIL << "Enqueue jit kernel : gws=[" << gws[0] << ", " << gws[1] << ", " << gws[2] << "] " << "lws=["
        << lws[0] << ", " << lws[1] << ", " << lws[2] << "]" << (needs_completion_event ? " has_completion_event=true" : "") << '\n';

    return stream.enqueue_kernel(*ocl_kernel, params, {}, events, needs_completion_event);
}

void update(primitive_inst& inst, const kernel_impl_params& impl_param) override { }

private:
// Maps the plugin's gpu_arch enum onto an ngen::HW generation.
// NOTE(review): the name is inverted — it converts plugin arch -> ngen HW,
// not the other way around; a correctly named duplicate (pluginHW2ngen) now
// exists in gpu_generator.cpp, so this copy is presumably dead — confirm.
static ngen::HW ngenHW2pluginHW(gpu_arch arch) {
    switch (arch) {
    case gpu_arch::gen9: return ngen::HW::Gen9;
    case gpu_arch::gen11: return ngen::HW::Gen11;
    case gpu_arch::xe_lp: return ngen::HW::XeLP;
    case gpu_arch::xe_hp: return ngen::HW::XeHP;
    case gpu_arch::xe_hpg: return ngen::HW::XeHPG;
    case gpu_arch::xe_hpc: return ngen::HW::XeHPC;
    case gpu_arch::xe2: return ngen::HW::Xe2;
    case gpu_arch::xe3: return ngen::HW::Xe3;
    case gpu_arch::unknown: return ngen::HW::Unknown;
    default:
        OPENVINO_THROW("Unexpected arch");
    }
}

static ov::snippets::op::Subgraph::BlockedShapeVector getSnippetsBlockedShapes(const kernel_impl_params& impl_params) {
ov::snippets::op::Subgraph::BlockedShapeVector in_blocked_shapes(impl_params.input_layouts.size());
Expand All @@ -130,6 +199,9 @@ class SubgraphImpl : public primitive_impl {
}
return prc;
}

KernelData kd{};
ocl::ocl_kernel::ptr ocl_kernel{nullptr};
};

std::unique_ptr<primitive_impl> Subgraph::create_impl(const program_node& node, const RuntimeParams& params) const {
Expand Down
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/plugin/graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,7 @@ std::shared_ptr<ov::Model> Graph::get_runtime_model(std::vector<cldnn::primitive
{ "reduce_log_sum", "ReduceLogSum" },
{ "reduce_log_sum_exp", "ReduceLogSumExp" },
{ "space_to_depth", "SpaceToDepth" },
{ "subgraph", "Subgraph" },
};

if (type_n2l.find(cldnn_name) != type_n2l.end())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, Add,
::testing::Combine(
::testing::ValuesIn(inShapesStatic1),
::testing::ValuesIn(inShapesStatic2),
::testing::ValuesIn({ov::element::f16}),
::testing::ValuesIn({ov::element::f32}),
::testing::Values(1), // Add
::testing::Values(1), // Subgraph is created, since the inputs are followed by converts
::testing::Values(ov::test::utils::DEVICE_GPU)),
Expand Down
Loading
Loading