Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#include "snippets/op/subgraph.hpp"
#include "primitive.hpp"

#include "ocl/ocl_engine.hpp"

namespace cldnn {

/// @brief Subgraph primitive
Expand All @@ -19,7 +21,8 @@ struct subgraph : public primitive_base<subgraph> {
/// @param id This primitive id
/// @param inputs Input primitive ids
/// @param subgraph Original subgraph node
subgraph(const primitive_id& id, const std::vector<input_info>& inputs, const std::shared_ptr<ov::snippets::op::Subgraph>& subgraph)
subgraph(const primitive_id& id, const std::vector<input_info>& inputs,
const std::shared_ptr<ov::snippets::op::Subgraph>& subgraph)
: primitive_base(id, inputs), ov_subgraph(subgraph->clone()) {}

std::shared_ptr<ov::snippets::op::Subgraph> ov_subgraph;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,43 @@ inline std::istream& operator>>(std::istream& is, DumpTensors& val) {
return is;
}

/**
 * @brief Enum to define possible snippets mode hints.
 *
 * Controls tokenization of Snippets subgraphs (see the `snippets_mode`
 * property declared below; the plugin default is DISABLE).
 */
enum class SnippetsMode : uint8_t {
    ENABLE = 0,           //!< Snippets tokenization is enabled
    IGNORE_CALLBACK = 1,  //!< Tokenization enabled; the filtering callback is ignored (NOTE(review): confirm against plugin usage)
    DISABLE = 2,          //!< Snippets tokenization is disabled
};

/// Streams the textual name of a SnippetsMode value; throws on out-of-range input.
inline std::ostream& operator<<(std::ostream& os, const SnippetsMode& mode) {
    const char* name = nullptr;
    switch (mode) {
    case SnippetsMode::ENABLE:
        name = "ENABLE";
        break;
    case SnippetsMode::IGNORE_CALLBACK:
        name = "IGNORE_CALLBACK";
        break;
    case SnippetsMode::DISABLE:
        name = "DISABLE";
        break;
    default:
        OPENVINO_THROW("Unsupported snippets mode value");
    }
    return os << name;
}

/// Parses a SnippetsMode from its textual name; throws on an unknown token.
inline std::istream& operator>>(std::istream& is, SnippetsMode& mode) {
    std::string token;
    is >> token;
    if (token == "DISABLE") {
        mode = SnippetsMode::DISABLE;
    } else if (token == "IGNORE_CALLBACK") {
        mode = SnippetsMode::IGNORE_CALLBACK;
    } else if (token == "ENABLE") {
        mode = SnippetsMode::ENABLE;
    } else {
        OPENVINO_THROW("Unsupported snippets mode: ", token);
    }
    return is;
}

/**
* @brief Defines queue type that must be used for model execution
*/
Expand Down Expand Up @@ -168,6 +205,7 @@ static constexpr Property<ShapePredictor::Settings, ov::PropertyMutability::RW>
static constexpr Property<std::vector<std::string>, ov::PropertyMutability::RW> load_dump_raw_binary{"GPU_LOAD_DUMP_RAW_BINARY"};
static constexpr Property<bool, ov::PropertyMutability::RW> could_use_flashattn_v2{"GPU_COULD_USE_FLASHATTN_V2"};
static constexpr Property<uint64_t, PropertyMutability::RW> dynamic_quantization_group_size_max{"GPU_DYNAMIC_QUANTIZATION_GROUP_SIZE_MAX"};
static constexpr Property<SnippetsMode, PropertyMutability::RW> snippets_mode{"SNIPPETS_MODE"};
} // namespace ov::intel_gpu

namespace cldnn {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ OV_CONFIG_RELEASE_OPTION(ov::hint, activations_scale_factor, -1.0f, "Scalar floa
OV_CONFIG_RELEASE_OPTION(ov::internal, enable_lp_transformations, false, "Enable/Disable Low precision transformations set")
OV_CONFIG_RELEASE_OPTION(ov::intel_gpu, config_file, "", "Path to custom layers config file")
OV_CONFIG_RELEASE_OPTION(ov::hint, model, nullptr, "Shared pointer to the ov::Model")
OV_CONFIG_RELEASE_OPTION(ov::intel_gpu, snippets_mode, ov::intel_gpu::SnippetsMode::DISABLE, "Define tokenization mode for Snippets.")

OV_CONFIG_RELEASE_INTERNAL_OPTION(ov::intel_gpu, shape_predictor_settings, {10, 16 * 1024, 2, 1.1f}, "Preallocation settings")
OV_CONFIG_RELEASE_INTERNAL_OPTION(ov::intel_gpu, queue_type, QueueTypes::out_of_order, "Type of the queue that must be used for model execution. May be in-order or out-of-order")
Expand Down
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/src/graph/impls/jit/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ set(TARGET_NAME "openvino_intel_gpu_jit_obj")

ov_gpu_add_backend_target(
NAME ${TARGET_NAME}
LINK_LIBRARIES onednn_gpu_tgt
LINK_LIBRARIES onednn_gpu_tgt openvino::snippets
)

ov_build_target_faster(${TARGET_NAME} PCH PCH_EXCLUDE detection_output.cpp)
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "jit_emitter.hpp"


namespace ov::intel_gpu::jit {

/// @brief Emitter for ov::op::v1::Add inside GPU snippets kernels.
/// Currently only declares the supported precisions; code emission falls back
/// to the base jit_emitter (which throws).
template <dnnl::impl::gpu::intel::jit::gpu_gen_t hw>
class jit_add_emitter : public jit_emitter<hw> {
public:
    /// @param host nGEN code generator that receives emitted instructions (non-owning).
    /// @param exec_prc Precision the emitted computation executes in (f32 by default).
    jit_add_emitter(dnnl::impl::gpu::intel::jit::ngen_code_generator_t<hw>* host,
                    ov::element::Type exec_prc = ov::element::f32) : jit_emitter<hw>(host, exec_prc) {}

    /// @brief Supported input precision combinations: two same-typed inputs, f32 or f16.
    static std::set<std::vector<ov::element::Type>> get_supported_precisions(
        [[maybe_unused]] const std::shared_ptr<ov::Node>& node) {
        return {{element::f32, element::f32}, {element::f16, element::f16}};
    }
};

} // namespace ov::intel_gpu::jit
46 changes: 46 additions & 0 deletions src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_emitter.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "snippets/emitter.hpp"

#include "graph/impls/jit/jit_generator.hpp"

#include "openvino/core/type/element_type.hpp"
#include "openvino/core/node.hpp"

namespace ov::intel_gpu::jit {

/// @brief Common base for GPU snippets JIT emitters.
/// Stores a non-owning pointer to the nGEN code generator and the execution
/// precision; concrete emitters override emit_code_impl().
template <dnnl::impl::gpu::intel::jit::gpu_gen_t hw>
class jit_emitter : public ov::snippets::Emitter {
public:
    /// @param host nGEN code generator that receives emitted instructions (non-owning).
    /// @param exec_prc Precision the emitted code computes in (f32 by default).
    jit_emitter(dnnl::impl::gpu::intel::jit::ngen_code_generator_t<hw>* host,
                ov::element::Type exec_prc = ov::element::f32) :
                m_h(host),
                m_exec_prc(exec_prc) {}

    /**
     * @brief Returns supported precisions.
     * Precisions are ordered, the first bigger bitness precision with the same type will be selected.
     * Empty collection means the emitter supports any input precisions.
     */
    static std::set<std::vector<ov::element::Type>> get_supported_precisions(
        [[maybe_unused]] const std::shared_ptr<ov::Node>& node = nullptr) {
        return {};
    }

protected:
    /// Placeholder: the base emitter emits no code and always throws.
    void emit_code_impl([[maybe_unused]] const std::vector<size_t>& in,
                        [[maybe_unused]] const std::vector<size_t>& out,
                        [[maybe_unused]] const std::vector<size_t>& pool,
                        [[maybe_unused]] const std::vector<size_t>& gpr) const override {
        OPENVINO_THROW("Unimplemented");
    }

    // Non-owning: the host generator must outlive the emitter.
    dnnl::impl::gpu::intel::jit::ngen_code_generator_t<hw>* m_h;
    ov::element::Type m_exec_prc;
};

} // namespace ov::intel_gpu::jit
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "jit_emitter.hpp"


namespace ov::intel_gpu::jit {

/// @brief Emitter for ops that produce no code (e.g. Parameter/Result — see
/// GPUTargetMachine's jitters table in gpu_generator.cpp).
template <dnnl::impl::gpu::intel::jit::gpu_gen_t hw>
class jit_nop_emitter : public jit_emitter<hw> {
public:
    /// @param host nGEN code generator (non-owning).
    /// @param exec_prc Execution precision (unused by a nop; f32 by default).
    jit_nop_emitter(dnnl::impl::gpu::intel::jit::ngen_code_generator_t<hw>* host,
                    ov::element::Type exec_prc = ov::element::f32) : jit_emitter<hw>(host, exec_prc) {}

    /// @brief Empty set: any input precisions are accepted.
    static std::set<std::vector<ov::element::Type>> get_supported_precisions(
        [[maybe_unused]] const std::shared_ptr<ov::Node>& node) {
        return {};
    }
};

} // namespace ov::intel_gpu::jit
135 changes: 135 additions & 0 deletions src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "gpu_generator.hpp"

#include "snippets/runtime_configurator.hpp"
#include "emitters/jit_eltwise_emitters.hpp"
#include "emitters/jit_snippets_emitters.hpp"

#include "openvino/op/add.hpp"


using namespace dnnl::impl::gpu::intel::jit;

namespace ov::intel_gpu::jit {

// Builds a {factory, precision-query} pair for the jitters table:
//  - the first lambda captures `this` and constructs the emitter on top of the
//    target machine's code generator m_h (extra ctor args forwarded via __VA_ARGS__);
//  - the second lambda forwards the supported-precision query to the emitter type.
#define CREATE_SNIPPETS_EMITTER(e_type, ...)                                                          \
    {[this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr<snippets::Emitter> {     \
         return std::make_shared<e_type<hw>>(m_h.get(), ##__VA_ARGS__);                               \
     },                                                                                               \
     [](const std::shared_ptr<ov::Node>& n) -> std::set<std::vector<element::Type>> {                 \
         return e_type<hw>::get_supported_precisions(n);                                              \
     }}

// Constructs the target machine with a fresh runtime configurator and an
// owned nGEN code generator, then registers the per-op emitter factories.
template <ngen::HW hw>
GPUTargetMachine<hw>::GPUTargetMachine()
    : TargetMachine(std::make_shared<ov::snippets::RuntimeConfigurator>(std::make_shared<ov::snippets::RuntimeConfig>())),
      m_h(std::make_unique<jit_snippet_t<hw>>()) {
    // Parameter/Result emit no code; Add is the only computational op wired up so far.
    jitters[op::v0::Parameter::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter);
    jitters[op::v0::Result::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter);
    jitters[op::v1::Add::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_add_emitter);
}

// Clones the machine: the default constructor rebuilds the jitters table and
// code generator, then the runtime configurator is deep-copied from *this.
template <ngen::HW hw>
std::shared_ptr<snippets::TargetMachine> GPUTargetMachine<hw>::clone() const {
    const auto cloned = std::make_shared<GPUTargetMachine<hw>>();
    cloned->configurator = std::make_shared<ov::snippets::RuntimeConfigurator>(*configurator);
    return cloned;
}

// SIMD width as reported by the underlying nGEN generator.
template <ngen::HW hw>
size_t GPUTargetMachine<hw>::get_lanes() const {
    assert(m_h);  // m_h is created in the constructor and never reset
    return m_h->getSIMD();
}

// Register queries are not implemented yet; each stub throws. The trailing
// `return {};` lines are unreachable (OPENVINO_THROW throws) and only silence
// missing-return diagnostics on compilers that don't see the macro as noreturn.
template <ngen::HW hw>
std::vector<snippets::Reg> GPUTargetMachine<hw>::get_abi_arg_regs() const {
    OPENVINO_THROW("Unimplemented!");
    return {};
}

template <ngen::HW hw>
std::vector<snippets::Reg> GPUTargetMachine<hw>::get_gp_reg_pool() const {
    OPENVINO_THROW("Unimplemented!");
    return {};
}

template <ngen::HW hw>
std::vector<snippets::Reg> GPUTargetMachine<hw>::get_vec_reg_pool() const {
    OPENVINO_THROW("Unimplemented!");
    return {};
}

// Returns the compile-time hardware id this specialization was instantiated for.
template <ngen::HW hw>
ngen::HW GPUTargetMachine<hw>::get_hw() const {
    return hw;
}

// Placeholder: kernel finalization is not implemented yet. The commented-out
// code sketches the intended flow (mirroring the CPU plugin): create the kernel,
// wrap it in CompiledSnippetGPU, and reset the generator for the next snippet.
template <ngen::HW hw>
snippets::CompiledSnippetPtr GPUTargetMachine<hw>::get_snippet() {
    // OPENVINO_ASSERT(h->create_kernel() == dnnl::impl::status::success, "Failed to create jit_kernel in get_snippet()");
    // const auto& result =
    //     std::make_shared<CompiledSnippetGPU>(std::unique_ptr<dnnl::impl::cpu::x64::jit_generator_t>(h.release()));
    // // Note that we reset all the generated code, since it was copied into CompiledSnippetGPU
    // h = std::make_unique<jit_snippet>();
    // return result;
    OPENVINO_THROW("Unimplemented!");
    return nullptr;  // unreachable
}

// Placeholder: compiled-code access is not implemented yet.
const uint8_t* CompiledSnippetGPU::get_code() const {
    //return h_compiled->jit_ker();
    OPENVINO_THROW("Unimplemented!");
    return nullptr;  // unreachable
}

size_t CompiledSnippetGPU::get_code_size() const {
    OPENVINO_THROW("Unimplemented!");
}

// NOTE(review): empty() delegates to get_code_size(), which currently always
// throws — so empty() cannot return; revisit once compilation is implemented.
bool CompiledSnippetGPU::empty() const {
    return get_code_size() == 0;
}

// Public constructor: builds the target machine matching the runtime HW id.
GPUGenerator::GPUGenerator(ngen::HW hw)
    : Generator(create_target_machine(hw)) {}

// Private constructor used by clone(). The exact-typeid checks accept only the
// known GPUTargetMachine specializations (exact type, not subclasses).
GPUGenerator::GPUGenerator(const std::shared_ptr<ov::snippets::TargetMachine>& target)
    : Generator(target) {
    OPENVINO_ASSERT(typeid(*target) == typeid(GPUTargetMachine<ngen::HW::Gen9>) ||
                    typeid(*target) == typeid(GPUTargetMachine<ngen::HW::Gen11>) ||
                    typeid(*target) == typeid(GPUTargetMachine<ngen::HW::Gen12LP>) ||
                    typeid(*target) == typeid(GPUTargetMachine<ngen::HW::XeHP>) ||
                    typeid(*target) == typeid(GPUTargetMachine<ngen::HW::XeHPG>) ||
                    typeid(*target) == typeid(GPUTargetMachine<ngen::HW::XeHPC>) ||
                    typeid(*target) == typeid(GPUTargetMachine<ngen::HW::Xe2>) ||
                    typeid(*target) == typeid(GPUTargetMachine<ngen::HW::Xe3>));
}

// Clones the generator over a cloned target machine. Plain `new` is used
// because the taken constructor is private, so make_shared cannot reach it.
std::shared_ptr<snippets::Generator> GPUGenerator::clone() const {
    return std::shared_ptr<GPUGenerator>(new GPUGenerator(target->clone()));
}

/// No GPU-specific register typing yet: defer to the common snippets logic by
/// returning `undefined` for every output.
ov::snippets::RegType GPUGenerator::get_specific_op_out_reg_type(
    [[maybe_unused]] const ov::Output<ov::Node>& out) const {
    return ov::snippets::RegType::undefined;
}

/// Maps the runtime HW id to the matching compile-time GPUTargetMachine
/// specialization; throws on an unknown id.
/// make_shared is used (the return type is shared_ptr): one allocation for
/// object + control block, instead of make_unique followed by a converting
/// move into shared_ptr, which allocates the control block separately.
std::shared_ptr<ov::snippets::TargetMachine> GPUGenerator::create_target_machine(ngen::HW hw) {
    switch (hw) {
    case ngen::HW::Gen9:    return std::make_shared<GPUTargetMachine<ngen::HW::Gen9>>();
    case ngen::HW::Gen11:   return std::make_shared<GPUTargetMachine<ngen::HW::Gen11>>();
    case ngen::HW::Gen12LP: return std::make_shared<GPUTargetMachine<ngen::HW::Gen12LP>>();
    case ngen::HW::XeHP:    return std::make_shared<GPUTargetMachine<ngen::HW::XeHP>>();
    case ngen::HW::XeHPG:   return std::make_shared<GPUTargetMachine<ngen::HW::XeHPG>>();
    case ngen::HW::XeHPC:   return std::make_shared<GPUTargetMachine<ngen::HW::XeHPC>>();
    case ngen::HW::Xe2:     return std::make_shared<GPUTargetMachine<ngen::HW::Xe2>>();
    case ngen::HW::Xe3:     return std::make_shared<GPUTargetMachine<ngen::HW::Xe3>>();
    default:
        OPENVINO_THROW("Unknown GPU hardware!");
    }
}

} // namespace ov::intel_gpu::jit
65 changes: 65 additions & 0 deletions src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <cstddef>
#include <cstdint>
#include <memory>
#include <vector>

#include "jit_generator.hpp"
#include "gpu/intel/jit/generator.hpp"
#include "openvino/core/node.hpp"
#include "openvino/core/node_output.hpp"
#include "snippets/emitter.hpp"
#include "snippets/generator.hpp"
#include "snippets/target_machine.hpp"

namespace ov::intel_gpu::jit {

class CompiledSnippetGPU : public snippets::CompiledSnippet {
public:
[[nodiscard]] const uint8_t* get_code() const override;
[[nodiscard]] size_t get_code_size() const override;
[[nodiscard]] bool empty() const override;
explicit CompiledSnippetGPU() = default;
};

/// @brief Snippets TargetMachine specialized for one GPU generation (`hw`).
/// Owns the nGEN-based code generator used for kernel emission.
template <ngen::HW hw>
class GPUTargetMachine : public ov::snippets::TargetMachine {
public:
    // NOTE(review): `explicit` on a zero-parameter constructor has no effect
    // on conversions (only forbids `= {}` copy-list-init); consider dropping.
    explicit GPUTargetMachine();

    /// Always true: unknown hardware is rejected earlier, when the machine is
    /// created in GPUGenerator::create_target_machine.
    [[nodiscard]] bool is_supported() const override { return true; }
    /// Deep-copies the runtime configurator into a freshly constructed machine.
    [[nodiscard]] std::shared_ptr<snippets::TargetMachine> clone() const override;

    /// SIMD width reported by the underlying generator.
    [[nodiscard]] size_t get_lanes() const override;

    // Register queries — currently unimplemented (the definitions throw).
    [[nodiscard]] std::vector<snippets::Reg> get_abi_arg_regs() const override;
    [[nodiscard]] std::vector<snippets::Reg> get_gp_reg_pool() const override;
    [[nodiscard]] std::vector<snippets::Reg> get_vec_reg_pool() const override;

    /// Compile-time hardware id of this specialization.
    [[nodiscard]] dnnl::impl::gpu::intel::jit::gpu_gen_t get_hw() const;

    /// Finalizes and returns the compiled kernel (currently unimplemented).
    snippets::CompiledSnippetPtr get_snippet() override;

private:
    // Owning handle to the nGEN code generator; makes the class move-only.
    std::unique_ptr<jit_snippet_t<hw>> m_h;
};

/// @brief Snippets Generator backed by the GPU target machines above.
class GPUGenerator : public ov::snippets::Generator {
public:
    // NOTE(review): single-argument constructor is implicit; consider marking
    // it `explicit` to prevent accidental gpu_gen_t -> GPUGenerator conversions.
    GPUGenerator(dnnl::impl::gpu::intel::jit::gpu_gen_t hw);
    /// Clones the generator together with a clone of its target machine.
    std::shared_ptr<Generator> clone() const override;

    /// No GPU-specific register typing yet: always returns RegType::undefined.
    ov::snippets::RegType get_specific_op_out_reg_type(const ov::Output<ov::Node>& out) const override;

private:
    /// Used by clone(); asserts `target` is a known GPUTargetMachine specialization.
    GPUGenerator(const std::shared_ptr<ov::snippets::TargetMachine>& target);

    /// Maps the runtime HW id to the matching compile-time specialization; throws on unknown HW.
    static std::shared_ptr<ov::snippets::TargetMachine> create_target_machine(ngen::HW hw);
};

} // namespace ov::intel_gpu::jit
Loading
Loading