diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/subgraph.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/subgraph.hpp
index d4de1678709a35..c17f10f86cc91e 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/primitives/subgraph.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/subgraph.hpp
@@ -6,6 +6,8 @@
 #include "snippets/op/subgraph.hpp"
 #include "primitive.hpp"
 
+#include "ocl/ocl_engine.hpp"
+
 namespace cldnn {
 
 /// @brief Subgraph primitive
@@ -19,7 +21,8 @@ struct subgraph : public primitive_base<subgraph> {
     /// @param id This primitive id
     /// @param inputs Input primitive ids
     /// @param subgraph Original subgraph node
-    subgraph(const primitive_id& id, const std::vector<input_info>& inputs, const std::shared_ptr<ov::snippets::op::Subgraph>& subgraph)
+    subgraph(const primitive_id& id, const std::vector<input_info>& inputs,
+             const std::shared_ptr<ov::snippets::op::Subgraph>& subgraph)
         : primitive_base(id, inputs), ov_subgraph(subgraph->clone()) {}
 
     std::shared_ptr<ov::snippets::op::Subgraph> ov_subgraph;
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp
index 13c892b69814b0..1e8a0ea542034e 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp
@@ -113,6 +113,43 @@ inline std::istream& operator>>(std::istream& is, DumpTensors& val) {
     return is;
 }
 
+/**
+ * @brief Enum to define possible snippets mode hints (selects the tokenization mode for Snippets).
+ */
+enum class SnippetsMode : uint8_t {
+    ENABLE = 0,           //!<  Snippets are enabled
+    IGNORE_CALLBACK = 1,  //!<  Snippets are enabled; presumably the tokenization callback is ignored - TODO confirm
+    DISABLE = 2,          //!<  Snippets are disabled
+};
+
+inline std::ostream& operator<<(std::ostream& os, const SnippetsMode& mode) {
+    const char* name = nullptr;  // stays null for a value outside the enum
+    switch (mode) {
+    case SnippetsMode::ENABLE:          name = "ENABLE"; break;
+    case SnippetsMode::IGNORE_CALLBACK: name = "IGNORE_CALLBACK"; break;
+    case SnippetsMode::DISABLE:         name = "DISABLE"; break;
+    }
+    if (name == nullptr) {
+        OPENVINO_THROW("Unsupported snippets mode value");
+    }
+    return os << name;
+}
+
+inline std::istream& operator>>(std::istream& is, SnippetsMode& mode) {
+    std::string token;  // one whitespace-delimited word naming the mode
+    is >> token;
+    if (token == "DISABLE") {
+        mode = SnippetsMode::DISABLE;
+    } else if (token == "IGNORE_CALLBACK") {
+        mode = SnippetsMode::IGNORE_CALLBACK;
+    } else if (token == "ENABLE") {
+        mode = SnippetsMode::ENABLE;
+    } else {
+        OPENVINO_THROW("Unsupported snippets mode: ", token);
+    }
+    return is;
+}
+
 /**
  * @brief Defines queue type that must be used for model execution
  */
@@ -168,6 +205,7 @@ static constexpr Property<ShapePredictor::Settings, ov::PropertyMutability::RW>
 static constexpr Property<std::vector<std::string>, ov::PropertyMutability::RW> load_dump_raw_binary{"GPU_LOAD_DUMP_RAW_BINARY"};
 static constexpr Property<bool, ov::PropertyMutability::RW> could_use_flashattn_v2{"GPU_COULD_USE_FLASHATTN_V2"};
 static constexpr Property<uint64_t, PropertyMutability::RW> dynamic_quantization_group_size_max{"GPU_DYNAMIC_QUANTIZATION_GROUP_SIZE_MAX"};
+static constexpr Property<SnippetsMode, PropertyMutability::RW> snippets_mode{"SNIPPETS_MODE"};
 }  // namespace ov::intel_gpu
 
 namespace cldnn {
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl b/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl
index e2eeef0569a009..9cf533580bf717 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl
@@ -35,6 +35,7 @@ OV_CONFIG_RELEASE_OPTION(ov::hint, activations_scale_factor, -1.0f, "Scalar floa
 OV_CONFIG_RELEASE_OPTION(ov::internal, enable_lp_transformations, false, "Enable/Disable Low precision transformations set")
 OV_CONFIG_RELEASE_OPTION(ov::intel_gpu, config_file, "", "Path to custom layers config file")
 OV_CONFIG_RELEASE_OPTION(ov::hint, model, nullptr, "Shared pointer to the ov::Model")
+OV_CONFIG_RELEASE_OPTION(ov::intel_gpu, snippets_mode, ov::intel_gpu::SnippetsMode::DISABLE, "Define tokenization mode for Snippets.")
 
 OV_CONFIG_RELEASE_INTERNAL_OPTION(ov::intel_gpu, shape_predictor_settings, {10, 16 * 1024, 2, 1.1f}, "Preallocation settings")
 OV_CONFIG_RELEASE_INTERNAL_OPTION(ov::intel_gpu, queue_type, QueueTypes::out_of_order, "Type of the queue that must be used for model execution. May be in-order or out-of-order")
diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/CMakeLists.txt b/src/plugins/intel_gpu/src/graph/impls/jit/CMakeLists.txt
index db4ccd4c89c858..9f80ea10aafa1e 100644
--- a/src/plugins/intel_gpu/src/graph/impls/jit/CMakeLists.txt
+++ b/src/plugins/intel_gpu/src/graph/impls/jit/CMakeLists.txt
@@ -10,7 +10,7 @@ set(TARGET_NAME "openvino_intel_gpu_jit_obj")
 
 ov_gpu_add_backend_target(
     NAME ${TARGET_NAME}
-    LINK_LIBRARIES onednn_gpu_tgt
+    LINK_LIBRARIES onednn_gpu_tgt openvino::snippets
 )
 
 ov_build_target_faster(${TARGET_NAME} PCH PCH_EXCLUDE detection_output.cpp)
diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.hpp
new file mode 100644
index 00000000000000..d64bdd72062d0d
--- /dev/null
+++ b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_eltwise_emitters.hpp
@@ -0,0 +1,24 @@
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "jit_emitter.hpp"
+
+
+namespace ov::intel_gpu::jit {
+
+template <dnnl::impl::gpu::intel::jit::gpu_gen_t hw>
+class jit_add_emitter : public jit_emitter<hw> {
+public:
+    jit_add_emitter(dnnl::impl::gpu::intel::jit::ngen_code_generator_t<hw>* host,
+                    ov::element::Type exec_prc = ov::element::f32) : jit_emitter<hw>(host, exec_prc) {}
+    // Add takes two inputs of one precision: either (f32, f32) or (f16, f16); @p node is not consulted.
+    static std::set<std::vector<ov::element::Type>> get_supported_precisions(
+        [[maybe_unused]] const std::shared_ptr<ov::Node>& node) {
+        return {{element::f32, element::f32}, {element::f16, element::f16}};
+    }
+};
+
+}  // namespace ov::intel_gpu::jit
diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_emitter.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_emitter.hpp
new file mode 100644
index 00000000000000..7c3b844bbc8195
--- /dev/null
+++ b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_emitter.hpp
@@ -0,0 +1,46 @@
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "snippets/emitter.hpp"
+
+#include "graph/impls/jit/jit_generator.hpp"
+
+#include "openvino/core/type/element_type.hpp"
+#include "openvino/core/node.hpp"
+
+namespace ov::intel_gpu::jit {
+
+template <dnnl::impl::gpu::intel::jit::gpu_gen_t hw>
+class jit_emitter : public ov::snippets::Emitter {  // snippets Emitter holding an nGEN code generator for `hw`
+public:
+    jit_emitter(dnnl::impl::gpu::intel::jit::ngen_code_generator_t<hw>* host,
+                ov::element::Type exec_prc = ov::element::f32) :
+        m_h(host),
+        m_exec_prc(exec_prc) {}
+
+    /**
+     * @brief Returns supported precisions (this base implementation ignores @p node and returns an empty set).
+     * Precisions are ordered, the first bigger bitness precision with the same type will be selected.
+     * Empty collection means the emitter supports any input precisions.
+     */
+    static std::set<std::vector<ov::element::Type>> get_supported_precisions(
+        const std::shared_ptr<ov::Node>& node = nullptr) {
+        return {};
+    }
+
+protected:
+    void emit_code_impl(const std::vector<size_t>& in,
+                        const std::vector<size_t>& out,
+                        const std::vector<size_t>& pool,
+                        const std::vector<size_t>& gpr) const override {
+        OPENVINO_THROW("Unimplemented");  // no code emission is implemented at this level; all arguments are ignored
+    }
+
+    dnnl::impl::gpu::intel::jit::ngen_code_generator_t<hw>* m_h;  // code generator given at construction; never deleted here
+    ov::element::Type m_exec_prc;  // execution precision given at construction (f32 by default)
+};
+
+}  // namespace ov::intel_gpu::jit
diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_snippets_emitters.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_snippets_emitters.hpp
new file mode 100644
index 00000000000000..2d2b6f4383579a
--- /dev/null
+++ b/src/plugins/intel_gpu/src/graph/impls/jit/emitters/jit_snippets_emitters.hpp
@@ -0,0 +1,24 @@
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "jit_emitter.hpp"
+
+
+namespace ov::intel_gpu::jit {
+
+template <dnnl::impl::gpu::intel::jit::gpu_gen_t hw>
+class jit_nop_emitter : public jit_emitter<hw> {
+public:
+    jit_nop_emitter(dnnl::impl::gpu::intel::jit::ngen_code_generator_t<hw>* host,
+                    ov::element::Type exec_prc = ov::element::f32) : jit_emitter<hw>(host, exec_prc) {}
+    // No precision restrictions are declared: the returned set is empty and @p node is not consulted.
+    static std::set<std::vector<ov::element::Type>> get_supported_precisions(
+        [[maybe_unused]] const std::shared_ptr<ov::Node>& node) {
+        return {};
+    }
+};
+
+}  // namespace ov::intel_gpu::jit
diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp
new file mode 100644
index 00000000000000..e805881e5ceed5
--- /dev/null
+++ b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.cpp
@@ -0,0 +1,135 @@
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "gpu_generator.hpp"
+
+#include <typeinfo>
+
+#include "snippets/runtime_configurator.hpp"
+#include "emitters/jit_eltwise_emitters.hpp"
+#include "emitters/jit_snippets_emitters.hpp"
+#include "openvino/op/add.hpp"
+
+using namespace dnnl::impl::gpu::intel::jit;
+
+namespace ov::intel_gpu::jit {
+// Builds a jitters entry for e_type<hw>: {factory creating the emitter on m_h (extra args forwarded), supported-precisions query}.
+#define CREATE_SNIPPETS_EMITTER(e_type, ...)                                                          \
+        {[this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr<snippets::Emitter> { \
+             return std::make_shared<e_type<hw>>(m_h.get(), ##__VA_ARGS__);                           \
+         },                                                                                           \
+         [](const std::shared_ptr<ov::Node>& n) -> std::set<std::vector<element::Type>> {             \
+             return e_type<hw>::get_supported_precisions(n);                                          \
+         }}
+
+template <ngen::HW hw>
+GPUTargetMachine<hw>::GPUTargetMachine()
+    : TargetMachine(std::make_shared<ov::snippets::RuntimeConfigurator>(std::make_shared<ov::snippets::RuntimeConfig>())),
+      m_h(std::make_unique<jit_snippet_t<hw>>()) {  // code generator passed to every emitter created through jitters
+    jitters[op::v0::Parameter::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter);
+    jitters[op::v0::Result::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter);
+    jitters[op::v1::Add::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_add_emitter);
+}
+
+template <ngen::HW hw>
+std::shared_ptr<snippets::TargetMachine> GPUTargetMachine<hw>::clone() const {
+    auto cloned = std::make_shared<GPUTargetMachine<hw>>();  // not const: lets the converting return move instead of copy
+    cloned->configurator = std::make_shared<ov::snippets::RuntimeConfigurator>(*configurator);  // own copy, not shared
+    return cloned;
+}
+
+template <ngen::HW hw>
+size_t GPUTargetMachine<hw>::get_lanes() const {
+    OPENVINO_ASSERT(m_h != nullptr, "GPUTargetMachine: code generator is not initialized");  // also checked in release builds
+    return m_h->getSIMD();  // lanes = SIMD width reported by the code generator
+}
+
+template <ngen::HW hw>
+std::vector<snippets::Reg> GPUTargetMachine<hw>::get_abi_arg_regs() const {
+    OPENVINO_THROW("Unimplemented!");  // no ABI argument registers are provided for this target yet
+    return {};  // not reached once the line above throws
+}
+
+template <ngen::HW hw>
+std::vector<snippets::Reg> GPUTargetMachine<hw>::get_gp_reg_pool() const {
+    OPENVINO_THROW("Unimplemented!");  // no general-purpose register pool is provided for this target yet
+    return {};  // not reached once the line above throws
+}
+
+template <ngen::HW hw>
+std::vector<snippets::Reg> GPUTargetMachine<hw>::get_vec_reg_pool() const {
+    OPENVINO_THROW("Unimplemented!");  // no vector register pool is provided for this target yet
+    return {};  // not reached once the line above throws
+}
+
+template <ngen::HW hw>
+ngen::HW GPUTargetMachine<hw>::get_hw() const {
+    return hw;  // the hardware generation is the template argument itself
+}
+
+template <ngen::HW hw>
+snippets::CompiledSnippetPtr GPUTargetMachine<hw>::get_snippet() {
+    // Not implemented yet. The commented-out draft below still uses dnnl::impl::cpu::x64 types and the names
+    // `h` / `jit_snippet`, so it has to be adapted to m_h / jit_snippet_t<hw> before it can be enabled:
+    // OPENVINO_ASSERT(h->create_kernel() == dnnl::impl::status::success, "Failed to create jit_kernel in get_snippet()");
+    // const auto& result = std::make_shared<CompiledSnippetGPU>(std::unique_ptr<dnnl::impl::cpu::x64::jit_generator_t>(h.release()));
+    // h = std::make_unique<jit_snippet>();  // reset the generated code, since it was copied into CompiledSnippetGPU
+    // return result;
+    OPENVINO_THROW("Unimplemented!");
+    return nullptr;
+}
+
+const uint8_t* CompiledSnippetGPU::get_code() const {
+    // Not implemented yet (draft of the intended body: return h_compiled->jit_ker();)
+    OPENVINO_THROW("Unimplemented!");
+    return nullptr;  // not reached once the line above throws
+}
+
+size_t CompiledSnippetGPU::get_code_size() const {
+    OPENVINO_THROW("Unimplemented!");  // not implemented yet: there is no compiled code whose size could be reported
+}
+
+bool CompiledSnippetGPU::empty() const {
+    return get_code_size() == 0;  // NOTE(review): get_code_size() currently always throws, so this call throws as well
+}
+
+GPUGenerator::GPUGenerator(ngen::HW hw)
+    : Generator(create_target_machine(hw)) {}  // the target machine is selected from the runtime HW value
+
+GPUGenerator::GPUGenerator(const std::shared_ptr<ov::snippets::TargetMachine>& target)
+    : Generator(target) {
+    // Null is rejected first so that typeid(*target) below never dereferences a null pointer.
+    OPENVINO_ASSERT(target != nullptr, "GPUGenerator requires a non-null target machine");
+    const auto& type = typeid(*target);
+    OPENVINO_ASSERT(type == typeid(GPUTargetMachine<ngen::HW::Gen9>) || type == typeid(GPUTargetMachine<ngen::HW::Gen11>) ||
+                    type == typeid(GPUTargetMachine<ngen::HW::Gen12LP>) || type == typeid(GPUTargetMachine<ngen::HW::XeHP>) ||
+                    type == typeid(GPUTargetMachine<ngen::HW::XeHPG>) || type == typeid(GPUTargetMachine<ngen::HW::XeHPC>) ||
+                    type == typeid(GPUTargetMachine<ngen::HW::Xe2>) || type == typeid(GPUTargetMachine<ngen::HW::Xe3>),
+                    "GPUGenerator supports only GPUTargetMachine targets");
+}
+
+std::shared_ptr<snippets::Generator> GPUGenerator::clone() const {
+    return std::shared_ptr<GPUGenerator>(new GPUGenerator(target->clone()));  // wraps a clone of the current target machine
+}
+
+ov::snippets::RegType GPUGenerator::get_specific_op_out_reg_type(const ov::Output<ov::Node>& out) const {
+    return ov::snippets::RegType::undefined;  // `out` is ignored: every output is reported as undefined
+}
+
+// Instantiates the GPUTargetMachine specialization matching a runtime HW value; throws for any other value.
+std::shared_ptr<ov::snippets::TargetMachine> GPUGenerator::create_target_machine(ngen::HW hw) {
+    switch (hw) {
+    case ngen::HW::Gen9:    return std::make_shared<GPUTargetMachine<ngen::HW::Gen9>>();
+    case ngen::HW::Gen11:   return std::make_shared<GPUTargetMachine<ngen::HW::Gen11>>();
+    case ngen::HW::Gen12LP: return std::make_shared<GPUTargetMachine<ngen::HW::Gen12LP>>();
+    case ngen::HW::XeHP:    return std::make_shared<GPUTargetMachine<ngen::HW::XeHP>>();
+    case ngen::HW::XeHPG:   return std::make_shared<GPUTargetMachine<ngen::HW::XeHPG>>();
+    case ngen::HW::XeHPC:   return std::make_shared<GPUTargetMachine<ngen::HW::XeHPC>>();
+    case ngen::HW::Xe2:     return std::make_shared<GPUTargetMachine<ngen::HW::Xe2>>();
+    case ngen::HW::Xe3:     return std::make_shared<GPUTargetMachine<ngen::HW::Xe3>>();
+    default:                OPENVINO_THROW("Unknown GPU hardware!");
+    }
+}
+
+}  // namespace ov::intel_gpu::jit
diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp
new file mode 100644
index 00000000000000..3e245c175781b4
--- /dev/null
+++ b/src/plugins/intel_gpu/src/graph/impls/jit/gpu_generator.hpp
@@ -0,0 +1,65 @@
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "jit_generator.hpp"
+#include "gpu/intel/jit/generator.hpp"
+#include "openvino/core/node.hpp"
+#include "openvino/core/node_output.hpp"
+#include "snippets/emitter.hpp"
+#include "snippets/generator.hpp"
+#include "snippets/target_machine.hpp"
+
+namespace ov::intel_gpu::jit {
+
+class CompiledSnippetGPU : public snippets::CompiledSnippet {  // placeholder: declares no data members yet
+public:
+    [[nodiscard]] const uint8_t* get_code() const override;  // currently throws "Unimplemented!"
+    [[nodiscard]] size_t get_code_size() const override;     // currently throws "Unimplemented!"
+    [[nodiscard]] bool empty() const override;               // defined as get_code_size() == 0
+    explicit CompiledSnippetGPU() = default;
+};
+
+template <ngen::HW hw>
+class GPUTargetMachine : public ov::snippets::TargetMachine {
+public:
+    explicit GPUTargetMachine();
+
+    [[nodiscard]] bool is_supported() const override { return true; }
+    [[nodiscard]] std::shared_ptr<snippets::TargetMachine> clone() const override;
+    // SIMD width reported by the owned code generator.
+    [[nodiscard]] size_t get_lanes() const override;
+    // Register queries below are not implemented yet: each of them throws.
+    [[nodiscard]] std::vector<snippets::Reg> get_abi_arg_regs() const override;
+    [[nodiscard]] std::vector<snippets::Reg> get_gp_reg_pool() const override;
+    [[nodiscard]] std::vector<snippets::Reg> get_vec_reg_pool() const override;
+    // Hardware generation this class template was instantiated with.
+    [[nodiscard]] dnnl::impl::gpu::intel::jit::gpu_gen_t get_hw() const;
+    // Not implemented yet: throws.
+    snippets::CompiledSnippetPtr get_snippet() override;
+
+private:
+    std::unique_ptr<jit_snippet_t<hw>> m_h;  // code generator handed to every emitter this machine creates
+};
+
+class GPUGenerator : public ov::snippets::Generator {
+public:
+    /// Creates a generator whose target machine matches @p hw; explicit to rule out implicit HW -> generator conversions.
+    explicit GPUGenerator(dnnl::impl::gpu::intel::jit::gpu_gen_t hw);
+    std::shared_ptr<Generator> clone() const override;
+    ov::snippets::RegType get_specific_op_out_reg_type(const ov::Output<ov::Node>& out) const override;
+
+private:
+    /// Wraps an already constructed target machine (used by clone()).
+    explicit GPUGenerator(const std::shared_ptr<ov::snippets::TargetMachine>& target);
+    static std::shared_ptr<ov::snippets::TargetMachine> create_target_machine(ngen::HW hw);
+};
+
+}  // namespace ov::intel_gpu::jit
diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/jit_generator.hpp b/src/plugins/intel_gpu/src/graph/impls/jit/jit_generator.hpp
new file mode 100644
index 00000000000000..1575a0c468c566
--- /dev/null
+++ b/src/plugins/intel_gpu/src/graph/impls/jit/jit_generator.hpp
@@ -0,0 +1,24 @@
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "gpu/intel/jit/generator.hpp"
+
+
+namespace ov::intel_gpu::jit {
+
+template <ngen::HW hw>
+class jit_snippet_t : public ngen::OpenCLCodeGenerator<hw> {  // thin wrapper fixing the base constructor arguments
+public:
+    jit_snippet_t()
+        : ngen::OpenCLCodeGenerator<hw>(0, {GENERATOR_NAME, GENERATOR_LINE, false}) {}
+};
+
+}  // namespace ov::intel_gpu::jit
diff --git a/src/plugins/intel_gpu/src/graph/impls/jit/subgraph.cpp b/src/plugins/intel_gpu/src/graph/impls/jit/subgraph.cpp
index f20749c255c5be..1d81d445a98653 100644
--- a/src/plugins/intel_gpu/src/graph/impls/jit/subgraph.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/jit/subgraph.cpp
@@ -12,132 +12,60 @@
 #include "runtime/ocl/ocl_engine.hpp"
 
 #include "intel_gpu/graph/serialization/binary_buffer.hpp"
+#include "plugin/transformations/snippets/lowered/set_single_kernel_work_amount.hpp"
+
+#include "snippets/lowered/pass/optimize_domain.hpp"
+#include "snippets/utils/utils.hpp"
+#include "gpu_generator.hpp"
 
 #include <vector>
 namespace ov::intel_gpu::jit {
 
 using namespace dnnl::impl::gpu::intel::jit;
 using namespace ngen;
+class SubgraphImpl : public primitive_impl {
+    using primitive_impl::primitive_impl;
 
-template <HW hw>
-class VectorScaleKernelGenerator : public OpenCLCodeGenerator<hw>
-{
-protected:
-    NGEN_FORWARD_OPENCL(hw);
+    using DataFlowPasses = std::vector<ov::snippets::pass::Manager::PositionedPassBase>;
+    using ControlFlowPasses = std::vector<ov::snippets::lowered::pass::PassPipeline::PositionedPassLowered>;
+
+    std::shared_ptr<ov::snippets::op::Subgraph> m_subgraph {nullptr};
 
 public:
-    VectorScaleKernelGenerator() : OpenCLCodeGenerator<hw>()
-    {
-        // Define kernel interface for OpenCL.
-        newArgument("buffer", ExternalArgumentType::GlobalPtr);
-        newArgument("alpha", DataType::f);
-        requireLocalID(1);
-        requireLocalSize();
-        requireSIMD((GRF::bytes(hw) == 64) ? 16 : 8);
-        externalName("vector_scale");
-
-        finalizeInterface();
-
-        auto bufferSurface = Surface(getArgumentSurfaceIfExists("buffer"));     // Surface # for buffer.
-        auto bufferPtr = getArgument("buffer");                                 // A64 pointer for buffer.
-        auto alpha = getArgument("alpha");
-
-        auto localSize = getLocalSize(0).uw();
-        auto localID = getLocalID(0);               // Vector of local IDs.
-        auto groupID = r0.ud(1);                    // Thread group (a.k.a. workgroup) IDs are in r0.ud(1) (X) r0.ud(6) (Y) r0.ud(7) (Z)
-
-        // Local variables.
-        auto globalID = r12.ud(0);
-        auto header = r13;
-        auto data = r14;
-        auto temp = r15;
-
-        // Decide on load/store messages.
-        bool useLSC = (hw >= HW::XeHPC);
-
-        // All instructions use W (NoMask) by default.
-        setDefaultNoMask();
-
-        // Enable automatic SWSB for Gen12.
-        setDefaultAutoSWSB();
-
-        // Prologue for ATS+.
-        prologue();
-
-        // Enable IEEE denormals.
-        or_(1 | Switch, cr0[0], cr0[0], 0x4C0);
-
-        // Calculate global ID = (group ID) * (local size) + (local ID for lane 0).
-        mul(1, globalID, groupID, localSize);
-        add(1, globalID, globalID, localID[0]);
-
-        // Do 32 byte (2 OWord) block read at offset (global ID) * sizeof(float).
-        if (!useLSC) {
-            shr<uint32_t>(1, header[2], globalID, 2);
-            load(8, data, block_oword(2), bufferSurface, header);
-        } else {
-            shl(1, globalID, globalID, 2);
-            addc(1, header.ud(0), bufferPtr.ud(0), globalID);
-            mov(1, temp.ud(0), acc0.ud(0));
-            add(1, header.ud(1), bufferPtr.ud(1), temp.ud(0));
-            load(1, data, D32 | V8T, A64, header);
+    explicit SubgraphImpl(const program_node& node, const kernel_impl_params& impl_params)
+        : primitive_impl("jit::subgraph"), m_subgraph(node.as<subgraph>().get_primitive()->ov_subgraph->clone())  {
+            m_subgraph->set_generator(
+                std::make_shared<ov::intel_gpu::jit::GPUGenerator>(ngenHW2pluginHW(impl_params.get_device_info().arch)));
+            // Data-flow transformations, driven by the planar input shapes and the input/output precisions.
+            const auto in_blocked_shapes = getSnippetsBlockedShapes(impl_params);
+            const auto precisions = getIOPrecisions(impl_params);
+            m_subgraph->data_flow_transformations(in_blocked_shapes, precisions.first, precisions.second);
+            // Control-flow setup: the generic OptimizeDomain pass is disabled and the tile rank is set to 1.
+            const auto control_flow_config = std::make_shared<ov::snippets::lowered::pass::PassConfig>();
+            control_flow_config->disable<ov::snippets::lowered::pass::OptimizeDomain>();
+            m_subgraph->set_tile_rank(1UL);
+            // The backend passes returned by getControlFlowPasses() are supplied to the control-flow pipeline.
+            m_subgraph->control_flow_transformations(0,   // unused
+                                                     256, // unused
+                                                     std::make_shared<ov::snippets::IShapeInferSnippetsFactory>(),
+                                                     control_flow_config,
+                                                     getControlFlowPasses());
         }
+    
+    // Returns the GPU-specific lowered passes that the constructor hands to control_flow_transformations().
+    ControlFlowPasses getControlFlowPasses() const {
+        using PassPosition = ov::snippets::pass::PassPosition;
+        using Place = PassPosition::Place;
 
-        // Scale data.
-        mul<float>(8, data, data, alpha);
+        ControlFlowPasses backend_passes;
+        // The only backend pass so far: SetSingleKernelWorkAmount, positioned at the start of the pipeline
+        // (Place::PipelineStart).
+        backend_passes.emplace_back(PassPosition(Place::PipelineStart),
+                                    std::make_shared<ov::intel_gpu::pass::SetSingleKernelWorkAmount>());
 
-        // Store updated data.
-        if (!useLSC)
-            store(8, block_oword(2), bufferSurface, header, data);
-        else
-            store(1, D32 | V8T, A64, header, data);
 
-        // End thread. Must move r0 to one of r112-r127, then call threadend.
-        mov<uint32_t>(8, r127, r0);
-        threadend(r127);
+        return backend_passes;
     }
-};
-
-
-class SubgraphImpl : public primitive_impl {
-    using primitive_impl::primitive_impl;
-
-public:
-    explicit SubgraphImpl(const program_node& /*node*/, const kernel_impl_params& impl_params)
-        : primitive_impl("jit::subgraph") {
-            const auto& engine = downcast<ocl::ocl_engine>(impl_params.get_program().get_engine());
-
-            HW hw = VectorScaleKernelGenerator<HW::Unknown>::detectHW(engine.get_cl_context().get(), engine.get_cl_device().get());
-            const char *gpuString = "unknown";
-
-            switch (hw) {
-                case HW::Gen9:    gpuString = "Gen9"; break;
-                case HW::Gen11:   gpuString = "Gen11"; break;
-                case HW::Gen12LP: gpuString = "Gen12LP"; break;
-                case HW::XeHP:    gpuString = "XeHP"; break;
-                case HW::XeHPG:   gpuString = "XeHPG"; break;
-                case HW::XeHPC:   gpuString = "XeHPC"; break;
-                case HW::Xe2:     gpuString = "Xe2"; break;
-                case HW::Xe3:     gpuString = "Xe3"; break;
-                default:          OPENVINO_THROW("[GPU] Unsupported architecture");
-            }
-
-            std::cout << "GPU arch: " << gpuString << "\n";
-
-            // Create appropriate kernel generator object for the detected HW, and get a cl_kernel.
-            // switch (hw) {
-            //     case HW::Gen9:    VectorScaleKernelGenerator<HW::Gen9>().getKernel(engine.get_cl_context().get(), engine.get_cl_device().get());
-            //     case HW::Gen11:   VectorScaleKernelGenerator<HW::Gen11>().getKernel(engine.get_cl_context().get(), engine.get_cl_device().get());
-            //     case HW::Gen12LP: VectorScaleKernelGenerator<HW::Gen12LP>().getKernel(engine.get_cl_context().get(), engine.get_cl_device().get());
-            //     case HW::XeHP:    VectorScaleKernelGenerator<HW::XeHP>().getKernel(engine.get_cl_context().get(), engine.get_cl_device().get());
-            //     case HW::XeHPG:   VectorScaleKernelGenerator<HW::XeHPG>().getKernel(engine.get_cl_context().get(), engine.get_cl_device().get());
-            //     case HW::XeHPC:   VectorScaleKernelGenerator<HW::XeHPC>().getKernel(engine.get_cl_context().get(), engine.get_cl_device().get());
-            //     case HW::Xe2:     VectorScaleKernelGenerator<HW::Xe2>().getKernel(engine.get_cl_context().get(), engine.get_cl_device().get());
-            //     case HW::Xe3:     VectorScaleKernelGenerator<HW::Xe3>().getKernel(engine.get_cl_context().get(), engine.get_cl_device().get());
-            //     default:          OPENVINO_THROW("[GPU] Unsupported architecture");;
-            // }
-
-        }
 
     SubgraphImpl() : primitive_impl() {}
 
@@ -159,6 +87,47 @@ class SubgraphImpl : public primitive_impl {
     }
 
     void update(primitive_inst& inst, const kernel_impl_params& impl_param) override { }
+
+private:
+    // Maps the plugin-side gpu_arch onto the matching ngen::HW value (note: the name reads the other way round).
+    static ngen::HW ngenHW2pluginHW(gpu_arch arch) {
+        switch (arch) {
+        case gpu_arch::gen9:    return ngen::HW::Gen9;
+        case gpu_arch::gen11:   return ngen::HW::Gen11;
+        case gpu_arch::xe_lp:   return ngen::HW::XeLP;
+        case gpu_arch::xe_hp:   return ngen::HW::XeHP;
+        case gpu_arch::xe_hpg:  return ngen::HW::XeHPG;
+        case gpu_arch::xe_hpc:  return ngen::HW::XeHPC;
+        case gpu_arch::xe2:     return ngen::HW::Xe2;
+        case gpu_arch::xe3:     return ngen::HW::Xe3;
+        case gpu_arch::unknown: return ngen::HW::Unknown;
+        default:                OPENVINO_THROW("Unexpected arch");
+        }
+    }
+
+    static ov::snippets::op::Subgraph::BlockedShapeVector getSnippetsBlockedShapes(const kernel_impl_params& impl_params) {
+        // One {dims, planar layout} entry per input layout, in input order - only planar shapes are supported.
+        ov::snippets::op::Subgraph::BlockedShapeVector blocked_shapes;
+        blocked_shapes.reserve(impl_params.input_layouts.size());
+        for (const auto& input : impl_params.input_layouts) {
+            const auto dims = ov::snippets::utils::pshape_to_vdims(input.get_partial_shape());
+            blocked_shapes.push_back({dims, ov::snippets::utils::get_planar_layout(dims.size())});
+        }
+        return blocked_shapes;
+    }
+
+    static std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> getIOPrecisions(const kernel_impl_params& impl_params) {
+        std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>> prc;  // first: inputs, second: outputs
+        prc.first.reserve(impl_params.input_layouts.size());
+        prc.second.reserve(impl_params.output_layouts.size());
+        for (const auto& in : impl_params.input_layouts) {
+            prc.first.push_back(in.data_type);  // data type of each input layout, in layout order
+        }
+        for (const auto& out : impl_params.output_layouts) {
+            prc.second.push_back(out.data_type);  // data type of each output layout, in layout order
+        }
+        return prc;
+    }
 };
 
 std::unique_ptr<primitive_impl> Subgraph::create_impl(const program_node& node, const RuntimeParams& params) const {
diff --git a/src/plugins/intel_gpu/src/plugin/transformations/snippets/lowered/set_single_kernel_work_amount.cpp b/src/plugins/intel_gpu/src/plugin/transformations/snippets/lowered/set_single_kernel_work_amount.cpp
new file mode 100644
index 00000000000000..0ca914a8c9fa39
--- /dev/null
+++ b/src/plugins/intel_gpu/src/plugin/transformations/snippets/lowered/set_single_kernel_work_amount.cpp
@@ -0,0 +1,90 @@
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "set_single_kernel_work_amount.hpp"
+
+#include <algorithm>
+#include <cstddef>
+#include <functional>
+#include <numeric>
+#include <vector>
+
+#include "openvino/core/except.hpp"
+#include "openvino/core/type.hpp"
+#include "snippets/lowered/expression.hpp"
+#include "snippets/lowered/linear_ir.hpp"
+#include "snippets/lowered/pass/pass.hpp"
+#include "snippets/op/rank_normalization.hpp"
+#include "snippets/shape_types.hpp"
+#include "snippets/utils/utils.hpp"
+
+namespace ov::intel_gpu::pass {
+
+bool SetSingleKernelWorkAmount::run(snippets::lowered::LinearIR& linear_ir) {
+    // GPU Plugin requires 1D tile. Note that the loop depth is set even when the pass bails out below.
+    linear_ir.set_loop_depth(1);
+
+    if (linear_ir.empty()) {
+        return false;
+    }
+    if (!linear_ir.get_config().m_enable_domain_optimization) {
+        // Unsupported
+        return false;
+    }
+    if (linear_ir.is_dynamic()) {
+        // [134873] In dynamic case we need to implement own shape inference in runtime configurator
+        return false;
+    }
+
+    const auto master_shape = linear_ir.get_master_shape();
+    if (master_shape.empty() || master_shape.back() == 1) {
+        // Nothing to collapse: the shape is empty (back() would be undefined) or already has a single work amount
+        return false;
+    }
+
+    // Folds every dimension into the second-to-last one: [d0, ..., dN-2, dN-1] -> [1, ..., d0 * ... * dN-1, 1]
+    auto CollapseDims = [](ov::snippets::VectorDims& dims) {
+        OPENVINO_ASSERT(dims.size() >= 2, "CollapseDims can't process shape with less than two dims");
+        const auto full_wa_idx = dims.size() - 2;
+        dims[full_wa_idx] *= dims[dims.size() - 1];
+        dims[dims.size() - 1] = 1;
+        for (size_t i = 0; i < full_wa_idx; i++) {
+            dims[full_wa_idx] *= dims[i];
+            dims[i] = 1;
+        }
+    };
+
+    const auto& params = linear_ir.get_parameters();
+    std::vector<ov::snippets::VectorDims> input_shapes;
+    input_shapes.reserve(params.size());
+    for (const auto& param : params) {
+        const auto& desc = param->get_output_port_descriptor(0);
+        OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(desc->get_layout()),
+                        "SetSingleKernelWorkAmount supports only planar layout on inputs");
+        auto shape = desc->get_shape();
+        OPENVINO_ASSERT(std::none_of(shape.begin(),
+                                     shape.end(),
+                                     [](size_t d) {
+                                         return ov::snippets::utils::is_dynamic_value(d);
+                                     }),
+                        "SetSingleKernelWorkAmount pass does not support dynamic shapes");
+        // Every input must have exactly the shape of the first input
+        OPENVINO_ASSERT(shape == params.front()->get_output_port_descriptor(0)->get_shape(),
+                        "SetSingleKernelWorkAmount pass supports only similar shapes on input");
+        CollapseDims(shape);
+        input_shapes.emplace_back(shape);
+    }
+
+    std::vector<ov::snippets::VectorDimsRef> infer_shapes;
+    infer_shapes.reserve(input_shapes.size());
+    for (const auto& is : input_shapes) {
+        infer_shapes.emplace_back(is);
+    }
+    // Need to propagate updated shapes through LIR
+    linear_ir.shape_infer(infer_shapes);
+
+    return true;
+}
+
+}  // namespace ov::intel_gpu::pass
diff --git a/src/plugins/intel_gpu/src/plugin/transformations/snippets/lowered/set_single_kernel_work_amount.hpp b/src/plugins/intel_gpu/src/plugin/transformations/snippets/lowered/set_single_kernel_work_amount.hpp
new file mode 100644
index 00000000000000..2ae80a9db2971a
--- /dev/null
+++ b/src/plugins/intel_gpu/src/plugin/transformations/snippets/lowered/set_single_kernel_work_amount.hpp
@@ -0,0 +1,29 @@
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+
+#include "snippets/lowered/linear_ir.hpp"
+#include "snippets/lowered/pass/pass.hpp"
+#include "snippets/shape_types.hpp"
+
+namespace ov::intel_gpu::pass {
+
+/**
+ * @interface SetSingleKernelWorkAmount
+ * @brief Collapses the (planar, static, identical) shapes of all LinearIR parameters and re-runs shape inference to propagate them.
+ * @ingroup snippets
+ */
+
+class SetSingleKernelWorkAmount : public ov::snippets::lowered::pass::Pass {
+public:
+    OPENVINO_RTTI("SetSingleKernelWorkAmount", "", Pass)
+    explicit SetSingleKernelWorkAmount() = default;
+    bool run(ov::snippets::lowered::LinearIR& linear_ir) override;  // asserts on non-planar, dynamic or differing input shapes
+};
+
+}  // namespace ov::intel_gpu::pass
diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
index 1bedda8fccfd16..670cba9dd0511b 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -1296,8 +1296,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
     }
 
     // TODO: Move to the right place
-    {
-        //ov::serialize(func, "pre_snippets.xml");
+    if (config.get_snippets_mode() != ov::intel_gpu::SnippetsMode::DISABLE) {
         ov::pass::Manager manager("GPU:Snippets");
         manager.set_per_pass_validation(false);
 
@@ -1333,7 +1332,6 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
             });
 
         manager.run_passes(func);
-        //ov::serialize(func, "post_snippets.xml");
     }
 }
 }  // namespace ov::intel_gpu
diff --git a/src/plugins/intel_gpu/tests/functional/CMakeLists.txt b/src/plugins/intel_gpu/tests/functional/CMakeLists.txt
index c89083a026aed6..33d2c64fc34ba6 100644
--- a/src/plugins/intel_gpu/tests/functional/CMakeLists.txt
+++ b/src/plugins/intel_gpu/tests/functional/CMakeLists.txt
@@ -34,6 +34,8 @@ ov_add_test_target(
             funcSharedTests
             OpenCL::NewHeaders # should come before OpenCL::OpenCL
             OpenCL::OpenCL
+            openvino::snippets
+            ov_snippets_models
         ADD_CPPLINT
         LABELS
             OV GPU
diff --git a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/snippets/add.cpp b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/snippets/add.cpp
new file mode 100644
index 00000000000000..2f0d791770a27c
--- /dev/null
+++ b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/snippets/add.cpp
@@ -0,0 +1,33 @@
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "snippets/add.hpp"
+#include "common_test_utils/test_constants.hpp"
+
+namespace ov {
+namespace test {
+namespace snippets {
+
+
+namespace {
+// ===================================Add=========================================================//
+// These inputs are needed to test static Loop optimizations (emit the whole tile, body with increments, set WA etc)
+std::vector<ov::test::InputShape> inShapesStatic1{{{}, {{128, 256, 512}}}};
+std::vector<ov::test::InputShape> inShapesStatic2{{{}, {{128, 256, 512}}}};  // same dims as inShapesStatic1: SetSingleKernelWorkAmount asserts equal input shapes
+
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, Add,
+                         ::testing::Combine(
+                             ::testing::ValuesIn(inShapesStatic1),
+                             ::testing::ValuesIn(inShapesStatic2),
+                             ::testing::ValuesIn({ov::element::f16}),  // only f16 is instantiated here
+                             ::testing::Values(1), // Add
+                             ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts
+                             ::testing::Values(ov::test::utils::DEVICE_GPU)),
+                         Add::getTestCaseName);
+
+
+} // namespace
+} // namespace snippets
+} // namespace test
+} // namespace ov
\ No newline at end of file