From 2b3116e01f7a3787ef191bb14562128409653c8a Mon Sep 17 00:00:00 2001 From: duqimeng <1640472053@qq.com> Date: Tue, 3 Mar 2026 15:45:52 +0800 Subject: [PATCH 1/8] [Metax] Fix eigen cmake bug --- backends/metax_gpu/patch/paddle.patch | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 0d06e6aad2..f7c6319e2e 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -1,15 +1,19 @@ diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake -index cfada544d4..a690e97d74 100644 +index 6790d961f8..37b841bfc4 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake -@@ -42,12 +42,12 @@ endif() +@@ -42,14 +42,14 @@ endif() file(TO_NATIVE_PATH "${PADDLE_SOURCE_DIR}/patches/eigen/TensorRandom.h.patch" tensor_random_header) # See: [Why calling some `git` commands before `patch`?] --set(EIGEN_PATCH_COMMAND git checkout -- . && git checkout ${EIGEN_TAG} && git -- apply ${tensor_random_header}) -+# set(EIGEN_PATCH_COMMAND git checkout -- . && git checkout ${EIGEN_TAG} && git -+# apply ${tensor_random_header}) +-set(EIGEN_PATCH_COMMAND +- git checkout -- Eigen/src/Core/arch/SSE/Complex.h +- unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h && git checkout +- ${EIGEN_TAG} && git apply ${tensor_random_header}) ++# set(EIGEN_PATCH_COMMAND ++# git checkout -- Eigen/src/Core/arch/SSE/Complex.h ++# unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h && git checkout ++# ${EIGEN_TAG} && git apply ${tensor_random_header}) if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Complex.h.patch complex_header) From 17b9715d295aebf1954ec8c144166df9589839ba Mon Sep 17 00:00:00 2001 From: duqimeng <1640472053@qq.com> Date: Tue, 10 Mar 2026 15:26:08 +0800 Subject: [PATCH 2/8] [Metax] Fix allow_tf32_cublas --- backends/metax_gpu/patch/paddle.patch | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index f7c6319e2e..b0cf1b12d4 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -39,18 +39,19 @@ index 8d445b39ae..504e7b6293 100755 op_library(fused_gemm_epilogue_op) endif() diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc -index 37ee00b591..f497472ad9 100644 +index 99d3733da0..d0c8783afd 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc -@@ -18,7 +18,6 @@ limitations under the License. */ - #include +@@ -19,7 +19,7 @@ limitations under the License. 
*/ + #include #include #include -#include "paddle/phi/backends/dynload/cupti_lib_path.h" ++// #include "paddle/phi/backends/dynload/cupti_lib_path.h" #include "paddle/phi/common/port.h" #include "paddle/phi/core/enforce.h" -@@ -112,6 +111,10 @@ COMMON_DECLARE_string(magma_dir); +@@ -113,6 +113,10 @@ COMMON_DECLARE_string(magma_dir); #define SPARSELT_LIB_NAME "libcusparseLt.so" #endif @@ -1100,3 +1101,16 @@ index d8bc15926b..6071baf340 100644 PADDLE_ENFORCE_EQ( status, +diff --git a/paddle/phi/backends/context_pool.cc b/paddle/phi/backends/context_pool.cc +index 0495379383..9cbde17363 100644 +--- a/paddle/phi/backends/context_pool.cc ++++ b/paddle/phi/backends/context_pool.cc +@@ -26,7 +26,7 @@ namespace phi { + + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_CUSTOM_DEVICE) +-bool allow_tf32_cublas = false; ++bool allow_tf32_cublas = true; + void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; } + bool AllowTF32Cublas() { return allow_tf32_cublas; } + bool allow_tf32_cudnn = true; From dadf81a17c9d7ca63def2a88273750c1f688c629 Mon Sep 17 00:00:00 2001 From: duqimeng <1640472053@qq.com> Date: Mon, 16 Mar 2026 14:28:45 +0800 Subject: [PATCH 3/8] fix patch --- backends/metax_gpu/patch/paddle.patch | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index b0cf1b12d4..c6f5c331c9 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -1101,16 +1101,4 @@ index d8bc15926b..6071baf340 100644 PADDLE_ENFORCE_EQ( status, -diff --git a/paddle/phi/backends/context_pool.cc b/paddle/phi/backends/context_pool.cc -index 0495379383..9cbde17363 100644 ---- a/paddle/phi/backends/context_pool.cc -+++ b/paddle/phi/backends/context_pool.cc -@@ -26,7 +26,7 @@ namespace phi { - - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_CUSTOM_DEVICE) --bool allow_tf32_cublas = false; -+bool allow_tf32_cublas = true; - void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; } - bool AllowTF32Cublas() { return allow_tf32_cublas; } - bool allow_tf32_cudnn = true; + From 2090de800c326ade59bc6eb0b621a57ef3755b10 Mon Sep 17 00:00:00 2001 From: duqimeng <1640472053@qq.com> Date: Mon, 16 Mar 2026 17:39:10 +0800 Subject: [PATCH 4/8] [Metax] Fix error rnn --- backends/metax_gpu/CMakeLists.txt | 4 ++-- backends/metax_gpu/tests/ignore.txt | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index d09d0128c4..9bf4e5c6f8 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -158,8 +158,8 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/grid_sample_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/instance_norm_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rnn_kernel.cu.cc + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rnn_grad_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rnn_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/ctc_align_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/yolo_box_head_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/stft_grad_kernel.cu diff --git a/backends/metax_gpu/tests/ignore.txt b/backends/metax_gpu/tests/ignore.txt index ce7ec2e362..3459878625 100644 --- 
a/backends/metax_gpu/tests/ignore.txt +++ b/backends/metax_gpu/tests/ignore.txt @@ -71,6 +71,7 @@ test_householder_product test_paddle_device test_conv2d_op test_rnn_op +test_interp_antialias_op.py [internet] test_hapi_amp From 83513421d3ce0e45fca6df98026e5a0c115d1d2d Mon Sep 17 00:00:00 2001 From: duqimeng <1640472053@qq.com> Date: Mon, 16 Mar 2026 19:06:55 +0800 Subject: [PATCH 5/8] [Metax] Fix error rnn --- backends/metax_gpu/tests/ignore.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/metax_gpu/tests/ignore.txt b/backends/metax_gpu/tests/ignore.txt index 3459878625..3ec7440dad 100644 --- a/backends/metax_gpu/tests/ignore.txt +++ b/backends/metax_gpu/tests/ignore.txt @@ -71,7 +71,7 @@ test_householder_product test_paddle_device test_conv2d_op test_rnn_op -test_interp_antialias_op.py +test_interp_antialias_op [internet] test_hapi_amp From adcdeb2a67950d503dd4663ad4d0b08464ee38a2 Mon Sep 17 00:00:00 2001 From: duqimeng <1640472053@qq.com> Date: Tue, 17 Mar 2026 10:25:19 +0800 Subject: [PATCH 6/8] [Metax] update paddle --- Paddle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Paddle b/Paddle index 0025cdc953..56be465924 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit 0025cdc95377be8a334ec943d35a33ef42edcfdf +Subproject commit 56be465924264e1251cf127dbff56d17a7554d01 From 4459c6e604aa14aa60e2b248bbfd444c3bd7169b Mon Sep 17 00:00:00 2001 From: duqimeng <1640472053@qq.com> Date: Mon, 30 Mar 2026 13:55:48 +0800 Subject: [PATCH 7/8] [Metax] Fix Bug --- backends/metax_gpu/CMakeLists.txt | 2 +- .../cuda_kernels/sign_kernel_register.cu | 8 +- .../layer_norm_grad_kernel_register.cu | 260 ++++-- .../layer_norm_kernel_register.cu | 508 ++++++----- .../custom_kernel/lstm_kernel_register.cu | 788 +++++++++++++++++- .../kernels/impl/addmm_kernel_impl.h | 11 +- .../kernels/impl/layer_norm_impl.cu.h | 1 + 7 files changed, 1319 insertions(+), 259 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 9bf4e5c6f8..60b43d6363 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -290,7 +290,7 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/unsqueeze_kernel.cc ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/squeeze_grad_kernel.cc ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/squeeze_kernel.cc - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/sign_kernel.cu.cc + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/sign_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/split_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/scatter_nd_add_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/scatter_nd_add_grad_kernel.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/sign_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/sign_kernel_register.cu index 52e0f9d256..8e37408da9 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/sign_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/sign_kernel_register.cu @@ -27,7 +27,7 @@ PD_CUSTOM_KERNEL_REGISTER(sign, int64_t, float, double, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} diff --git a/backends/metax_gpu/kernels/custom_kernel/layer_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/layer_norm_grad_kernel_register.cu index 857dcb6d52..dd727cd21a 100644 --- a/backends/metax_gpu/kernels/custom_kernel/layer_norm_grad_kernel_register.cu +++ 
b/backends/metax_gpu/kernels/custom_kernel/layer_norm_grad_kernel_register.cu @@ -14,48 +14,99 @@ #include "funcs/layer_norm_util.h" #include "impl/layer_norm_impl.cu.h" -#include "paddle/common/flags.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cast_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/layer_norm_grad_kernel.h" +#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(_WIN32) +#include "paddle/phi/kernels/funcs/fast_ln_v2.h" +#endif +#ifdef PADDLE_WITH_CUDA +#include "paddle/phi/kernels/gpu/rms_norm_cuda_kernel.h" +#endif namespace phi { +enum class LayerNormGadKernelVariant { FAST_LN_V2, GENERIC }; +static inline LayerNormGadKernelVariant LayerNormGradKernelDispatch( + const paddle::DataType weight_type, + const paddle::DataType input_type, + const paddle::DataType output_type, + const paddle::DataType compute_type, + const uint32_t hidden_size, + const int64_t x_numel, + const DenseTensor* scale, + const DenseTensor* bias) { +#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(_WIN32) + if (scale != nullptr && bias != nullptr && + input_type != paddle::DataType::FLOAT32 && hidden_size != 4096 && + hidden_size > 1024 && hidden_size <= 10240 && + x_numel <= std::numeric_limits::max()) { + // using fast_ln_v2 only sm > 70 and x_numel <= uint32_max + auto prop = funcs::fast_ln_v2::GetDeviceProp(); + if (prop->major > 7 && + funcs::fast_ln_v2::has_fast_ln_v2_bwd_kernel( + weight_type, input_type, output_type, compute_type, hidden_size)) { + return LayerNormGadKernelVariant::FAST_LN_V2; + } + } +#endif + return LayerNormGadKernelVariant::GENERIC; +} template -void LayerNormGradKernel(const Context &dev_ctx, - const DenseTensor &x, - const paddle::optional &scale_opt, - const paddle::optional &bias_opt, - const DenseTensor &mean, - const DenseTensor &variance, - const DenseTensor &out_grad, - float epsilon, +void LayerNormGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const optional& scale_opt, + const optional& bias_opt, + const DenseTensor& mean, + const DenseTensor& variance, + const DenseTensor& out_grad, + double epsilon, int begin_norm_axis, - DenseTensor *x_grad, - DenseTensor *scale_grad, - DenseTensor *bias_grad) { - using U = phi::funcs::LayerNormParamType; + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad) { + if (x.numel() == 0) { + dev_ctx.template Alloc(x_grad); + if (scale_grad) { + Full(dev_ctx, scale_grad->dims(), 0, scale_grad); + if (scale_opt.get_ptr() && x.dtype() != scale_opt.get().dtype()) { + CastKernel( + dev_ctx, *scale_grad, scale_opt.get().dtype(), scale_grad); + } + } + if (bias_grad) { + Full(dev_ctx, bias_grad->dims(), 0, bias_grad); + if (bias_opt.get_ptr() && x.dtype() != bias_opt.get().dtype()) { + CastKernel( + dev_ctx, *bias_grad, bias_opt.get().dtype(), bias_grad); + } + } + return; + } + using U = funcs::LayerNormParamType; // d_x, d_scale, d_bias may be nullptr - auto *d_x = x_grad; - auto *d_scale = scale_grad; - auto *d_bias = bias_grad; + auto* d_x = x_grad; + auto* d_scale = scale_grad; + auto* d_bias = bias_grad; - auto *scale = scale_opt.get_ptr(); - auto *bias = bias_opt.get_ptr(); - auto *d_y = &out_grad; + auto* scale = scale_opt.get_ptr(); + auto* bias = bias_opt.get_ptr(); + auto* d_y = &out_grad; - const auto &x_dims = x.dims(); + const auto& x_dims = x.dims(); auto matrix_dim = common::flatten_to_2d(x_dims, begin_norm_axis); int64_t batch_size = 
static_cast(matrix_dim[0]); int64_t feature_size = static_cast(matrix_dim[1]); - auto *x_data = x.data(); - auto *d_y_data = d_y->data(); + auto* x_data = x.data(); + auto* d_y_data = d_y->data(); - auto *mean_data = mean.data(); - auto *var_data = variance.data(); + auto* mean_data = mean.data(); + auto* var_data = variance.data(); - auto *d_x_data = (d_x == nullptr ? nullptr : dev_ctx.template Alloc(d_x)); + auto* d_x_data = (d_x == nullptr ? nullptr : dev_ctx.template Alloc(d_x)); auto x_dtype = x.dtype(); @@ -74,52 +125,165 @@ void LayerNormGradKernel(const Context &dev_ctx, #define PADDLE_LAUNCH_LAYERNORM_BWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ do { \ - auto *scale_data = \ + auto* scale_data = \ (scale == nullptr ? nullptr : scale->data()); \ - auto *d_scale_data = \ + auto* d_scale_data = \ (d_scale == nullptr ? nullptr \ : dev_ctx.template Alloc(d_scale)); \ - auto *d_bias_data = \ + auto* d_bias_data = \ (d_bias == nullptr ? nullptr \ : dev_ctx.template Alloc(d_bias)); \ - auto *d_x_data = \ + auto* d_x_data = \ (d_x == nullptr ? nullptr : dev_ctx.template Alloc(d_x)); \ - phi::funcs::LayerNormBackward( \ - x_data, \ - d_y_data, \ - scale_data, \ - mean_data, \ - var_data, \ - d_x_data, \ - d_scale_data, \ - d_bias_data, \ - epsilon, \ - batch_size, \ - feature_size, \ - dev_ctx); \ + funcs::LayerNormBackward(x_data, \ + d_y_data, \ + scale_data, \ + mean_data, \ + var_data, \ + d_x_data, \ + d_scale_data, \ + d_bias_data, \ + epsilon, \ + batch_size, \ + feature_size, \ + dev_ctx); \ } while (0) - if (scale_bias_dtype == x_dtype) { - PADDLE_LAUNCH_LAYERNORM_BWD(T, true); - } else { - PADDLE_LAUNCH_LAYERNORM_BWD(U, false); +#define PADDLE_LAUNCH_FAST_LAYERNORM_V2_BWD(ScaleBiasT) \ + do { \ + auto stream = dev_ctx.stream(); \ + auto place = x.place(); \ + auto* scale_data = \ + (scale == nullptr ? nullptr : scale->data()); \ + auto* d_scale_data = \ + (d_scale == nullptr ? nullptr \ + : dev_ctx.template Alloc(d_scale)); \ + auto* d_bias_data = \ + (d_bias == nullptr ? nullptr \ + : dev_ctx.template Alloc(d_bias)); \ + auto* d_x_data = \ + (d_x == nullptr ? nullptr : dev_ctx.template Alloc(d_x)); \ + funcs::fast_ln_v2::LaunchNormBwd(dev_ctx, \ + stream, \ + place, \ + x_data, \ + scale_data, \ + mean_data, \ + var_data, \ + d_y_data, \ + d_x_data, \ + d_scale_data, \ + d_bias_data, \ + scale_bias_dtype, \ + x_dtype, \ + x_grad->dtype(), \ + compute_dtype, \ + feature_size, \ + batch_size, \ + feature_size, \ + epsilon); \ + } while (0) + + auto compute_dtype = phi::CppTypeToDataType::Type(); + auto kernel_variant = LayerNormGradKernelDispatch(scale_bias_dtype, + x_dtype, + x_dtype, + compute_dtype, + feature_size, + x.numel(), + scale, + bias); + switch (kernel_variant) { +#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(_WIN32) + case LayerNormGadKernelVariant::FAST_LN_V2: + if (scale_bias_dtype == x_dtype) { + PADDLE_LAUNCH_FAST_LAYERNORM_V2_BWD(T); + } else { + PADDLE_LAUNCH_FAST_LAYERNORM_V2_BWD(U); + } + break; +#endif + case LayerNormGadKernelVariant::GENERIC: + default: +#ifdef PADDLE_WITH_CUDA + if ((FLAGS_use_accuracy_compatible_kernel || + (!isPowerOfTwo(feature_size) && feature_size > 1024)) && + scale_bias_dtype == x_dtype) { + auto* scale_data = (scale == nullptr ? nullptr : scale->data()); + auto* d_scale_data = + (d_scale == nullptr ? nullptr : dev_ctx.template Alloc(d_scale)); + auto* d_bias_data = + (d_bias == nullptr ? nullptr : dev_ctx.template Alloc(d_bias)); + auto* d_x_data = + (d_x == nullptr ? 
nullptr : dev_ctx.template Alloc(d_x)); + LayerNormBwdCompatKernel(dev_ctx, + d_y_data, + x_data, + scale_data, + mean_data, + var_data, + d_x_data, + d_scale_data, + d_bias_data, + epsilon, + batch_size, + feature_size); + } else { +#endif + if (scale_bias_dtype == x_dtype) { + PADDLE_LAUNCH_LAYERNORM_BWD(T, true); + } else { + PADDLE_LAUNCH_LAYERNORM_BWD(U, false); + } +#ifdef PADDLE_WITH_CUDA + } +#endif } #undef PADDLE_LAUNCH_LAYERNORM_BWD +#undef PADDLE_LAUNCH_FAST_LAYERNORM_V2_BWD } } // namespace phi +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_PLUGIN_KERNEL(layer_norm_grad, + metax_gpu, + ALL_LAYOUT, + phi::LayerNormGradKernel, + float, + phi::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + } +} +#elif CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_PLUGIN_KERNEL(layer_norm_grad, + metax_gpu, + ALL_LAYOUT, + phi::LayerNormGradKernel, + float, + double, + phi::float16, + phi::bfloat16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + } +} +#else PD_REGISTER_PLUGIN_KERNEL(layer_norm_grad, metax_gpu, ALL_LAYOUT, phi::LayerNormGradKernel, float, double, - phi::dtype::float16, - phi::dtype::bfloat16) { + phi::float16) { if (kernel_key.dtype() == phi::DataType::FLOAT16) { kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); } } +#endif diff --git a/backends/metax_gpu/kernels/custom_kernel/layer_norm_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/layer_norm_kernel_register.cu index d5e5901784..7fc6bb50c5 100644 --- a/backends/metax_gpu/kernels/custom_kernel/layer_norm_kernel_register.cu +++ b/backends/metax_gpu/kernels/custom_kernel/layer_norm_kernel_register.cu @@ -12,20 +12,30 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "funcs/layer_norm_util.h" -#include "impl/layer_norm_impl.cu.h" +// #include "kernels/funcs/layer_norm_util.h" #include "paddle/common/flags.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/layer_norm_kernel.h" - +#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(_WIN32) +#include "paddle/phi/kernels/funcs/fast_ln_v2.h" +#endif +// #include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h" +// #include "paddle/phi/kernels/funcs/layer_norm_util.h" +#include "funcs/layer_norm_util.h" +#include "impl/layer_norm_impl.cu.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/phi/kernels/gpu/rms_norm_cuda_kernel.h" +#endif COMMON_DECLARE_bool(use_fast_math); namespace phi { +enum class LayerNormKernelVariant { FAST_LN_V1, FAST_LN_V2, GENERIC }; + #ifdef PADDLE_WITH_CUDA template -__device__ inline void WelfordOnline(U val, U *mean, U *square, U *count) { +__device__ inline void WelfordOnline(U val, U* mean, U* square, U* count) { *count += 1; U delta1 = val - *mean; *mean += delta1 / (*count); @@ -35,7 +45,7 @@ __device__ inline void WelfordOnline(U val, U *mean, U *square, U *count) { template __device__ inline void WelfordOnline( - U b_mean, U b_square, U b_cnt, U *mean, U *square, U *count) { + U b_mean, U b_square, U b_cnt, U* mean, U* square, U* count) { if (b_cnt == 0) { return; } @@ -49,7 +59,7 @@ __device__ inline void WelfordOnline( } template -__device__ inline void WelfordWarpAllReduce(U *mean, U *square, U *count) { +__device__ inline void WelfordWarpAllReduce(U* mean, U* square, U* count) { constexpr int kWarpSize = 32; #pragma unroll for (int mask = 1; mask < kWarpSize; mask *= 2) { @@ -68,7 +78,7 @@ template struct ThreadAssigner { __device__ __forceinline__ int operator()(const int cols, const int cols_per_thread, - int32_t *last_tid_idx) { + int32_t* last_tid_idx) { return cols_per_thread; } }; @@ -77,7 +87,7 @@ template <> struct ThreadAssigner<1> { __device__ inline int operator()(const int cols, const int cols_per_thread, - int *last_tid_idx) { + int* last_tid_idx) { int cols_this_thread = cols_per_thread; int last_tid = (cols / cols_per_thread); *last_tid_idx = last_tid; @@ -92,14 +102,14 @@ struct ThreadAssigner<1> { template struct LayerNormDataReader { - __device__ inline void operator()(const T *__restrict__ row_src, - U *buffer, + __device__ inline void operator()(const T* __restrict__ row_src, + U* buffer, const int last_tid_idx, const int read_times, const int cols_this_thread) { - using VecT = phi::AlignedVector; - const VecT *__restrict__ v_src = - reinterpret_cast(row_src); + using VecT = AlignedVector; + const VecT* __restrict__ v_src = + reinterpret_cast(row_src); for (int i = 0; i < read_times; ++i) { VecT temp_src = v_src[threadIdx.x + i * blockDim.x]; @@ -113,8 +123,8 @@ struct LayerNormDataReader { template struct LayerNormDataReader { - __device__ inline void operator()(const T *__restrict__ row_src, - U *buffer, + __device__ inline void operator()(const T* __restrict__ row_src, + U* buffer, const int last_tid_idx, const int read_times, const int cols_this_thread) { @@ -134,10 +144,10 @@ struct LayerNormDataReader { template struct LayerNormDataWriter { __device__ inline void operator()( - T *__restrict__ row_dst, - const U *__restrict__ buffer, - const funcs::LayerNormScaleBiasT *__restrict__ scale, - const funcs::LayerNormScaleBiasT *__restrict__ bias, + T* __restrict__ row_dst, + const U* __restrict__ buffer, + const funcs::LayerNormScaleBiasT* __restrict__ 
scale, + const funcs::LayerNormScaleBiasT* __restrict__ bias, const U row_mean, const U row_inv_var, const int write_times, @@ -145,10 +155,10 @@ struct LayerNormDataWriter { const int last_tid_idx, const bool valid_scale, const bool valid_bias) { - using VecT = phi::AlignedVector; + using VecT = AlignedVector; using ScaleT = funcs::LayerNormScaleBiasT; - using VecScaleT = phi::AlignedVector; - VecT *v_dst = reinterpret_cast(row_dst); + using VecScaleT = AlignedVector; + VecT* v_dst = reinterpret_cast(row_dst); // cols_this_thread is just cols_per_thread if ((!valid_scale) && (!valid_bias)) { @@ -159,16 +169,16 @@ struct LayerNormDataWriter { temp_dst[j] = static_cast((buffer[i * VecSize + j] - row_mean) * row_inv_var); } - v_dst[threadIdx.x + blockDim.x * i] = temp_dst; + v_dst[threadIdx.x + static_cast(blockDim.x) * i] = temp_dst; } } else { - const VecScaleT *__restrict__ v_scale = - reinterpret_cast(scale); - const VecScaleT *__restrict__ v_bias = - reinterpret_cast(bias); + const VecScaleT* __restrict__ v_scale = + reinterpret_cast(scale); + const VecScaleT* __restrict__ v_bias = + reinterpret_cast(bias); if (valid_scale && valid_bias) { for (int i = 0; i < write_times; ++i) { - int idx = threadIdx.x + blockDim.x * i; + int64_t idx = threadIdx.x + static_cast(blockDim.x) * i; VecT temp_dst; VecScaleT temp_v_scale = v_scale[idx]; VecScaleT temp_v_bias = v_bias[idx]; @@ -184,7 +194,7 @@ struct LayerNormDataWriter { } else { if (valid_scale) { for (int i = 0; i < write_times; ++i) { - int idx = threadIdx.x + blockDim.x * i; + int64_t idx = threadIdx.x + static_cast(blockDim.x) * i; VecT temp_dst; VecScaleT temp_v_scale = v_scale[idx]; #pragma unroll @@ -217,10 +227,10 @@ struct LayerNormDataWriter { template struct LayerNormDataWriter { __device__ __forceinline__ void operator()( - T *__restrict__ row_dst, - U *__restrict__ buffer, - const funcs::LayerNormScaleBiasT *__restrict__ scale, - const funcs::LayerNormScaleBiasT *__restrict__ bias, + T* __restrict__ row_dst, + U* __restrict__ buffer, + const funcs::LayerNormScaleBiasT* __restrict__ scale, + const funcs::LayerNormScaleBiasT* __restrict__ bias, const U row_mean, const U row_inv_var, const int write_times, @@ -232,19 +242,19 @@ struct LayerNormDataWriter { if ((!valid_scale) && (!valid_bias)) { if (threadIdx.x < last_tid_idx) { for (int i = 0; i < cols_this_thread; ++i) { - row_dst[threadIdx.x + last_tid_idx * i] = + row_dst[threadIdx.x + static_cast(last_tid_idx) * i] = (buffer[i] - row_mean) * row_inv_var; } } else { for (int i = 0; i < cols_this_thread; ++i) { - row_dst[last_tid_idx * write_times + i] = + row_dst[static_cast(last_tid_idx) * write_times + i] = (buffer[i] - row_mean) * row_inv_var; } } } else if (valid_scale && valid_bias) { if (threadIdx.x < last_tid_idx) { for (int i = 0; i < cols_this_thread; ++i) { - int idx = threadIdx.x + last_tid_idx * i; + int64_t idx = threadIdx.x + static_cast(last_tid_idx) * i; row_dst[idx] = static_cast(static_cast(scale[idx]) * (buffer[i] - row_mean) * row_inv_var + @@ -252,7 +262,7 @@ struct LayerNormDataWriter { } } else { for (int i = 0; i < cols_this_thread; ++i) { - int idx = last_tid_idx * write_times + i; + int64_t idx = static_cast(last_tid_idx) * write_times + i; row_dst[idx] = static_cast(static_cast(scale[idx]) * (buffer[i] - row_mean) * row_inv_var + @@ -263,13 +273,13 @@ struct LayerNormDataWriter { if (valid_scale) { if (threadIdx.x < last_tid_idx) { for (int i = 0; i < cols_this_thread; ++i) { - int idx = threadIdx.x + last_tid_idx * i; + int64_t idx = threadIdx.x + 
static_cast(last_tid_idx) * i; row_dst[idx] = static_cast(static_cast(scale[idx]) * (buffer[i] - row_mean) * row_inv_var); } } else { for (int i = 0; i < cols_this_thread; ++i) { - int idx = last_tid_idx * write_times + i; + int64_t idx = static_cast(last_tid_idx) * write_times + i; row_dst[idx] = static_cast(static_cast(scale[idx]) * (buffer[i] - row_mean) * row_inv_var); } @@ -277,13 +287,13 @@ struct LayerNormDataWriter { } else { if (threadIdx.x < last_tid_idx) { for (int i = 0; i < cols_this_thread; ++i) { - int idx = threadIdx.x + last_tid_idx * i; + int64_t idx = threadIdx.x + static_cast(last_tid_idx) * i; row_dst[idx] = static_cast((buffer[i] - row_mean) * row_inv_var + static_cast(bias[idx])); } } else { for (int i = 0; i < cols_this_thread; ++i) { - int idx = last_tid_idx * write_times + i; + int64_t idx = static_cast(last_tid_idx) * write_times + i; row_dst[idx] = static_cast((buffer[i] - row_mean) * row_inv_var + static_cast(bias[idx])); } @@ -295,12 +305,12 @@ struct LayerNormDataWriter { template __global__ void LayerNormFwdWithWelford( - const T *__restrict__ src_data, - T *dst_data, - const funcs::LayerNormScaleBiasT *__restrict__ scale, - const funcs::LayerNormScaleBiasT *__restrict__ bias, - U *mean, - U *var, + const T* __restrict__ src_data, + T* dst_data, + const funcs::LayerNormScaleBiasT* __restrict__ scale, + const funcs::LayerNormScaleBiasT* __restrict__ bias, + U* mean, + U* var, const U epsilon, const IndexT rows, const int32_t cols, @@ -320,8 +330,8 @@ __global__ void LayerNormFwdWithWelford( U tid_mean = static_cast(0); U tid_square = static_cast(0); - const T *__restrict__ row_src = src_data + row_offset * cols; - T *row_dst = dst_data + row_offset * cols; + const T* __restrict__ row_src = src_data + row_offset * cols; + T* row_dst = dst_data + row_offset * cols; LayerNormDataReader()( row_src, buffer, last_tid_idx, read_times, cols_this_thread); @@ -358,14 +368,14 @@ __global__ void LayerNormFwdWithWelford( } template -void LaunchLayerNormKernel(const Context &dev_ctx, - const T *x_data, - T *y_data, - const void *void_scale_data, - const void *void_bias_data, - U *mean_data, - U *var_data, - float epsilon, +void LaunchLayerNormKernel(const Context& dev_ctx, + const T* x_data, + T* y_data, + const void* void_scale_data, + const void* void_bias_data, + U* mean_data, + U* var_data, + double epsilon, const int64_t rows, const int cols, const bool valid_scale, @@ -389,7 +399,8 @@ void LaunchLayerNormKernel(const Context &dev_ctx, : addr; addr = valid_bias ? (addr | reinterpret_cast(void_bias_data)) : addr; - data_vec_size = phi::GetVectorizedSize(reinterpret_cast(addr)); + data_vec_size = + std::min(4, phi::GetVectorizedSize(reinterpret_cast(addr))); } else { uint64_t bias_addr = reinterpret_cast(void_bias_data); uint64_t attr_addr = valid_scale @@ -399,8 +410,9 @@ void LaunchLayerNormKernel(const Context &dev_ctx, ? (valid_scale ? 
(attr_addr | bias_addr) : attr_addr) : attr_addr; data_vec_size = std::min( - phi::GetVectorizedSize(reinterpret_cast(addr)), - phi::GetVectorizedSize(reinterpret_cast(attr_addr))); + phi::GetVectorizedSize(reinterpret_cast(addr)), + phi::GetVectorizedSize(reinterpret_cast(attr_addr))); + data_vec_size = std::min(4, data_vec_size); } } for (int size = data_vec_size; size > 0; size /= 2) { @@ -417,8 +429,8 @@ void LaunchLayerNormKernel(const Context &dev_ctx, <<>>( \ x_data, \ y_data, \ - static_cast(void_scale_data), \ - static_cast(void_bias_data), \ + static_cast(void_scale_data), \ + static_cast(void_bias_data), \ mean_data, \ var_data, \ static_cast(epsilon), \ @@ -455,22 +467,24 @@ void LaunchLayerNormKernel(const Context &dev_ctx, template void LayerNormDirectCUDAFunctor::operator()( gpuStream_t stream, - const T *input, + const T* input, std::vector input_shape, - const U *bias, - const U *scale, - T *output, - U *mean, - U *variance, + const U* bias, + const U* scale, + T* output, + U* mean, + U* variance, int begin_norm_axis, float eps) { - const auto x_dims = common::make_ddim(input_shape); + const auto x_dims = make_ddim(input_shape); auto matrix_dim = common::flatten_to_2d(x_dims, begin_norm_axis); - int64_t batch_size = static_cast(matrix_dim[0]); - int64_t feature_size = static_cast(matrix_dim[1]); - switch (phi::funcs::GetDesiredBlockDim(feature_size)) { + int64_t batch_size = matrix_dim[0]; + int64_t feature_size = matrix_dim[1]; + // TODO(large-tensor): generic kernel launch uses int32 grid dim + PADDLE_ENFORCE_LE_INT_MAX(batch_size, "batch_size"); + switch (funcs::GetDesiredBlockDim(feature_size)) { FIXED_BLOCK_DIM_CASE( - phi::funcs::LayerNormForward + funcs::LayerNormForward <<>>( input, scale, bias, output, mean, variance, eps, feature_size)); default: @@ -481,38 +495,79 @@ void LayerNormDirectCUDAFunctor::operator()( } } -template class LayerNormDirectCUDAFunctor; -template class LayerNormDirectCUDAFunctor; +template class PADDLE_API LayerNormDirectCUDAFunctor; +template class PADDLE_API LayerNormDirectCUDAFunctor; #if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) -template class LayerNormDirectCUDAFunctor; +template class PADDLE_API LayerNormDirectCUDAFunctor; +#endif +static inline LayerNormKernelVariant LayerNormKernelDispatch( + const paddle::DataType weight_type, + const paddle::DataType input_type, + const paddle::DataType output_type, + const paddle::DataType compute_type, + const uint32_t hidden_size, + const int64_t x_numel, + const DenseTensor* scale, + const DenseTensor* bias) { + if (scale == nullptr || bias == nullptr) { + return LayerNormKernelVariant::GENERIC; + } +#ifdef PADDLE_WITH_CUDA + if (FLAGS_use_accuracy_compatible_kernel) { + return LayerNormKernelVariant::GENERIC; + } +#endif +#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(_WIN32) + if (input_type != paddle::DataType::FLOAT32 && hidden_size != 4096 && + hidden_size > 1024 && hidden_size <= 10240 && + x_numel <= std::numeric_limits::max()) { + // using fast_ln_v2 only sm > 70 and x_numel <= uint32_max + auto prop = funcs::fast_ln_v2::GetDeviceProp(); + if (prop->major > 7 && + funcs::fast_ln_v2::has_fast_ln_v2_fwd_kernel( + weight_type, input_type, output_type, compute_type, hidden_size)) { + return LayerNormKernelVariant::FAST_LN_V2; + } + } #endif + if ((hidden_size >= 768 && hidden_size <= 2048 && hidden_size % 256 == 0 || + hidden_size == 4096) && + x_numel <= std::numeric_limits::max() && scale != nullptr && + bias != nullptr) { + return 
LayerNormKernelVariant::FAST_LN_V1; + } + + return LayerNormKernelVariant::GENERIC; +} template -void LayerNormKernel(const Context &dev_ctx, - const DenseTensor &x, - const paddle::optional &scale_opt, - const paddle::optional &bias_opt, - float epsilon, +void LayerNormKernel(const Context& dev_ctx, + const DenseTensor& x, + const optional& scale_opt, + const optional& bias_opt, + double epsilon, int begin_norm_axis, - DenseTensor *y, - DenseTensor *mean, - DenseTensor *var) { - using U = phi::funcs::LayerNormParamType; - auto *scale = scale_opt.get_ptr(); - auto *bias = bias_opt.get_ptr(); + DenseTensor* y, + DenseTensor* mean, + DenseTensor* var) { + using U = funcs::LayerNormParamType; + auto* scale = scale_opt.get_ptr(); + auto* bias = bias_opt.get_ptr(); const auto x_dims = x.dims(); - auto *x_data = x.data(); - auto *y_data = dev_ctx.template Alloc(y); - auto *mean_data = dev_ctx.template Alloc(mean); - auto *var_data = dev_ctx.template Alloc(var); + auto* x_data = x.data(); + auto* y_data = dev_ctx.template Alloc(y); + auto* mean_data = dev_ctx.template Alloc(mean); + auto* var_data = dev_ctx.template Alloc(var); + if (x.numel() == 0) return; bool valid_scale = (scale != nullptr); bool valid_bias = (bias != nullptr); - auto *void_scale_data = valid_scale ? scale->data() : nullptr; - auto *void_bias_data = valid_bias ? bias->data() : nullptr; + auto* void_scale_data = valid_scale ? scale->data() : nullptr; + auto* void_bias_data = valid_bias ? bias->data() : nullptr; auto x_dtype = x.dtype(); + auto y_dtype = y->dtype(); phi::DataType scale_bias_dtype; if (valid_scale) { scale_bias_dtype = scale->dtype(); @@ -536,33 +591,35 @@ void LayerNormKernel(const Context &dev_ctx, } auto matrix_dim = common::flatten_to_2d(x_dims, begin_norm_axis); - int64_t batch_size = static_cast(matrix_dim[0]); - int64_t feature_size = static_cast(matrix_dim[1]); + int64_t batch_size = matrix_dim[0]; + // TODO(large-tensor): generic kernel launch uses int32 grid dim + PADDLE_ENFORCE_LE_INT_MAX(batch_size, "batch_size"); + int64_t feature_size = matrix_dim[1]; auto stream = dev_ctx.stream(); - -#define PADDLE_LAUNCH_LAYERNORM_FWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ - do { \ - switch (phi::funcs::GetDesiredBlockDim(feature_size)) { \ - FIXED_BLOCK_DIM_CASE( \ - phi::funcs:: \ - LayerNormForward \ - <<>>( \ - x_data, \ - static_cast(void_scale_data), \ - static_cast(void_bias_data), \ - y_data, \ - mean_data, \ - var_data, \ - epsilon, \ - feature_size)); \ - default: \ - PADDLE_THROW(common::errors::InvalidArgument( \ - "Product from begin_norm_axis to end must be larger than 1")); \ - break; \ - } \ + auto place = x.place(); + +#define PADDLE_LAUNCH_LAYERNORM_FWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ + do { \ + switch (funcs::GetDesiredBlockDim(feature_size)) { \ + FIXED_BLOCK_DIM_CASE( \ + funcs::LayerNormForward \ + <<>>( \ + x_data, \ + static_cast(void_scale_data), \ + static_cast(void_bias_data), \ + y_data, \ + mean_data, \ + var_data, \ + epsilon, \ + feature_size)); \ + default: \ + PADDLE_THROW(common::errors::InvalidArgument( \ + "Product from begin_norm_axis to end must be larger than 1")); \ + break; \ + } \ } while (0) -#define PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, feature_size) \ +#define PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, feature_size) \ case (feature_size): { \ constexpr int WARPS_N = feature_size < 1024 ? 
1 : (feature_size / 1024); \ constexpr int WARPS_M = 4 / WARPS_N; \ @@ -573,98 +630,161 @@ void LayerNormKernel(const Context &dev_ctx, const int ROWS_PER_CTA = WARPS_M; \ const int grid = static_cast( \ std::ceil(batch_size / static_cast(ROWS_PER_CTA))); \ - phi::funcs::fast_ln_fwd_kernel \ + funcs::fast_ln_v1::fast_ln_v1_fwd_kernel \ <<>>( \ batch_size, \ feature_size, \ epsilon, \ x_data, \ - static_cast(void_scale_data), \ - static_cast(void_bias_data), \ + static_cast(void_scale_data), \ + static_cast(void_bias_data), \ mean_data, \ var_data, \ y_data); \ } break -#define PADDLE_LAUNCH_FAST_LAYERNORM_FWD(ScaleT) \ - PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 768); \ - PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 1024); \ - PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 1280); \ - PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 1536); \ - PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 1792); \ - PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 2048); \ - PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 4096) - -#ifdef PADDLE_WITH_CUDA - bool can_call_fast_kernel = false; - if ((feature_size >= 768 && feature_size <= 2048 && feature_size % 256 == 0 || - feature_size == 4096) && - scale != nullptr && bias != nullptr) { - can_call_fast_kernel = true; - } - - if (can_call_fast_kernel) { - if (is_scale_bias_same_dtype_with_x) { - switch (feature_size) { - PADDLE_LAUNCH_FAST_LAYERNORM_FWD(T); - default: - PADDLE_THROW(common::errors::InvalidArgument( - "Only when feature_size is from 256 to 4096 and is diviaible by " - "256 is supported " - "now")); - break; - } - } else { - switch (feature_size) { - PADDLE_LAUNCH_FAST_LAYERNORM_FWD(U); - default: - PADDLE_THROW(common::errors::InvalidArgument( - "Only when feature_size is from 256 to 4096 and is diviaible by " - "is supported " - "now")); - break; - } - } - } else { - // WarpShuffle intrinsics is involved in LaunchLayerNormKernel. 
- if (FLAGS_use_fast_math && feature_size <= 1024 && - (!std::is_same::value)) { - LaunchLayerNormKernel(dev_ctx, - x_data, - y_data, - void_scale_data, - void_bias_data, - mean_data, - var_data, - epsilon, - batch_size, - feature_size, - valid_scale, - valid_bias, - is_scale_bias_same_dtype_with_x); - } else { +#define PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD(ScaleT) \ + PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, 768); \ + PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, 1024); \ + PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, 1280); \ + PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, 1536); \ + PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, 1792); \ + PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, 2048); \ + PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, 4096) + auto compute_dtype = phi::CppTypeToDataType::Type(); + auto kernel_variant = LayerNormKernelDispatch(scale_bias_dtype, + x_dtype, + y_dtype, + compute_dtype, + feature_size, + x.numel(), + scale, + bias); + + switch (kernel_variant) { +#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(_WIN32) + case LayerNormKernelVariant::FAST_LN_V2: + funcs::fast_ln_v2::LaunchNormFwd(dev_ctx, + stream, + place, + x_data, + void_scale_data, + void_bias_data, + y_data, + mean_data, + var_data, + scale_bias_dtype, + x_dtype, + y_dtype, + compute_dtype, + feature_size, + batch_size, + feature_size, + epsilon); + break; #endif + case LayerNormKernelVariant::FAST_LN_V1: if (is_scale_bias_same_dtype_with_x) { - PADDLE_LAUNCH_LAYERNORM_FWD(T, true); + switch (feature_size) { + PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD(T); + default: + break; + } } else { - PADDLE_LAUNCH_LAYERNORM_FWD(U, false); + switch (feature_size) { + PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD(U); + default: + break; + } } + break; + case LayerNormKernelVariant::GENERIC: + default: #ifdef PADDLE_WITH_CUDA - } - } + if (FLAGS_use_accuracy_compatible_kernel || + (!isPowerOfTwo(feature_size) && feature_size > 1024)) { + LayerNormFwdCompatKernel( + dev_ctx, + x_data, + valid_scale ? static_cast(void_scale_data) : nullptr, + valid_bias ? static_cast(void_bias_data) : nullptr, + epsilon, + batch_size, + feature_size, + y_data, + mean_data, + var_data); + } else if (FLAGS_use_fast_math && feature_size <= 1024 && + (!std::is_same::value)) { + // WarpShuffle intrinsics is involved in LaunchLayerNormKernel. 
+ LaunchLayerNormKernel(dev_ctx, + x_data, + y_data, + void_scale_data, + void_bias_data, + mean_data, + var_data, + epsilon, + batch_size, + feature_size, + valid_scale, + valid_bias, + is_scale_bias_same_dtype_with_x); + } else { #endif + if (is_scale_bias_same_dtype_with_x) { + PADDLE_LAUNCH_LAYERNORM_FWD(T, true); + } else { + PADDLE_LAUNCH_LAYERNORM_FWD(U, false); + } +#ifdef PADDLE_WITH_CUDA + } +#endif + break; + } #undef PADDLE_LAUNCH_LAYERNORM_FWD -#undef PADDLE_LAUNCH_FAST_LAYERNORM_FWD +#undef PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD } - +#ifdef _WIN32 +template PADDLE_API void LayerNormKernel( + const GPUContext& dev_ctx, + const DenseTensor& x, + const optional& scale_opt, + const optional& bias_opt, + double epsilon, + int begin_norm_axis, + DenseTensor* y, + DenseTensor* mean, + DenseTensor* var); +template PADDLE_API void LayerNormKernel( + const GPUContext& dev_ctx, + const DenseTensor& x, + const optional& scale_opt, + const optional& bias_opt, + double epsilon, + int begin_norm_axis, + DenseTensor* y, + DenseTensor* mean, + DenseTensor* var); +template PADDLE_API void LayerNormKernel( + const GPUContext& dev_ctx, + const DenseTensor& x, + const optional& scale_opt, + const optional& bias_opt, + double epsilon, + int begin_norm_axis, + DenseTensor* y, + DenseTensor* mean, + DenseTensor* var); +#endif } // namespace phi #ifdef PADDLE_WITH_HIP @@ -674,7 +794,7 @@ PD_REGISTER_PLUGIN_KERNEL(layer_norm, ALL_LAYOUT, phi::LayerNormKernel, float, - phi::dtype::float16) { + phi::float16) { kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED); } @@ -685,8 +805,8 @@ PD_REGISTER_PLUGIN_KERNEL(layer_norm, phi::LayerNormKernel, float, double, - phi::dtype::float16, - phi::dtype::bfloat16) { + phi::float16, + phi::bfloat16) { kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED); } @@ -697,7 +817,7 @@ PD_REGISTER_PLUGIN_KERNEL(layer_norm, phi::LayerNormKernel, float, double, - phi::dtype::float16) { + phi::float16) { kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED); } diff --git a/backends/metax_gpu/kernels/custom_kernel/lstm_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/lstm_kernel_register.cu index 15cd9b9238..1b094198dc 100644 --- a/backends/metax_gpu/kernels/custom_kernel/lstm_kernel_register.cu +++ b/backends/metax_gpu/kernels/custom_kernel/lstm_kernel_register.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,8 +12,786 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "kernels/impl/lstm_kernel_impl.h" -#include "paddle/phi/kernels/lstm_kernel.h" +// #include "kernels/funcs/layer_norm_util.h" +#include "paddle/common/flags.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/layer_norm_kernel.h" +#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(_WIN32) +#include "paddle/phi/kernels/funcs/fast_ln_v2.h" +#endif +#include "funcs/layer_norm_util.h" +#include "impl/layer_norm_impl.cu.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/phi/kernels/gpu/rms_norm_cuda_kernel.h" +#endif +COMMON_DECLARE_bool(use_fast_math); -PD_REGISTER_PLUGIN_KERNEL( - lstm, metax_gpu, ALL_LAYOUT, phi::LSTMKernel, float, double) {} +namespace phi { + +enum class LayerNormKernelVariant { FAST_LN_V1, FAST_LN_V2, GENERIC }; + +#ifdef PADDLE_WITH_CUDA +template +__device__ inline void WelfordOnline(U val, U* mean, U* square, U* count) { + *count += 1; + U delta1 = val - *mean; + *mean += delta1 / (*count); + U delta2 = val - *mean; + *square += delta1 * delta2; +} + +template +__device__ inline void WelfordOnline( + U b_mean, U b_square, U b_cnt, U* mean, U* square, U* count) { + if (b_cnt == 0) { + return; + } + + U new_cnt = *count + b_cnt; + U nb_n = b_cnt / new_cnt; + U delta = b_mean - *mean; + *mean += delta * nb_n; + *square += b_square + delta * delta * (*count) * nb_n; + *count = new_cnt; +} + +template +__device__ inline void WelfordWarpAllReduce(U* mean, U* square, U* count) { + constexpr int kWarpSize = 32; +#pragma unroll + for (int mask = 1; mask < kWarpSize; mask *= 2) { + U b_mean = __shfl_down_sync(0xffffffff, *mean, mask); + U b_square = __shfl_down_sync(0xffffffff, *square, mask); + U b_cnt = __shfl_down_sync(0xffffffff, *count, mask); + WelfordOnline(b_mean, b_square, b_cnt, mean, square, count); + } + + *mean = __shfl_sync(0xffffffff, *mean, 0, kWarpSize); + *square = __shfl_sync(0xffffffff, *square, 0, kWarpSize); + *count = __shfl_sync(0xffffffff, *count, 0, kWarpSize); +} + +template +struct ThreadAssigner { + __device__ __forceinline__ int operator()(const int cols, + const int cols_per_thread, + int32_t* last_tid_idx) { + return cols_per_thread; + } +}; + +template <> +struct ThreadAssigner<1> { + __device__ inline int operator()(const int cols, + const int cols_per_thread, + int* last_tid_idx) { + int cols_this_thread = cols_per_thread; + int last_tid = (cols / cols_per_thread); + *last_tid_idx = last_tid; + if (threadIdx.x == last_tid) { + cols_this_thread = cols - cols_per_thread * last_tid; + } else if (threadIdx.x > last_tid) { + cols_this_thread = 0; + } + return cols_this_thread; + } +}; + +template +struct LayerNormDataReader { + __device__ inline void operator()(const T* __restrict__ row_src, + U* buffer, + const int last_tid_idx, + const int read_times, + const int cols_this_thread) { + using VecT = AlignedVector; + const VecT* __restrict__ v_src = + reinterpret_cast(row_src); + + for (int i = 0; i < read_times; ++i) { + VecT temp_src = v_src[threadIdx.x + i * blockDim.x]; +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + buffer[i * VecSize + j] = static_cast(temp_src[j]); + } + } + } +}; + +template +struct LayerNormDataReader { + __device__ inline void operator()(const T* __restrict__ row_src, + U* buffer, + const int last_tid_idx, + const int read_times, + const int cols_this_thread) { + // read_time is just cols_per_thread while VecSize is 1. 
+ if (threadIdx.x < last_tid_idx) { + for (int i = 0; i < cols_this_thread; ++i) { + buffer[i] = static_cast(row_src[threadIdx.x + last_tid_idx * i]); + } + } else { + for (int i = 0; i < cols_this_thread; ++i) { + buffer[i] = static_cast(row_src[i + read_times * last_tid_idx]); + } + } + } +}; + +template +struct LayerNormDataWriter { + __device__ inline void operator()( + T* __restrict__ row_dst, + const U* __restrict__ buffer, + const funcs::LayerNormScaleBiasT* __restrict__ scale, + const funcs::LayerNormScaleBiasT* __restrict__ bias, + const U row_mean, + const U row_inv_var, + const int write_times, + const int cols_this_thread, + const int last_tid_idx, + const bool valid_scale, + const bool valid_bias) { + using VecT = AlignedVector; + using ScaleT = funcs::LayerNormScaleBiasT; + using VecScaleT = AlignedVector; + VecT* v_dst = reinterpret_cast(row_dst); + + // cols_this_thread is just cols_per_thread + if ((!valid_scale) && (!valid_bias)) { + for (int i = 0; i < write_times; ++i) { + VecT temp_dst; +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + temp_dst[j] = static_cast((buffer[i * VecSize + j] - row_mean) * + row_inv_var); + } + v_dst[threadIdx.x + static_cast(blockDim.x) * i] = temp_dst; + } + } else { + const VecScaleT* __restrict__ v_scale = + reinterpret_cast(scale); + const VecScaleT* __restrict__ v_bias = + reinterpret_cast(bias); + if (valid_scale && valid_bias) { + for (int i = 0; i < write_times; ++i) { + int64_t idx = threadIdx.x + static_cast(blockDim.x) * i; + VecT temp_dst; + VecScaleT temp_v_scale = v_scale[idx]; + VecScaleT temp_v_bias = v_bias[idx]; +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + temp_dst[j] = static_cast( + static_cast(temp_v_scale[j]) * + (buffer[i * VecSize + j] - row_mean) * row_inv_var + + static_cast(temp_v_bias[j])); + } + v_dst[idx] = temp_dst; + } + } else { + if (valid_scale) { + for (int i = 0; i < write_times; ++i) { + int64_t idx = threadIdx.x + static_cast(blockDim.x) * i; + VecT temp_dst; + VecScaleT temp_v_scale = v_scale[idx]; +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + temp_dst[j] = static_cast( + static_cast(temp_v_scale[j]) * + (buffer[i * VecSize + j] - row_mean) * row_inv_var); + } + v_dst[idx] = temp_dst; + } + } else { + for (int i = 0; i < write_times; ++i) { + int idx = threadIdx.x + blockDim.x * i; + VecT temp_dst; + VecScaleT temp_v_bias = v_bias[idx]; +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + temp_dst[j] = static_cast( + (buffer[i * VecSize + j] - row_mean) * row_inv_var + + static_cast(temp_v_bias[j])); + } + v_dst[idx] = temp_dst; + } + } + } + } + } +}; + +template +struct LayerNormDataWriter { + __device__ __forceinline__ void operator()( + T* __restrict__ row_dst, + U* __restrict__ buffer, + const funcs::LayerNormScaleBiasT* __restrict__ scale, + const funcs::LayerNormScaleBiasT* __restrict__ bias, + const U row_mean, + const U row_inv_var, + const int write_times, + const int cols_this_thread, + const int last_tid_idx, + const bool valid_scale, + const bool valid_bias) { + // write_times is just col_per_thread. 
+ if ((!valid_scale) && (!valid_bias)) { + if (threadIdx.x < last_tid_idx) { + for (int i = 0; i < cols_this_thread; ++i) { + row_dst[threadIdx.x + static_cast(last_tid_idx) * i] = + (buffer[i] - row_mean) * row_inv_var; + } + } else { + for (int i = 0; i < cols_this_thread; ++i) { + row_dst[static_cast(last_tid_idx) * write_times + i] = + (buffer[i] - row_mean) * row_inv_var; + } + } + } else if (valid_scale && valid_bias) { + if (threadIdx.x < last_tid_idx) { + for (int i = 0; i < cols_this_thread; ++i) { + int64_t idx = threadIdx.x + static_cast(last_tid_idx) * i; + row_dst[idx] = + static_cast(static_cast(scale[idx]) * + (buffer[i] - row_mean) * row_inv_var + + static_cast(bias[idx])); + } + } else { + for (int i = 0; i < cols_this_thread; ++i) { + int64_t idx = static_cast(last_tid_idx) * write_times + i; + row_dst[idx] = + static_cast(static_cast(scale[idx]) * + (buffer[i] - row_mean) * row_inv_var + + static_cast(bias[idx])); + } + } + } else { + if (valid_scale) { + if (threadIdx.x < last_tid_idx) { + for (int i = 0; i < cols_this_thread; ++i) { + int64_t idx = threadIdx.x + static_cast(last_tid_idx) * i; + row_dst[idx] = static_cast(static_cast(scale[idx]) * + (buffer[i] - row_mean) * row_inv_var); + } + } else { + for (int i = 0; i < cols_this_thread; ++i) { + int64_t idx = static_cast(last_tid_idx) * write_times + i; + row_dst[idx] = static_cast(static_cast(scale[idx]) * + (buffer[i] - row_mean) * row_inv_var); + } + } + } else { + if (threadIdx.x < last_tid_idx) { + for (int i = 0; i < cols_this_thread; ++i) { + int64_t idx = threadIdx.x + static_cast(last_tid_idx) * i; + row_dst[idx] = static_cast((buffer[i] - row_mean) * row_inv_var + + static_cast(bias[idx])); + } + } else { + for (int i = 0; i < cols_this_thread; ++i) { + int64_t idx = static_cast(last_tid_idx) * write_times + i; + row_dst[idx] = static_cast((buffer[i] - row_mean) * row_inv_var + + static_cast(bias[idx])); + } + } + } + } + } +}; + +template +__global__ void LayerNormFwdWithWelford( + const T* __restrict__ src_data, + T* dst_data, + const funcs::LayerNormScaleBiasT* __restrict__ scale, + const funcs::LayerNormScaleBiasT* __restrict__ bias, + U* mean, + U* var, + const U epsilon, + const IndexT rows, + const int32_t cols, + const int32_t cols_per_thread, + const bool valid_scale, + const bool valid_bias) { + constexpr int kWarpSize = 32; + int last_tid_idx = 0; // For condition once vecSize is 1. + IndexT row_offset = blockIdx.x * blockDim.y + threadIdx.y; + int cols_this_thread = + ThreadAssigner()(cols, cols_per_thread, &last_tid_idx); + int read_times = cols_per_thread / VecSize; + + if (row_offset < rows) { + U buffer[kWarpSize]; + U tid_cnt = static_cast(0); + U tid_mean = static_cast(0); + U tid_square = static_cast(0); + + const T* __restrict__ row_src = src_data + row_offset * cols; + T* row_dst = dst_data + row_offset * cols; + LayerNormDataReader()( + row_src, buffer, last_tid_idx, read_times, cols_this_thread); + + for (int i = 0; i < cols_this_thread; i++) { + WelfordOnline(buffer[i], &tid_mean, &tid_square, &tid_cnt); + } + + U warp_cnt = tid_cnt; + U warp_mean = tid_mean; + U warp_square = tid_square; + WelfordWarpAllReduce(&warp_mean, &warp_square, &warp_cnt); + + U row_variance = max(warp_square / warp_cnt, 0.f); + U row_inv_var = funcs::rsqrt_(row_variance + epsilon); + + // TODO(limingshu): make code below vectorization. + if (threadIdx.x == 0) { + // warp_mean is just row_mean here. 
+ mean[row_offset] = warp_mean; + var[row_offset] = row_variance; + } + LayerNormDataWriter()(row_dst, + buffer, + scale, + bias, + warp_mean, + row_inv_var, + read_times, + cols_this_thread, + last_tid_idx, + valid_scale, + valid_bias); + } +} + +template +void LaunchLayerNormKernel(const Context& dev_ctx, + const T* x_data, + T* y_data, + const void* void_scale_data, + const void* void_bias_data, + U* mean_data, + U* var_data, + double epsilon, + const int64_t rows, + const int cols, + const bool valid_scale, + const bool valid_bias, + const bool is_same_type) { + constexpr int WarpSize = 32; + constexpr int RowPerBlock = 4; + int64_t block_size = (rows + (RowPerBlock - 1)) / RowPerBlock; + dim3 threads(WarpSize, RowPerBlock, 1); + + int vec_size = 1; + int cols_per_thread = (cols + (WarpSize - 1)) / WarpSize; + if (cols_per_thread > 1 && (cols % WarpSize == 0)) { + int data_vec_size = 0; + uint64_t addr = (reinterpret_cast(x_data) | + reinterpret_cast(y_data)); + if (valid_bias || valid_scale) { + if (is_same_type) { + addr = valid_scale + ? (addr | reinterpret_cast(void_scale_data)) + : addr; + addr = valid_bias ? (addr | reinterpret_cast(void_bias_data)) + : addr; + data_vec_size = + std::min(4, phi::GetVectorizedSize(reinterpret_cast(addr))); + } else { + uint64_t bias_addr = reinterpret_cast(void_bias_data); + uint64_t attr_addr = valid_scale + ? reinterpret_cast(void_scale_data) + : bias_addr; + attr_addr = valid_bias + ? (valid_scale ? (attr_addr | bias_addr) : attr_addr) + : attr_addr; + data_vec_size = std::min( + phi::GetVectorizedSize(reinterpret_cast(addr)), + phi::GetVectorizedSize(reinterpret_cast(attr_addr))); + data_vec_size = std::min(4, data_vec_size); + } + } + for (int size = data_vec_size; size > 0; size /= 2) { + if (cols_per_thread % size == 0) { + vec_size = size; + break; + } + } + } + +#define IMPL_LAYER_NORM_WELFORD_CASE(index_t, scale_t, is_same_, vec_size_) \ + case (vec_size_): { \ + LayerNormFwdWithWelford \ + <<>>( \ + x_data, \ + y_data, \ + static_cast(void_scale_data), \ + static_cast(void_bias_data), \ + mean_data, \ + var_data, \ + static_cast(epsilon), \ + rows, \ + cols, \ + cols_per_thread, \ + valid_scale, \ + valid_bias); \ + } break + +#define IMPL_LAYER_NORM_WELFORD(index_t, scale_t, is_same_) \ + IMPL_LAYER_NORM_WELFORD_CASE(index_t, scale_t, is_same_, 4); \ + IMPL_LAYER_NORM_WELFORD_CASE(index_t, scale_t, is_same_, 2); \ + IMPL_LAYER_NORM_WELFORD_CASE(index_t, scale_t, is_same_, 1); + + if (rows < std::numeric_limits::max()) { + if (is_same_type) { + switch (vec_size) { IMPL_LAYER_NORM_WELFORD(int32_t, T, true); } + } else { + switch (vec_size) { IMPL_LAYER_NORM_WELFORD(int32_t, U, false); } + } + } else { + if (is_same_type) { + switch (vec_size) { IMPL_LAYER_NORM_WELFORD(int64_t, T, true); } + } else { + switch (vec_size) { IMPL_LAYER_NORM_WELFORD(int64_t, U, false); } + } + } +#undef IMPL_LAYER_NORM_WELFORD_CASE +#undef IMPL_LAYER_NORM_WELFORD +} +#endif // PADDLE_WITH_CUDA + +template +void LayerNormDirectCUDAFunctor::operator()( + gpuStream_t stream, + const T* input, + std::vector input_shape, + const U* bias, + const U* scale, + T* output, + U* mean, + U* variance, + int begin_norm_axis, + float eps) { + const auto x_dims = make_ddim(input_shape); + auto matrix_dim = common::flatten_to_2d(x_dims, begin_norm_axis); + int64_t batch_size = matrix_dim[0]; + int64_t feature_size = matrix_dim[1]; + // TODO(large-tensor): generic kernel launch uses int32 grid dim + PADDLE_ENFORCE_LE_INT_MAX(batch_size, "batch_size"); + switch 
(funcs::GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE( + funcs::LayerNormForward + <<>>( + input, scale, bias, output, mean, variance, eps, feature_size)); + default: + PADDLE_THROW(common::errors::InvalidArgument( + "Product from begin_norm_axis to end in layer_norm must be larger " + "than 1")); + break; + } +} + +template class PADDLE_API LayerNormDirectCUDAFunctor; +template class PADDLE_API LayerNormDirectCUDAFunctor; +#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +template class PADDLE_API LayerNormDirectCUDAFunctor; +#endif +static inline LayerNormKernelVariant LayerNormKernelDispatch( + const paddle::DataType weight_type, + const paddle::DataType input_type, + const paddle::DataType output_type, + const paddle::DataType compute_type, + const uint32_t hidden_size, + const int64_t x_numel, + const DenseTensor* scale, + const DenseTensor* bias) { + if (scale == nullptr || bias == nullptr) { + return LayerNormKernelVariant::GENERIC; + } +#ifdef PADDLE_WITH_CUDA + if (FLAGS_use_accuracy_compatible_kernel) { + return LayerNormKernelVariant::GENERIC; + } +#endif +#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(_WIN32) + if (input_type != paddle::DataType::FLOAT32 && hidden_size != 4096 && + hidden_size > 1024 && hidden_size <= 10240 && + x_numel <= std::numeric_limits::max()) { + // using fast_ln_v2 only sm > 70 and x_numel <= uint32_max + auto prop = funcs::fast_ln_v2::GetDeviceProp(); + if (prop->major > 7 && + funcs::fast_ln_v2::has_fast_ln_v2_fwd_kernel( + weight_type, input_type, output_type, compute_type, hidden_size)) { + return LayerNormKernelVariant::FAST_LN_V2; + } + } +#endif + if ((hidden_size >= 768 && hidden_size <= 2048 && hidden_size % 256 == 0 || + hidden_size == 4096) && + x_numel <= std::numeric_limits::max() && scale != nullptr && + bias != nullptr) { + return LayerNormKernelVariant::FAST_LN_V1; + } + + return LayerNormKernelVariant::GENERIC; +} + +template +void LayerNormKernel(const Context& dev_ctx, + const DenseTensor& x, + const optional& scale_opt, + const optional& bias_opt, + double epsilon, + int begin_norm_axis, + DenseTensor* y, + DenseTensor* mean, + DenseTensor* var) { + using U = funcs::LayerNormParamType; + auto* scale = scale_opt.get_ptr(); + auto* bias = bias_opt.get_ptr(); + + const auto x_dims = x.dims(); + auto* x_data = x.data(); + auto* y_data = dev_ctx.template Alloc(y); + auto* mean_data = dev_ctx.template Alloc(mean); + auto* var_data = dev_ctx.template Alloc(var); + if (x.numel() == 0) return; + + bool valid_scale = (scale != nullptr); + bool valid_bias = (bias != nullptr); + auto* void_scale_data = valid_scale ? scale->data() : nullptr; + auto* void_bias_data = valid_bias ? bias->data() : nullptr; + + auto x_dtype = x.dtype(); + auto y_dtype = y->dtype(); + phi::DataType scale_bias_dtype; + if (valid_scale) { + scale_bias_dtype = scale->dtype(); + if (valid_bias) { + PADDLE_ENFORCE_EQ(scale->dtype(), + bias->dtype(), + common::errors::InvalidArgument( + "This Scale and Bias of layer_norm op " + "should have the same data type.")); + } + } else { + scale_bias_dtype = valid_bias ? 
bias->dtype() : x_dtype; + } + + bool is_scale_bias_same_dtype_with_x = x_dtype == scale_bias_dtype; + if (!is_scale_bias_same_dtype_with_x) { + PADDLE_ENFORCE_EQ(scale_bias_dtype, + phi::CppTypeToDataType::Type(), + common::errors::InvalidArgument( + "Unsupported data type of Scale and Bias")); + } + + auto matrix_dim = common::flatten_to_2d(x_dims, begin_norm_axis); + int64_t batch_size = matrix_dim[0]; + // TODO(large-tensor): generic kernel launch uses int32 grid dim + PADDLE_ENFORCE_LE_INT_MAX(batch_size, "batch_size"); + int64_t feature_size = matrix_dim[1]; + auto stream = dev_ctx.stream(); + auto place = x.place(); + +#define PADDLE_LAUNCH_LAYERNORM_FWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ + do { \ + switch (funcs::GetDesiredBlockDim(feature_size)) { \ + FIXED_BLOCK_DIM_CASE( \ + funcs::LayerNormForward \ + <<>>( \ + x_data, \ + static_cast(void_scale_data), \ + static_cast(void_bias_data), \ + y_data, \ + mean_data, \ + var_data, \ + epsilon, \ + feature_size)); \ + default: \ + PADDLE_THROW(common::errors::InvalidArgument( \ + "Product from begin_norm_axis to end must be larger than 1")); \ + break; \ + } \ + } while (0) + +#define PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, feature_size) \ + case (feature_size): { \ + constexpr int WARPS_N = feature_size < 1024 ? 1 : (feature_size / 1024); \ + constexpr int WARPS_M = 4 / WARPS_N; \ + const int THREADS_PER_WARP = 32; \ + const int BYTES_PER_LDG = 16; \ + const int VecSize = BYTES_PER_LDG / sizeof(T); \ + const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; \ + const int ROWS_PER_CTA = WARPS_M; \ + const int grid = static_cast( \ + std::ceil(batch_size / static_cast(ROWS_PER_CTA))); \ + funcs::fast_ln_v1::fast_ln_v1_fwd_kernel \ + <<>>( \ + batch_size, \ + feature_size, \ + epsilon, \ + x_data, \ + static_cast(void_scale_data), \ + static_cast(void_bias_data), \ + mean_data, \ + var_data, \ + y_data); \ + } break + +#define PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD(ScaleT) \ + PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, 768); \ + PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, 1024); \ + PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, 1280); \ + PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, 1536); \ + PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, 1792); \ + PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, 2048); \ + PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, 4096) + auto compute_dtype = phi::CppTypeToDataType::Type(); + auto kernel_variant = LayerNormKernelDispatch(scale_bias_dtype, + x_dtype, + y_dtype, + compute_dtype, + feature_size, + x.numel(), + scale, + bias); + + switch (kernel_variant) { +#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(_WIN32) + case LayerNormKernelVariant::FAST_LN_V2: + funcs::fast_ln_v2::LaunchNormFwd(dev_ctx, + stream, + place, + x_data, + void_scale_data, + void_bias_data, + y_data, + mean_data, + var_data, + scale_bias_dtype, + x_dtype, + y_dtype, + compute_dtype, + feature_size, + batch_size, + feature_size, + epsilon); + break; +#endif + case LayerNormKernelVariant::FAST_LN_V1: + if (is_scale_bias_same_dtype_with_x) { + switch (feature_size) { + PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD(T); + default: + break; + } + } else { + switch (feature_size) { + PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD(U); + default: + break; + } + } + break; + case LayerNormKernelVariant::GENERIC: + default: +#ifdef PADDLE_WITH_CUDA + if (FLAGS_use_accuracy_compatible_kernel || + (!isPowerOfTwo(feature_size) && feature_size > 1024)) { + LayerNormFwdCompatKernel( + dev_ctx, + 
x_data, + valid_scale ? static_cast(void_scale_data) : nullptr, + valid_bias ? static_cast(void_bias_data) : nullptr, + epsilon, + batch_size, + feature_size, + y_data, + mean_data, + var_data); + } else if (FLAGS_use_fast_math && feature_size <= 1024 && + (!std::is_same::value)) { + // WarpShuffle intrinsics is involved in LaunchLayerNormKernel. + LaunchLayerNormKernel(dev_ctx, + x_data, + y_data, + void_scale_data, + void_bias_data, + mean_data, + var_data, + epsilon, + batch_size, + feature_size, + valid_scale, + valid_bias, + is_scale_bias_same_dtype_with_x); + } else { +#endif + if (is_scale_bias_same_dtype_with_x) { + PADDLE_LAUNCH_LAYERNORM_FWD(T, true); + } else { + PADDLE_LAUNCH_LAYERNORM_FWD(U, false); + } +#ifdef PADDLE_WITH_CUDA + } +#endif + break; + } + +#undef PADDLE_LAUNCH_LAYERNORM_FWD +#undef PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD +} +#ifdef _WIN32 +template PADDLE_API void LayerNormKernel( + const GPUContext& dev_ctx, + const DenseTensor& x, + const optional& scale_opt, + const optional& bias_opt, + double epsilon, + int begin_norm_axis, + DenseTensor* y, + DenseTensor* mean, + DenseTensor* var); +template PADDLE_API void LayerNormKernel( + const GPUContext& dev_ctx, + const DenseTensor& x, + const optional& scale_opt, + const optional& bias_opt, + double epsilon, + int begin_norm_axis, + DenseTensor* y, + DenseTensor* mean, + DenseTensor* var); +template PADDLE_API void LayerNormKernel( + const GPUContext& dev_ctx, + const DenseTensor& x, + const optional& scale_opt, + const optional& bias_opt, + double epsilon, + int begin_norm_axis, + DenseTensor* y, + DenseTensor* mean, + DenseTensor* var); +#endif +} // namespace phi + +PD_REGISTER_PLUGIN_KERNEL(layer_norm, + metax_gpu, + ALL_LAYOUT, + phi::LayerNormKernel, + float, + double, + phi::float16) { + kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED); +} diff --git a/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h b/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h index a2c69b6adf..9de0b3aadd 100644 --- a/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h @@ -26,14 +26,11 @@ limitations under the License. 
*/
 // clang-format on
 namespace phi {
-template
-using PhiEigenTensor = EigenTensor;
+template
+using PhiEigenTensor = EigenTensor;
-using Array1 = Eigen::DSizes;
-using Array2 = Eigen::DSizes;
+using Array1 = Eigen::DSizes;
+using Array2 = Eigen::DSizes;
 template
 void AddmmKernel(const Context& dev_ctx,
diff --git a/backends/metax_gpu/kernels/impl/layer_norm_impl.cu.h b/backends/metax_gpu/kernels/impl/layer_norm_impl.cu.h
index 18e3401bcd..53c5aff887 100644
--- a/backends/metax_gpu/kernels/impl/layer_norm_impl.cu.h
+++ b/backends/metax_gpu/kernels/impl/layer_norm_impl.cu.h
@@ -33,6 +33,7 @@ namespace cub = hipcub;
 #include "paddle/phi/backends/gpu/gpu_dnn.h"
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/kernels/funcs/aligned_vector.h"
+#include "paddle/phi/kernels/funcs/fast_ln_v1.h"
 namespace phi {
 namespace funcs {

From 03b31332e649c6077bdf10a8fbe675ce8674c115 Mon Sep 17 00:00:00 2001
From: duqimeng <1640472053@qq.com>
Date: Mon, 30 Mar 2026 16:03:44 +0800
Subject: [PATCH 8/8] [Metax] Fix Bug

---
 .../custom_kernel/lstm_kernel_register.cu | 788 +-----------------
 1 file changed, 5 insertions(+), 783 deletions(-)

diff --git a/backends/metax_gpu/kernels/custom_kernel/lstm_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/lstm_kernel_register.cu
index 1b094198dc..15cd9b9238 100644
--- a/backends/metax_gpu/kernels/custom_kernel/lstm_kernel_register.cu
+++ b/backends/metax_gpu/kernels/custom_kernel/lstm_kernel_register.cu
@@ -1,4 +1,4 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,786 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-// #include "kernels/funcs/layer_norm_util.h" -#include "paddle/common/flags.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/layer_norm_kernel.h" -#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(_WIN32) -#include "paddle/phi/kernels/funcs/fast_ln_v2.h" -#endif -#include "funcs/layer_norm_util.h" -#include "impl/layer_norm_impl.cu.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/phi/kernels/gpu/rms_norm_cuda_kernel.h" -#endif -COMMON_DECLARE_bool(use_fast_math); +#include "kernels/impl/lstm_kernel_impl.h" +#include "paddle/phi/kernels/lstm_kernel.h" -namespace phi { - -enum class LayerNormKernelVariant { FAST_LN_V1, FAST_LN_V2, GENERIC }; - -#ifdef PADDLE_WITH_CUDA -template -__device__ inline void WelfordOnline(U val, U* mean, U* square, U* count) { - *count += 1; - U delta1 = val - *mean; - *mean += delta1 / (*count); - U delta2 = val - *mean; - *square += delta1 * delta2; -} - -template -__device__ inline void WelfordOnline( - U b_mean, U b_square, U b_cnt, U* mean, U* square, U* count) { - if (b_cnt == 0) { - return; - } - - U new_cnt = *count + b_cnt; - U nb_n = b_cnt / new_cnt; - U delta = b_mean - *mean; - *mean += delta * nb_n; - *square += b_square + delta * delta * (*count) * nb_n; - *count = new_cnt; -} - -template -__device__ inline void WelfordWarpAllReduce(U* mean, U* square, U* count) { - constexpr int kWarpSize = 32; -#pragma unroll - for (int mask = 1; mask < kWarpSize; mask *= 2) { - U b_mean = __shfl_down_sync(0xffffffff, *mean, mask); - U b_square = __shfl_down_sync(0xffffffff, *square, mask); - U b_cnt = __shfl_down_sync(0xffffffff, *count, mask); - WelfordOnline(b_mean, b_square, b_cnt, mean, square, count); - } - - *mean = __shfl_sync(0xffffffff, *mean, 0, kWarpSize); - *square = __shfl_sync(0xffffffff, *square, 0, kWarpSize); - *count = __shfl_sync(0xffffffff, *count, 0, kWarpSize); -} - -template -struct ThreadAssigner { - __device__ __forceinline__ int operator()(const int cols, - const int cols_per_thread, - int32_t* last_tid_idx) { - return cols_per_thread; - } -}; - -template <> -struct ThreadAssigner<1> { - __device__ inline int operator()(const int cols, - const int cols_per_thread, - int* last_tid_idx) { - int cols_this_thread = cols_per_thread; - int last_tid = (cols / cols_per_thread); - *last_tid_idx = last_tid; - if (threadIdx.x == last_tid) { - cols_this_thread = cols - cols_per_thread * last_tid; - } else if (threadIdx.x > last_tid) { - cols_this_thread = 0; - } - return cols_this_thread; - } -}; - -template -struct LayerNormDataReader { - __device__ inline void operator()(const T* __restrict__ row_src, - U* buffer, - const int last_tid_idx, - const int read_times, - const int cols_this_thread) { - using VecT = AlignedVector; - const VecT* __restrict__ v_src = - reinterpret_cast(row_src); - - for (int i = 0; i < read_times; ++i) { - VecT temp_src = v_src[threadIdx.x + i * blockDim.x]; -#pragma unroll - for (int j = 0; j < VecSize; ++j) { - buffer[i * VecSize + j] = static_cast(temp_src[j]); - } - } - } -}; - -template -struct LayerNormDataReader { - __device__ inline void operator()(const T* __restrict__ row_src, - U* buffer, - const int last_tid_idx, - const int read_times, - const int cols_this_thread) { - // read_time is just cols_per_thread while VecSize is 1. 
- if (threadIdx.x < last_tid_idx) { - for (int i = 0; i < cols_this_thread; ++i) { - buffer[i] = static_cast(row_src[threadIdx.x + last_tid_idx * i]); - } - } else { - for (int i = 0; i < cols_this_thread; ++i) { - buffer[i] = static_cast(row_src[i + read_times * last_tid_idx]); - } - } - } -}; - -template -struct LayerNormDataWriter { - __device__ inline void operator()( - T* __restrict__ row_dst, - const U* __restrict__ buffer, - const funcs::LayerNormScaleBiasT* __restrict__ scale, - const funcs::LayerNormScaleBiasT* __restrict__ bias, - const U row_mean, - const U row_inv_var, - const int write_times, - const int cols_this_thread, - const int last_tid_idx, - const bool valid_scale, - const bool valid_bias) { - using VecT = AlignedVector; - using ScaleT = funcs::LayerNormScaleBiasT; - using VecScaleT = AlignedVector; - VecT* v_dst = reinterpret_cast(row_dst); - - // cols_this_thread is just cols_per_thread - if ((!valid_scale) && (!valid_bias)) { - for (int i = 0; i < write_times; ++i) { - VecT temp_dst; -#pragma unroll - for (int j = 0; j < VecSize; ++j) { - temp_dst[j] = static_cast((buffer[i * VecSize + j] - row_mean) * - row_inv_var); - } - v_dst[threadIdx.x + static_cast(blockDim.x) * i] = temp_dst; - } - } else { - const VecScaleT* __restrict__ v_scale = - reinterpret_cast(scale); - const VecScaleT* __restrict__ v_bias = - reinterpret_cast(bias); - if (valid_scale && valid_bias) { - for (int i = 0; i < write_times; ++i) { - int64_t idx = threadIdx.x + static_cast(blockDim.x) * i; - VecT temp_dst; - VecScaleT temp_v_scale = v_scale[idx]; - VecScaleT temp_v_bias = v_bias[idx]; -#pragma unroll - for (int j = 0; j < VecSize; ++j) { - temp_dst[j] = static_cast( - static_cast(temp_v_scale[j]) * - (buffer[i * VecSize + j] - row_mean) * row_inv_var + - static_cast(temp_v_bias[j])); - } - v_dst[idx] = temp_dst; - } - } else { - if (valid_scale) { - for (int i = 0; i < write_times; ++i) { - int64_t idx = threadIdx.x + static_cast(blockDim.x) * i; - VecT temp_dst; - VecScaleT temp_v_scale = v_scale[idx]; -#pragma unroll - for (int j = 0; j < VecSize; ++j) { - temp_dst[j] = static_cast( - static_cast(temp_v_scale[j]) * - (buffer[i * VecSize + j] - row_mean) * row_inv_var); - } - v_dst[idx] = temp_dst; - } - } else { - for (int i = 0; i < write_times; ++i) { - int idx = threadIdx.x + blockDim.x * i; - VecT temp_dst; - VecScaleT temp_v_bias = v_bias[idx]; -#pragma unroll - for (int j = 0; j < VecSize; ++j) { - temp_dst[j] = static_cast( - (buffer[i * VecSize + j] - row_mean) * row_inv_var + - static_cast(temp_v_bias[j])); - } - v_dst[idx] = temp_dst; - } - } - } - } - } -}; - -template -struct LayerNormDataWriter { - __device__ __forceinline__ void operator()( - T* __restrict__ row_dst, - U* __restrict__ buffer, - const funcs::LayerNormScaleBiasT* __restrict__ scale, - const funcs::LayerNormScaleBiasT* __restrict__ bias, - const U row_mean, - const U row_inv_var, - const int write_times, - const int cols_this_thread, - const int last_tid_idx, - const bool valid_scale, - const bool valid_bias) { - // write_times is just col_per_thread. 
- if ((!valid_scale) && (!valid_bias)) { - if (threadIdx.x < last_tid_idx) { - for (int i = 0; i < cols_this_thread; ++i) { - row_dst[threadIdx.x + static_cast(last_tid_idx) * i] = - (buffer[i] - row_mean) * row_inv_var; - } - } else { - for (int i = 0; i < cols_this_thread; ++i) { - row_dst[static_cast(last_tid_idx) * write_times + i] = - (buffer[i] - row_mean) * row_inv_var; - } - } - } else if (valid_scale && valid_bias) { - if (threadIdx.x < last_tid_idx) { - for (int i = 0; i < cols_this_thread; ++i) { - int64_t idx = threadIdx.x + static_cast(last_tid_idx) * i; - row_dst[idx] = - static_cast(static_cast(scale[idx]) * - (buffer[i] - row_mean) * row_inv_var + - static_cast(bias[idx])); - } - } else { - for (int i = 0; i < cols_this_thread; ++i) { - int64_t idx = static_cast(last_tid_idx) * write_times + i; - row_dst[idx] = - static_cast(static_cast(scale[idx]) * - (buffer[i] - row_mean) * row_inv_var + - static_cast(bias[idx])); - } - } - } else { - if (valid_scale) { - if (threadIdx.x < last_tid_idx) { - for (int i = 0; i < cols_this_thread; ++i) { - int64_t idx = threadIdx.x + static_cast(last_tid_idx) * i; - row_dst[idx] = static_cast(static_cast(scale[idx]) * - (buffer[i] - row_mean) * row_inv_var); - } - } else { - for (int i = 0; i < cols_this_thread; ++i) { - int64_t idx = static_cast(last_tid_idx) * write_times + i; - row_dst[idx] = static_cast(static_cast(scale[idx]) * - (buffer[i] - row_mean) * row_inv_var); - } - } - } else { - if (threadIdx.x < last_tid_idx) { - for (int i = 0; i < cols_this_thread; ++i) { - int64_t idx = threadIdx.x + static_cast(last_tid_idx) * i; - row_dst[idx] = static_cast((buffer[i] - row_mean) * row_inv_var + - static_cast(bias[idx])); - } - } else { - for (int i = 0; i < cols_this_thread; ++i) { - int64_t idx = static_cast(last_tid_idx) * write_times + i; - row_dst[idx] = static_cast((buffer[i] - row_mean) * row_inv_var + - static_cast(bias[idx])); - } - } - } - } - } -}; - -template -__global__ void LayerNormFwdWithWelford( - const T* __restrict__ src_data, - T* dst_data, - const funcs::LayerNormScaleBiasT* __restrict__ scale, - const funcs::LayerNormScaleBiasT* __restrict__ bias, - U* mean, - U* var, - const U epsilon, - const IndexT rows, - const int32_t cols, - const int32_t cols_per_thread, - const bool valid_scale, - const bool valid_bias) { - constexpr int kWarpSize = 32; - int last_tid_idx = 0; // For condition once vecSize is 1. - IndexT row_offset = blockIdx.x * blockDim.y + threadIdx.y; - int cols_this_thread = - ThreadAssigner()(cols, cols_per_thread, &last_tid_idx); - int read_times = cols_per_thread / VecSize; - - if (row_offset < rows) { - U buffer[kWarpSize]; - U tid_cnt = static_cast(0); - U tid_mean = static_cast(0); - U tid_square = static_cast(0); - - const T* __restrict__ row_src = src_data + row_offset * cols; - T* row_dst = dst_data + row_offset * cols; - LayerNormDataReader()( - row_src, buffer, last_tid_idx, read_times, cols_this_thread); - - for (int i = 0; i < cols_this_thread; i++) { - WelfordOnline(buffer[i], &tid_mean, &tid_square, &tid_cnt); - } - - U warp_cnt = tid_cnt; - U warp_mean = tid_mean; - U warp_square = tid_square; - WelfordWarpAllReduce(&warp_mean, &warp_square, &warp_cnt); - - U row_variance = max(warp_square / warp_cnt, 0.f); - U row_inv_var = funcs::rsqrt_(row_variance + epsilon); - - // TODO(limingshu): make code below vectorization. - if (threadIdx.x == 0) { - // warp_mean is just row_mean here. 
- mean[row_offset] = warp_mean; - var[row_offset] = row_variance; - } - LayerNormDataWriter()(row_dst, - buffer, - scale, - bias, - warp_mean, - row_inv_var, - read_times, - cols_this_thread, - last_tid_idx, - valid_scale, - valid_bias); - } -} - -template -void LaunchLayerNormKernel(const Context& dev_ctx, - const T* x_data, - T* y_data, - const void* void_scale_data, - const void* void_bias_data, - U* mean_data, - U* var_data, - double epsilon, - const int64_t rows, - const int cols, - const bool valid_scale, - const bool valid_bias, - const bool is_same_type) { - constexpr int WarpSize = 32; - constexpr int RowPerBlock = 4; - int64_t block_size = (rows + (RowPerBlock - 1)) / RowPerBlock; - dim3 threads(WarpSize, RowPerBlock, 1); - - int vec_size = 1; - int cols_per_thread = (cols + (WarpSize - 1)) / WarpSize; - if (cols_per_thread > 1 && (cols % WarpSize == 0)) { - int data_vec_size = 0; - uint64_t addr = (reinterpret_cast(x_data) | - reinterpret_cast(y_data)); - if (valid_bias || valid_scale) { - if (is_same_type) { - addr = valid_scale - ? (addr | reinterpret_cast(void_scale_data)) - : addr; - addr = valid_bias ? (addr | reinterpret_cast(void_bias_data)) - : addr; - data_vec_size = - std::min(4, phi::GetVectorizedSize(reinterpret_cast(addr))); - } else { - uint64_t bias_addr = reinterpret_cast(void_bias_data); - uint64_t attr_addr = valid_scale - ? reinterpret_cast(void_scale_data) - : bias_addr; - attr_addr = valid_bias - ? (valid_scale ? (attr_addr | bias_addr) : attr_addr) - : attr_addr; - data_vec_size = std::min( - phi::GetVectorizedSize(reinterpret_cast(addr)), - phi::GetVectorizedSize(reinterpret_cast(attr_addr))); - data_vec_size = std::min(4, data_vec_size); - } - } - for (int size = data_vec_size; size > 0; size /= 2) { - if (cols_per_thread % size == 0) { - vec_size = size; - break; - } - } - } - -#define IMPL_LAYER_NORM_WELFORD_CASE(index_t, scale_t, is_same_, vec_size_) \ - case (vec_size_): { \ - LayerNormFwdWithWelford \ - <<>>( \ - x_data, \ - y_data, \ - static_cast(void_scale_data), \ - static_cast(void_bias_data), \ - mean_data, \ - var_data, \ - static_cast(epsilon), \ - rows, \ - cols, \ - cols_per_thread, \ - valid_scale, \ - valid_bias); \ - } break - -#define IMPL_LAYER_NORM_WELFORD(index_t, scale_t, is_same_) \ - IMPL_LAYER_NORM_WELFORD_CASE(index_t, scale_t, is_same_, 4); \ - IMPL_LAYER_NORM_WELFORD_CASE(index_t, scale_t, is_same_, 2); \ - IMPL_LAYER_NORM_WELFORD_CASE(index_t, scale_t, is_same_, 1); - - if (rows < std::numeric_limits::max()) { - if (is_same_type) { - switch (vec_size) { IMPL_LAYER_NORM_WELFORD(int32_t, T, true); } - } else { - switch (vec_size) { IMPL_LAYER_NORM_WELFORD(int32_t, U, false); } - } - } else { - if (is_same_type) { - switch (vec_size) { IMPL_LAYER_NORM_WELFORD(int64_t, T, true); } - } else { - switch (vec_size) { IMPL_LAYER_NORM_WELFORD(int64_t, U, false); } - } - } -#undef IMPL_LAYER_NORM_WELFORD_CASE -#undef IMPL_LAYER_NORM_WELFORD -} -#endif // PADDLE_WITH_CUDA - -template -void LayerNormDirectCUDAFunctor::operator()( - gpuStream_t stream, - const T* input, - std::vector input_shape, - const U* bias, - const U* scale, - T* output, - U* mean, - U* variance, - int begin_norm_axis, - float eps) { - const auto x_dims = make_ddim(input_shape); - auto matrix_dim = common::flatten_to_2d(x_dims, begin_norm_axis); - int64_t batch_size = matrix_dim[0]; - int64_t feature_size = matrix_dim[1]; - // TODO(large-tensor): generic kernel launch uses int32 grid dim - PADDLE_ENFORCE_LE_INT_MAX(batch_size, "batch_size"); - switch 
(funcs::GetDesiredBlockDim(feature_size)) { - FIXED_BLOCK_DIM_CASE( - funcs::LayerNormForward - <<>>( - input, scale, bias, output, mean, variance, eps, feature_size)); - default: - PADDLE_THROW(common::errors::InvalidArgument( - "Product from begin_norm_axis to end in layer_norm must be larger " - "than 1")); - break; - } -} - -template class PADDLE_API LayerNormDirectCUDAFunctor; -template class PADDLE_API LayerNormDirectCUDAFunctor; -#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) -template class PADDLE_API LayerNormDirectCUDAFunctor; -#endif -static inline LayerNormKernelVariant LayerNormKernelDispatch( - const paddle::DataType weight_type, - const paddle::DataType input_type, - const paddle::DataType output_type, - const paddle::DataType compute_type, - const uint32_t hidden_size, - const int64_t x_numel, - const DenseTensor* scale, - const DenseTensor* bias) { - if (scale == nullptr || bias == nullptr) { - return LayerNormKernelVariant::GENERIC; - } -#ifdef PADDLE_WITH_CUDA - if (FLAGS_use_accuracy_compatible_kernel) { - return LayerNormKernelVariant::GENERIC; - } -#endif -#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(_WIN32) - if (input_type != paddle::DataType::FLOAT32 && hidden_size != 4096 && - hidden_size > 1024 && hidden_size <= 10240 && - x_numel <= std::numeric_limits::max()) { - // using fast_ln_v2 only sm > 70 and x_numel <= uint32_max - auto prop = funcs::fast_ln_v2::GetDeviceProp(); - if (prop->major > 7 && - funcs::fast_ln_v2::has_fast_ln_v2_fwd_kernel( - weight_type, input_type, output_type, compute_type, hidden_size)) { - return LayerNormKernelVariant::FAST_LN_V2; - } - } -#endif - if ((hidden_size >= 768 && hidden_size <= 2048 && hidden_size % 256 == 0 || - hidden_size == 4096) && - x_numel <= std::numeric_limits::max() && scale != nullptr && - bias != nullptr) { - return LayerNormKernelVariant::FAST_LN_V1; - } - - return LayerNormKernelVariant::GENERIC; -} - -template -void LayerNormKernel(const Context& dev_ctx, - const DenseTensor& x, - const optional& scale_opt, - const optional& bias_opt, - double epsilon, - int begin_norm_axis, - DenseTensor* y, - DenseTensor* mean, - DenseTensor* var) { - using U = funcs::LayerNormParamType; - auto* scale = scale_opt.get_ptr(); - auto* bias = bias_opt.get_ptr(); - - const auto x_dims = x.dims(); - auto* x_data = x.data(); - auto* y_data = dev_ctx.template Alloc(y); - auto* mean_data = dev_ctx.template Alloc(mean); - auto* var_data = dev_ctx.template Alloc(var); - if (x.numel() == 0) return; - - bool valid_scale = (scale != nullptr); - bool valid_bias = (bias != nullptr); - auto* void_scale_data = valid_scale ? scale->data() : nullptr; - auto* void_bias_data = valid_bias ? bias->data() : nullptr; - - auto x_dtype = x.dtype(); - auto y_dtype = y->dtype(); - phi::DataType scale_bias_dtype; - if (valid_scale) { - scale_bias_dtype = scale->dtype(); - if (valid_bias) { - PADDLE_ENFORCE_EQ(scale->dtype(), - bias->dtype(), - common::errors::InvalidArgument( - "This Scale and Bias of layer_norm op " - "should have the same data type.")); - } - } else { - scale_bias_dtype = valid_bias ? 
bias->dtype() : x_dtype; - } - - bool is_scale_bias_same_dtype_with_x = x_dtype == scale_bias_dtype; - if (!is_scale_bias_same_dtype_with_x) { - PADDLE_ENFORCE_EQ(scale_bias_dtype, - phi::CppTypeToDataType::Type(), - common::errors::InvalidArgument( - "Unsupported data type of Scale and Bias")); - } - - auto matrix_dim = common::flatten_to_2d(x_dims, begin_norm_axis); - int64_t batch_size = matrix_dim[0]; - // TODO(large-tensor): generic kernel launch uses int32 grid dim - PADDLE_ENFORCE_LE_INT_MAX(batch_size, "batch_size"); - int64_t feature_size = matrix_dim[1]; - auto stream = dev_ctx.stream(); - auto place = x.place(); - -#define PADDLE_LAUNCH_LAYERNORM_FWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ - do { \ - switch (funcs::GetDesiredBlockDim(feature_size)) { \ - FIXED_BLOCK_DIM_CASE( \ - funcs::LayerNormForward \ - <<>>( \ - x_data, \ - static_cast(void_scale_data), \ - static_cast(void_bias_data), \ - y_data, \ - mean_data, \ - var_data, \ - epsilon, \ - feature_size)); \ - default: \ - PADDLE_THROW(common::errors::InvalidArgument( \ - "Product from begin_norm_axis to end must be larger than 1")); \ - break; \ - } \ - } while (0) - -#define PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, feature_size) \ - case (feature_size): { \ - constexpr int WARPS_N = feature_size < 1024 ? 1 : (feature_size / 1024); \ - constexpr int WARPS_M = 4 / WARPS_N; \ - const int THREADS_PER_WARP = 32; \ - const int BYTES_PER_LDG = 16; \ - const int VecSize = BYTES_PER_LDG / sizeof(T); \ - const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; \ - const int ROWS_PER_CTA = WARPS_M; \ - const int grid = static_cast( \ - std::ceil(batch_size / static_cast(ROWS_PER_CTA))); \ - funcs::fast_ln_v1::fast_ln_v1_fwd_kernel \ - <<>>( \ - batch_size, \ - feature_size, \ - epsilon, \ - x_data, \ - static_cast(void_scale_data), \ - static_cast(void_bias_data), \ - mean_data, \ - var_data, \ - y_data); \ - } break - -#define PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD(ScaleT) \ - PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, 768); \ - PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, 1024); \ - PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, 1280); \ - PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, 1536); \ - PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, 1792); \ - PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, 2048); \ - PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, 4096) - auto compute_dtype = phi::CppTypeToDataType::Type(); - auto kernel_variant = LayerNormKernelDispatch(scale_bias_dtype, - x_dtype, - y_dtype, - compute_dtype, - feature_size, - x.numel(), - scale, - bias); - - switch (kernel_variant) { -#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(_WIN32) - case LayerNormKernelVariant::FAST_LN_V2: - funcs::fast_ln_v2::LaunchNormFwd(dev_ctx, - stream, - place, - x_data, - void_scale_data, - void_bias_data, - y_data, - mean_data, - var_data, - scale_bias_dtype, - x_dtype, - y_dtype, - compute_dtype, - feature_size, - batch_size, - feature_size, - epsilon); - break; -#endif - case LayerNormKernelVariant::FAST_LN_V1: - if (is_scale_bias_same_dtype_with_x) { - switch (feature_size) { - PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD(T); - default: - break; - } - } else { - switch (feature_size) { - PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD(U); - default: - break; - } - } - break; - case LayerNormKernelVariant::GENERIC: - default: -#ifdef PADDLE_WITH_CUDA - if (FLAGS_use_accuracy_compatible_kernel || - (!isPowerOfTwo(feature_size) && feature_size > 1024)) { - LayerNormFwdCompatKernel( - dev_ctx, - 
x_data, - valid_scale ? static_cast(void_scale_data) : nullptr, - valid_bias ? static_cast(void_bias_data) : nullptr, - epsilon, - batch_size, - feature_size, - y_data, - mean_data, - var_data); - } else if (FLAGS_use_fast_math && feature_size <= 1024 && - (!std::is_same::value)) { - // WarpShuffle intrinsics is involved in LaunchLayerNormKernel. - LaunchLayerNormKernel(dev_ctx, - x_data, - y_data, - void_scale_data, - void_bias_data, - mean_data, - var_data, - epsilon, - batch_size, - feature_size, - valid_scale, - valid_bias, - is_scale_bias_same_dtype_with_x); - } else { -#endif - if (is_scale_bias_same_dtype_with_x) { - PADDLE_LAUNCH_LAYERNORM_FWD(T, true); - } else { - PADDLE_LAUNCH_LAYERNORM_FWD(U, false); - } -#ifdef PADDLE_WITH_CUDA - } -#endif - break; - } - -#undef PADDLE_LAUNCH_LAYERNORM_FWD -#undef PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD -} -#ifdef _WIN32 -template PADDLE_API void LayerNormKernel( - const GPUContext& dev_ctx, - const DenseTensor& x, - const optional& scale_opt, - const optional& bias_opt, - double epsilon, - int begin_norm_axis, - DenseTensor* y, - DenseTensor* mean, - DenseTensor* var); -template PADDLE_API void LayerNormKernel( - const GPUContext& dev_ctx, - const DenseTensor& x, - const optional& scale_opt, - const optional& bias_opt, - double epsilon, - int begin_norm_axis, - DenseTensor* y, - DenseTensor* mean, - DenseTensor* var); -template PADDLE_API void LayerNormKernel( - const GPUContext& dev_ctx, - const DenseTensor& x, - const optional& scale_opt, - const optional& bias_opt, - double epsilon, - int begin_norm_axis, - DenseTensor* y, - DenseTensor* mean, - DenseTensor* var); -#endif -} // namespace phi - -PD_REGISTER_PLUGIN_KERNEL(layer_norm, - metax_gpu, - ALL_LAYOUT, - phi::LayerNormKernel, - float, - double, - phi::float16) { - kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); - kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED); -} +PD_REGISTER_PLUGIN_KERNEL( + lstm, metax_gpu, ALL_LAYOUT, phi::LSTMKernel, float, double) {}
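
Note on the removed LayerNormFwdWithWelford path: the two WelfordOnline overloads deleted above implement Welford's online mean/variance update and the merge of two partial aggregates (the merge is what the warp-level __shfl_down_sync reduction applies lane by lane). The following host-side C++ sketch reproduces the same two update rules for reference; it is illustrative only, all names in it (Welford, Add, Merge) are made up for this note, and nothing here ships with the backend.

// welford_sketch.cc -- illustrative only; not part of the patch.
#include <cmath>
#include <cstdio>
#include <initializer_list>

struct Welford {
  double mean = 0.0;
  double m2 = 0.0;    // sum of squared deviations from the current mean
  double count = 0.0;

  // Single-value update, mirroring the scalar WelfordOnline(val, ...).
  void Add(double val) {
    count += 1.0;
    const double delta1 = val - mean;
    mean += delta1 / count;
    const double delta2 = val - mean;
    m2 += delta1 * delta2;
  }

  // Merge another partial aggregate, mirroring
  // WelfordOnline(b_mean, b_square, b_cnt, ...).
  void Merge(const Welford& b) {
    if (b.count == 0.0) return;
    const double new_count = count + b.count;
    const double nb_n = b.count / new_count;
    const double delta = b.mean - mean;
    mean += delta * nb_n;
    m2 += b.m2 + delta * delta * count * nb_n;
    count = new_count;
  }

  double Variance() const { return count > 0.0 ? m2 / count : 0.0; }
};

int main() {
  // Two "lanes" each see half of an 8-element row, then merge,
  // the way the 32 lanes of a warp combine in WelfordWarpAllReduce.
  Welford lane0, lane1;
  for (double v : {1.0, 2.0, 3.0, 4.0}) lane0.Add(v);
  for (double v : {5.0, 6.0, 7.0, 8.0}) lane1.Add(v);
  lane0.Merge(lane1);
  const double eps = 1e-5;
  std::printf("mean=%.4f var=%.4f inv_std=%.4f\n", lane0.mean,
              lane0.Variance(), 1.0 / std::sqrt(lane0.Variance() + eps));
  // Expected: mean=4.5, var=5.25 (population variance of 1..8).
  return 0;
}

The kernel then uses the merged population variance with rsqrt(var + epsilon), exactly as the final printf above does.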
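A second reference sketch, again illustrative and not part of the patch: LaunchLayerNormKernel (also removed above) picks its vector width by OR-ing the data pointers, so the combined address only keeps the alignment that every buffer shares, and then taking the widest of 4/2/1 elements that divides both that alignment and the per-thread column count. GetVectorizedSizeForDemo below is a hypothetical, alignment-only stand-in for phi::GetVectorizedSize, not the real helper.

// vec_size_sketch.cc -- illustrative only; not part of the patch.
#include <cstdint>
#include <cstdio>

template <typename T>
int GetVectorizedSizeForDemo(std::uintptr_t addr) {
  if (addr % (4 * sizeof(T)) == 0) return 4;  // e.g. 16-byte loads for float
  if (addr % (2 * sizeof(T)) == 0) return 2;
  return 1;
}

template <typename T>
int PickVecSize(const T* x, const T* y, int cols_per_thread) {
  // OR-ing the addresses keeps only the alignment shared by all buffers.
  const std::uintptr_t combined = reinterpret_cast<std::uintptr_t>(x) |
                                  reinterpret_cast<std::uintptr_t>(y);
  int vec_size = 1;
  for (int size = GetVectorizedSizeForDemo<T>(combined); size > 0; size /= 2) {
    // The per-thread column count must also be divisible by the width.
    if (cols_per_thread % size == 0) {
      vec_size = size;
      break;
    }
  }
  return vec_size;
}

int main() {
  alignas(16) static float x[256];
  alignas(16) static float y[256];
  std::printf("aligned buffers:   vec_size=%d\n", PickVecSize(x, y, 8));
  std::printf("offset by 1 float: vec_size=%d\n", PickVecSize(x + 1, y, 8));
  return 0;
}

With both buffers 16-byte aligned this selects 4-wide loads; offsetting one buffer by a single float drops the shared alignment and forces scalar loads, which is the same fallback behaviour the removed kernel relies on.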