From 009626c9f8effd13a17085e1a45c27e90d0491be Mon Sep 17 00:00:00 2001
From: Carl Philipp Klemm
Date: Wed, 13 Aug 2025 11:31:33 +0200
Subject: [PATCH 1/4] CUDA/HIP: add explicit conversion operator to support
 older versions of rocm

---
 ggml/src/ggml-cuda/convert.cu    |  6 ++---
 ggml/src/ggml-cuda/convert.cuh   | 39 ++++++++++++++++++++++++++++++++
 ggml/src/ggml-cuda/cpy-utils.cuh |  3 ++-
 ggml/src/ggml-cuda/getrows.cu    |  7 +++---
 ggml/src/ggml-cuda/mmvf.cu       |  5 ++--
 5 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
index e3beddbc1b23b..7690812a649de 100644
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -31,8 +31,8 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y,
     dequantize_kernel(vx, ib, iqs, v);
 
     const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs;
-    y[iy0 + 0]        = float(v.x);
-    y[iy0 + y_offset] = float(v.y);
+    y[iy0 + 0]        = ggml_cuda_convert_val<dfloat, dst_t>(v.x);
+    y[iy0 + y_offset] = ggml_cuda_convert_val<dfloat, dst_t>(v.y);
 }
 
 template <bool need_check>
@@ -630,7 +630,7 @@ static __global__ void convert_unary(
     const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
     const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
 
-    y[iy] = float(x[ix]);
+    y[iy] = ggml_cuda_convert_val<src_t, dst_t>(x[ix]);
 }
 
 template <typename src_t, typename dst_t>
diff --git a/ggml/src/ggml-cuda/convert.cuh b/ggml/src/ggml-cuda/convert.cuh
index f04214be175ba..35da9b39a0af9 100644
--- a/ggml/src/ggml-cuda/convert.cuh
+++ b/ggml/src/ggml-cuda/convert.cuh
@@ -29,3 +29,42 @@ typedef to_t_nc_cuda_t<nv_bfloat16> to_bf16_nc_cuda_t;
 to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type);
 to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
 to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type);
+
+template <typename src_t, typename dest_t>
+ __host__ __device__ inline dest_t ggml_cuda_convert_val(src_t x) {
+    if constexpr (std::is_same_v<src_t, dest_t>) {
+        return x;
+    } else {
+        return float(x);
+    }
+}
+
+template<>
+__host__ __device__ inline float ggml_cuda_convert_val(nv_bfloat16 x) {
+    return __bfloat162float(x);
+}
+
+template<>
+__host__ __device__ inline nv_bfloat16 ggml_cuda_convert_val(float x) {
+    return __float2bfloat16(x);
+}
+
+template<>
+__host__ __device__ inline half ggml_cuda_convert_val(nv_bfloat16 x) {
+    return half(__bfloat162float(x));
+}
+
+template<>
+__host__ __device__ inline nv_bfloat16 ggml_cuda_convert_val(half x) {
+    return __float2bfloat16(float(x));
+}
+
+template<>
+__host__ __device__ inline int ggml_cuda_convert_val(nv_bfloat16 x) {
+    return int(__bfloat162float(x));
+}
+
+template<>
+__host__ __device__ inline nv_bfloat16 ggml_cuda_convert_val(int x) {
+    return __float2bfloat16(float(x));
+}
diff --git a/ggml/src/ggml-cuda/cpy-utils.cuh b/ggml/src/ggml-cuda/cpy-utils.cuh
index 410c12b7ba56b..603af215f27d9 100644
--- a/ggml/src/ggml-cuda/cpy-utils.cuh
+++ b/ggml/src/ggml-cuda/cpy-utils.cuh
@@ -1,13 +1,14 @@
 #pragma once
 
 #include "ggml-common.h"
+#include "convert.cuh"
 
 template <typename src_t, typename dst_t>
 static __device__ __forceinline__ void convert_flt(const src_t * src, dst_t * dst) {
     if constexpr (std::is_same_v<src_t, dst_t>) {
         *dst = *src;
     } else {
-        *dst = float(*src);
+        *dst = ggml_cuda_convert_val<src_t, dst_t>(*src);
     }
 }
 
diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu
index f77b2629a19b0..24daf7ded464d 100644
--- a/ggml/src/ggml-cuda/getrows.cu
+++ b/ggml/src/ggml-cuda/getrows.cu
@@ -1,5 +1,6 @@
 #include "getrows.cuh"
 #include "dequantize.cuh"
+#include "convert.cuh"
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static __global__ void k_get_rows(
@@ -34,8 +35,8 @@ static __global__ void k_get_rows(
         dfloat2 v;
         dequantize_kernel(src0_row, ib, iqs, v);
 
-        dst_row[iybs + iqs + 0]        = float(v.x);
-        dst_row[iybs + iqs + y_offset] = float(v.y);
+        dst_row[iybs + iqs + 0]        = ggml_cuda_convert_val<dfloat, dst_t>(v.x);
+        dst_row[iybs + iqs + y_offset] = ggml_cuda_convert_val<dfloat, dst_t>(v.y);
 }
 
 template <typename src0_t, typename dst_t>
@@ -62,7 +63,7 @@ static __global__ void k_get_rows_float(
     dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
     const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03);
 
-    dst_row[i00] = float(src0_row[i00]);
+    dst_row[i00] = ggml_cuda_convert_val<src0_t, dst_t>(src0_row[i00]);
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu
index 1ad4bc75ba614..6628313ad035f 100644
--- a/ggml/src/ggml-cuda/mmvf.cu
+++ b/ggml/src/ggml-cuda/mmvf.cu
@@ -1,5 +1,6 @@
 #include "ggml.h"
 #include "common.cuh"
+#include "convert.cuh"
 #include "mmvf.cuh"
 
 template <typename T, typename type_acc, int ncols_dst, int block_size>
@@ -93,8 +94,8 @@ static __global__ void mul_mat_vec_f(
 #pragma unroll
             for (int j = 0; j < ncols_dst; ++j) {
                 const float2 tmpy = y2[j*stride_col_y2 + col2];
-                sumf[j] += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
-                sumf[j] += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
+                sumf[j] += ggml_cuda_convert_val<nv_bfloat16, float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
+                sumf[j] += ggml_cuda_convert_val<nv_bfloat16, float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
             }
         }
     } else {
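Note: the core of this first patch is the ggml_cuda_convert_val helper added to convert.cuh. On the older ROCm releases named in the subject, the bf16 type does not provide the conversion operators that a plain float(x) cast relies on, so conversions are funneled through a template whose bf16 cases call the conversion intrinsics explicitly. A minimal self-contained sketch of the same pattern, compiled under nvcc against CUDA's cuda_bf16.h rather than the ggml headers (the names here are illustrative, not the ggml ones):

```cpp
#include <cuda_bf16.h>
#include <type_traits>

// Generic path: identical types pass through, everything else goes via float.
template <typename src_t, typename dest_t>
__host__ __device__ inline dest_t convert_val(src_t x) {
    if constexpr (std::is_same_v<src_t, dest_t>) {
        return x;
    } else {
        return float(x); // needs src_t -> float and float -> dest_t conversions
    }
}

// The bf16 cases bypass the generic path and use the intrinsics directly,
// so no conversion operator on the bf16 type is required.
template <>
__host__ __device__ inline float convert_val(nv_bfloat16 x) {
    return __bfloat162float(x);
}

template <>
__host__ __device__ inline nv_bfloat16 convert_val(float x) {
    return __float2bfloat16(x);
}
```

With this shape, call sites have to spell out both template arguments, e.g. convert_val<nv_bfloat16, float>(x); patch 4 below reworks the argument order so that only the destination type has to be named.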
From e9ff641affbd447f6c8e99375825ed20f8120836 Mon Sep 17 00:00:00 2001
From: Carl Philipp Klemm
Date: Thu, 14 Aug 2025 11:45:33 +0200
Subject: [PATCH 2/4] HIP: Cleanup hipification header

Switch over to hip_bf16 from the legacy hip_bfloat16 header.
Simplify the RDNA3 define.
Lower the switchover point to the new hipBLAS API to ROCm 6.5, as that
version is already used for the ROCm 7.0 previews.
---
 ggml/src/ggml-cuda/vendors/hip.h | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index ec1b59caafc9a..6e9c67aca096e 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -4,7 +4,7 @@
 #include <hip/hip_runtime.h>
 #include <hipblas/hipblas.h>
 #include <hip/hip_fp16.h>
-#include <hip/hip_bfloat16.h>
+#include <hip/hip_bf16.h>
 
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
@@ -135,7 +135,7 @@
 #define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
 #define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
 
-#if HIP_VERSION >= 70000000
+#if HIP_VERSION >= 60500000
 #define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F
@@ -147,7 +147,7 @@
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
 #define cublasComputeType_t hipblasDatatype_t
 #define cudaDataType_t hipblasDatatype_t
-#endif // HIP_VERSION >= 7000000
+#endif // HIP_VERSION >= 60500000
 
 #if !defined(__HIP_PLATFORM_AMD__)
 #error "The HIP backend supports only AMD targets"
@@ -179,8 +179,7 @@
 #define RDNA4
 #endif
 
-#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
-    defined(__gfx1150__) || defined(__gfx1151__)
+#if defined(__GFX11__)
 #define RDNA3
 #endif
 
@@ -197,8 +196,8 @@
 #define __has_builtin(x) 0
 #endif
 
-typedef hip_bfloat16 nv_bfloat16;
-typedef short2 nv_bfloat162; // FIXME there is no 2x BF16 type being defined in bfloat16.h, ad-hoc compilation fix
+typedef __hip_bfloat16 nv_bfloat16;
+typedef __hip_bfloat162 nv_bfloat162;
 
 typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
 typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
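Note: the 60500000 cutoff above is easy to misread. HIP encodes HIP_VERSION as major*10000000 + minor*100000 + patch (see HIP's hip_version.h), so the ROCm 6.5 cutoff is 60500000. A tiny compile-time check of the threshold logic; the hip_version helper below is illustrative only, not a HIP API:

```cpp
// Sketch: verify the HIP_VERSION cutoff arithmetic used in hip.h.
// hip_version() mirrors HIP's version encoding; it is not a HIP header function.
constexpr long hip_version(long major, long minor, long patch) {
    return major * 10000000 + minor * 100000 + patch;
}

static_assert(hip_version(6, 5, 0) == 60500000, "ROCm 6.5 is the cutoff");
static_assert(hip_version(7, 0, 0) >  60500000, "ROCm 7.0 previews take the new hipBLAS API path");
static_assert(hip_version(6, 4, 1) <  60500000, "older ROCm keeps the hipblasDatatype_t path");
```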
From 173df5e4a8b4639f8b8dc093feb275b96aab22e2 Mon Sep 17 00:00:00 2001
From: Johannes Gäßler
Date: Thu, 14 Aug 2025 12:37:50 +0200
Subject: [PATCH 3/4] convert_val -> cast

---
 ggml/src/ggml-cuda/convert.cu    |  6 +++---
 ggml/src/ggml-cuda/convert.cuh   | 14 +++++++-------
 ggml/src/ggml-cuda/cpy-utils.cuh |  6 +-----
 ggml/src/ggml-cuda/getrows.cu    |  6 +++---
 ggml/src/ggml-cuda/mmvf.cu       |  4 ++--
 5 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
index 7690812a649de..dbd9f608f134d 100644
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -31,8 +31,8 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y,
     dequantize_kernel(vx, ib, iqs, v);
 
     const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs;
-    y[iy0 + 0]        = ggml_cuda_convert_val<dfloat, dst_t>(v.x);
-    y[iy0 + y_offset] = ggml_cuda_convert_val<dfloat, dst_t>(v.y);
+    y[iy0 + 0]        = ggml_cuda_cast<dfloat, dst_t>(v.x);
+    y[iy0 + y_offset] = ggml_cuda_cast<dfloat, dst_t>(v.y);
 }
 
 template <bool need_check>
@@ -630,7 +630,7 @@ static __global__ void convert_unary(
     const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
     const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
 
-    y[iy] = ggml_cuda_convert_val<src_t, dst_t>(x[ix]);
+    y[iy] = ggml_cuda_cast<src_t, dst_t>(x[ix]);
 }
 
 template <typename src_t, typename dst_t>
diff --git a/ggml/src/ggml-cuda/convert.cuh b/ggml/src/ggml-cuda/convert.cuh
index 35da9b39a0af9..0a812dd06787e 100644
--- a/ggml/src/ggml-cuda/convert.cuh
+++ b/ggml/src/ggml-cuda/convert.cuh
@@ -31,7 +31,7 @@ to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
 to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type);
 
 template <typename src_t, typename dest_t>
- __host__ __device__ inline dest_t ggml_cuda_convert_val(src_t x) {
+ __host__ __device__ inline dest_t ggml_cuda_cast(src_t x) {
     if constexpr (std::is_same_v<src_t, dest_t>) {
         return x;
     } else {
@@ -40,31 +40,31 @@ template <typename src_t, typename dest_t>
 }
 
 template<>
-__host__ __device__ inline float ggml_cuda_convert_val(nv_bfloat16 x) {
+__host__ __device__ inline float ggml_cuda_cast(nv_bfloat16 x) {
     return __bfloat162float(x);
 }
 
 template<>
-__host__ __device__ inline nv_bfloat16 ggml_cuda_convert_val(float x) {
+__host__ __device__ inline nv_bfloat16 ggml_cuda_cast(float x) {
     return __float2bfloat16(x);
 }
 
 template<>
-__host__ __device__ inline half ggml_cuda_convert_val(nv_bfloat16 x) {
+__host__ __device__ inline half ggml_cuda_cast(nv_bfloat16 x) {
     return half(__bfloat162float(x));
 }
 
 template<>
-__host__ __device__ inline nv_bfloat16 ggml_cuda_convert_val(half x) {
+__host__ __device__ inline nv_bfloat16 ggml_cuda_cast(half x) {
     return __float2bfloat16(float(x));
 }
 
 template<>
-__host__ __device__ inline int ggml_cuda_convert_val(nv_bfloat16 x) {
+__host__ __device__ inline int ggml_cuda_cast(nv_bfloat16 x) {
     return int(__bfloat162float(x));
 }
 
 template<>
-__host__ __device__ inline nv_bfloat16 ggml_cuda_convert_val(int x) {
+__host__ __device__ inline nv_bfloat16 ggml_cuda_cast(int x) {
     return __float2bfloat16(float(x));
 }
diff --git a/ggml/src/ggml-cuda/cpy-utils.cuh b/ggml/src/ggml-cuda/cpy-utils.cuh
index 603af215f27d9..a893a5e73a091 100644
--- a/ggml/src/ggml-cuda/cpy-utils.cuh
+++ b/ggml/src/ggml-cuda/cpy-utils.cuh
@@ -5,11 +5,7 @@
 
 template <typename src_t, typename dst_t>
 static __device__ __forceinline__ void convert_flt(const src_t * src, dst_t * dst) {
-    if constexpr (std::is_same_v<src_t, dst_t>) {
-        *dst = *src;
-    } else {
-        *dst = ggml_cuda_convert_val<src_t, dst_t>(*src);
-    }
+    *dst = ggml_cuda_cast<src_t, dst_t>(*src);
 }
 
 static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) {
diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu
index 24daf7ded464d..f90a371fac66b 100644
--- a/ggml/src/ggml-cuda/getrows.cu
+++ b/ggml/src/ggml-cuda/getrows.cu
@@ -35,8 +35,8 @@ static __global__ void k_get_rows(
         dfloat2 v;
         dequantize_kernel(src0_row, ib, iqs, v);
 
-        dst_row[iybs + iqs + 0]        = ggml_cuda_convert_val<dfloat, dst_t>(v.x);
-        dst_row[iybs + iqs + y_offset] = ggml_cuda_convert_val<dfloat, dst_t>(v.y);
+        dst_row[iybs + iqs + 0]        = ggml_cuda_cast<dfloat, dst_t>(v.x);
+        dst_row[iybs + iqs + y_offset] = ggml_cuda_cast<dfloat, dst_t>(v.y);
 }
 
 template <typename src0_t, typename dst_t>
@@ -63,7 +63,7 @@ static __global__ void k_get_rows_float(
     dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
     const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03);
 
-    dst_row[i00] = ggml_cuda_convert_val<src0_t, dst_t>(src0_row[i00]);
+    dst_row[i00] = ggml_cuda_cast<src0_t, dst_t>(src0_row[i00]);
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu
index 6628313ad035f..95c1cef32b4f4 100644
--- a/ggml/src/ggml-cuda/mmvf.cu
+++ b/ggml/src/ggml-cuda/mmvf.cu
@@ -94,8 +94,8 @@ static __global__ void mul_mat_vec_f(
 #pragma unroll
             for (int j = 0; j < ncols_dst; ++j) {
                 const float2 tmpy = y2[j*stride_col_y2 + col2];
-                sumf[j] += ggml_cuda_convert_val<nv_bfloat16, float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
-                sumf[j] += ggml_cuda_convert_val<nv_bfloat16, float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
+                sumf[j] += ggml_cuda_cast<nv_bfloat16, float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
+                sumf[j] += ggml_cuda_cast<nv_bfloat16, float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
             }
         }
     } else {
From 6dcde5ac4c50c06c552cdf55d7466f607d4e1f1e Mon Sep 17 00:00:00 2001
From: Johannes Gäßler
Date: Thu, 14 Aug 2025 12:59:36 +0200
Subject: [PATCH 4/4] swap arg order, consolidate

---
 ggml/src/ggml-cuda/convert.cu    |  6 ++---
 ggml/src/ggml-cuda/convert.cuh   | 40 ++++++--------------------------
 ggml/src/ggml-cuda/cpy-utils.cuh |  7 +-----
 ggml/src/ggml-cuda/getrows.cu    |  6 ++---
 ggml/src/ggml-cuda/mmvf.cu       |  4 ++--
 ggml/src/ggml-cuda/set-rows.cu   |  9 +------
 6 files changed, 17 insertions(+), 55 deletions(-)

diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
index dbd9f608f134d..8f0efdcc1260b 100644
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -31,8 +31,8 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y,
     dequantize_kernel(vx, ib, iqs, v);
 
     const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs;
-    y[iy0 + 0]        = ggml_cuda_cast<dfloat, dst_t>(v.x);
-    y[iy0 + y_offset] = ggml_cuda_cast<dfloat, dst_t>(v.y);
+    y[iy0 + 0]        = ggml_cuda_cast<dst_t>(v.x);
+    y[iy0 + y_offset] = ggml_cuda_cast<dst_t>(v.y);
 }
 
 template <bool need_check>
@@ -630,7 +630,7 @@ static __global__ void convert_unary(
     const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
     const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
 
-    y[iy] = ggml_cuda_cast<src_t, dst_t>(x[ix]);
+    y[iy] = ggml_cuda_cast<dst_t>(x[ix]);
 }
 
 template <typename src_t, typename dst_t>
diff --git a/ggml/src/ggml-cuda/convert.cuh b/ggml/src/ggml-cuda/convert.cuh
index 0a812dd06787e..c62e8a1b1040a 100644
--- a/ggml/src/ggml-cuda/convert.cuh
+++ b/ggml/src/ggml-cuda/convert.cuh
@@ -30,41 +30,15 @@ to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type);
 to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
 to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type);
 
-template <typename src_t, typename dest_t>
- __host__ __device__ inline dest_t ggml_cuda_cast(src_t x) {
-    if constexpr (std::is_same_v<src_t, dest_t>) {
+template <typename dst_t, typename src_t>
+ __host__ __device__ inline dst_t ggml_cuda_cast(src_t x) {
+    if constexpr (std::is_same_v<dst_t, src_t>) {
         return x;
+    } else if constexpr(std::is_same_v<dst_t, nv_bfloat16>) {
+        return __float2bfloat16(float(x));
+    } else if constexpr(std::is_same_v<src_t, nv_bfloat16>) {
+        return __bfloat162float(x);
     } else {
         return float(x);
     }
 }
-
-template<>
-__host__ __device__ inline float ggml_cuda_cast(nv_bfloat16 x) {
-    return __bfloat162float(x);
-}
-
-template<>
-__host__ __device__ inline nv_bfloat16 ggml_cuda_cast(float x) {
-    return __float2bfloat16(x);
-}
-
-template<>
-__host__ __device__ inline half ggml_cuda_cast(nv_bfloat16 x) {
-    return half(__bfloat162float(x));
-}
-
-template<>
-__host__ __device__ inline nv_bfloat16 ggml_cuda_cast(half x) {
-    return __float2bfloat16(float(x));
-}
-
-template<>
-__host__ __device__ inline int ggml_cuda_cast(nv_bfloat16 x) {
-    return int(__bfloat162float(x));
-}
-
-template<>
-__host__ __device__ inline nv_bfloat16 ggml_cuda_cast(int x) {
-    return __float2bfloat16(float(x));
-}
diff --git a/ggml/src/ggml-cuda/cpy-utils.cuh b/ggml/src/ggml-cuda/cpy-utils.cuh
index a893a5e73a091..e621cb9811ab6 100644
--- a/ggml/src/ggml-cuda/cpy-utils.cuh
+++ b/ggml/src/ggml-cuda/cpy-utils.cuh
@@ -3,11 +3,6 @@
 #include "ggml-common.h"
 #include "convert.cuh"
 
-template <typename src_t, typename dst_t>
-static __device__ __forceinline__ void convert_flt(const src_t * src, dst_t * dst) {
-    *dst = ggml_cuda_cast<src_t, dst_t>(*src);
-}
-
 static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) {
     if (x <= val[0]) return 0;
     if (x >= val[n-1]) return n-1;
@@ -218,5 +213,5 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
 
 template <typename src_t, typename dst_t>
 static __device__ void cpy_1_flt(const char * cxi, char * cdsti) {
-    convert_flt((const src_t *)cxi, (dst_t *)cdsti);
+    *(dst_t *) cdsti = ggml_cuda_cast<dst_t>(*(const src_t *) cxi);
 }
diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu
index f90a371fac66b..68d3254fbe472 100644
--- a/ggml/src/ggml-cuda/getrows.cu
+++ b/ggml/src/ggml-cuda/getrows.cu
@@ -35,8 +35,8 @@ static __global__ void k_get_rows(
         dfloat2 v;
         dequantize_kernel(src0_row, ib, iqs, v);
 
-        dst_row[iybs + iqs + 0]        = ggml_cuda_cast<dfloat, dst_t>(v.x);
-        dst_row[iybs + iqs + y_offset] = ggml_cuda_cast<dfloat, dst_t>(v.y);
+        dst_row[iybs + iqs + 0]        = ggml_cuda_cast<dst_t>(v.x);
+        dst_row[iybs + iqs + y_offset] = ggml_cuda_cast<dst_t>(v.y);
 }
 
 template <typename src0_t, typename dst_t>
@@ -63,7 +63,7 @@ static __global__ void k_get_rows_float(
     dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
     const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03);
 
-    dst_row[i00] = ggml_cuda_cast<src0_t, dst_t>(src0_row[i00]);
+    dst_row[i00] = ggml_cuda_cast<dst_t>(src0_row[i00]);
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu
index 95c1cef32b4f4..16100b680456a 100644
--- a/ggml/src/ggml-cuda/mmvf.cu
+++ b/ggml/src/ggml-cuda/mmvf.cu
@@ -94,8 +94,8 @@ static __global__ void mul_mat_vec_f(
 #pragma unroll
             for (int j = 0; j < ncols_dst; ++j) {
                 const float2 tmpy = y2[j*stride_col_y2 + col2];
-                sumf[j] += ggml_cuda_cast<nv_bfloat16, float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
-                sumf[j] += ggml_cuda_cast<nv_bfloat16, float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
+                sumf[j] += ggml_cuda_cast<float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
+                sumf[j] += ggml_cuda_cast<float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
             }
        }
    } else {
diff --git a/ggml/src/ggml-cuda/set-rows.cu b/ggml/src/ggml-cuda/set-rows.cu
index 07983436459d4..b4115a43c2a32 100644
--- a/ggml/src/ggml-cuda/set-rows.cu
+++ b/ggml/src/ggml-cuda/set-rows.cu
@@ -3,11 +3,6 @@
 
 typedef void (*set_rows_kernel_t)(const char * src, char * dst);
 
-template <typename src_t, typename dst_t>
-__device__ __forceinline__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {
-    convert_flt(src_f, dst_f);
-}
-
 // Generic quantized set_rows kernel template
 template <typename block_type, int qk>
 static __global__ void k_set_rows_quant(
@@ -117,9 +112,7 @@ static __global__ void k_set_rows(
     const src_t * src0_row = src0 + i01*s01 + i02*s02 + i03*s03;
     dst_t * dst_row_ptr    = dst + dst_row*s1 + i02*s2 + i03*s3;
 
-    const src_t* src_elem = src0_row + i00;
-    dst_t* dst_elem = dst_row_ptr + i00;
-    set_rows_1(src_elem, dst_elem);
+    dst_row_ptr[i00] = ggml_cuda_cast<dst_t>(src0_row[i00]);
 
     GGML_UNUSED(ne10);
     GGML_UNUSED(ne13);
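Note: after patch 4 the whole conversion surface is the single ggml_cuda_cast template. Putting dst_t first lets call sites name only the destination type, as in ggml_cuda_cast<dst_t>(x), with src_t deduced from the argument, and the former six bf16 specializations collapse into two if-constexpr branches that route through float. A host-compilable sketch of the resulting behaviour (a re-declaration for illustration only, assuming CUDA's cuda_bf16.h and cuda_fp16.h are on the include path; the real definition lives in convert.cuh):

```cpp
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cstdio>
#include <type_traits>

// Same shape as the consolidated helper: destination type first, source deduced.
template <typename dst_t, typename src_t>
inline dst_t cast_sketch(src_t x) {
    if constexpr (std::is_same_v<dst_t, src_t>) {
        return x;                              // identity, covers bf16 -> bf16 too
    } else if constexpr (std::is_same_v<dst_t, nv_bfloat16>) {
        return __float2bfloat16(float(x));     // anything -> bf16, via float
    } else if constexpr (std::is_same_v<src_t, nv_bfloat16>) {
        return __bfloat162float(x);            // bf16 -> anything, via float
    } else {
        return float(x);                       // generic numeric path
    }
}

int main() {
    const nv_bfloat16 b = cast_sketch<nv_bfloat16>(1.5f); // float -> bf16
    const float       f = cast_sketch<float>(b);          // bf16  -> float
    const half        h = cast_sketch<half>(b);           // bf16  -> half (through float)
    std::printf("%g %g\n", f, float(h));
    return 0;
}
```

Note the branch order: the identity case is checked first, so a same-type bf16 cast never touches the conversion intrinsics at all.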