From 009626c9f8effd13a17085e1a45c27e90d0491be Mon Sep 17 00:00:00 2001
From: Carl Philipp Klemm
Date: Wed, 13 Aug 2025 11:31:33 +0200
Subject: [PATCH 1/4] CUDA/HIP: add explicit conversion operator to support
 older versions of rocm

---
 ggml/src/ggml-cuda/convert.cu    |  6 ++---
 ggml/src/ggml-cuda/convert.cuh   | 39 ++++++++++++++++++++++++++++++++
 ggml/src/ggml-cuda/cpy-utils.cuh |  3 ++-
 ggml/src/ggml-cuda/getrows.cu    |  7 +++---
 ggml/src/ggml-cuda/mmvf.cu       |  5 ++--
 5 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
index e3beddbc1b23b..7690812a649de 100644
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -31,8 +31,8 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y,
     dequantize_kernel(vx, ib, iqs, v);
 
     const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs;
-    y[iy0 + 0]        = float(v.x);
-    y[iy0 + y_offset] = float(v.y);
+    y[iy0 + 0]        = ggml_cuda_convert_val<dfloat, dst_t>(v.x);
+    y[iy0 + y_offset] = ggml_cuda_convert_val<dfloat, dst_t>(v.y);
 }
 
 template <bool need_check>
@@ -630,7 +630,7 @@ static __global__ void convert_unary(
     const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
     const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
 
-    y[iy] = float(x[ix]);
+    y[iy] = ggml_cuda_convert_val<src_t, dst_t>(x[ix]);
 }
 
 template <typename src_t, typename dst_t>
diff --git a/ggml/src/ggml-cuda/convert.cuh b/ggml/src/ggml-cuda/convert.cuh
index f04214be175ba..35da9b39a0af9 100644
--- a/ggml/src/ggml-cuda/convert.cuh
+++ b/ggml/src/ggml-cuda/convert.cuh
@@ -29,3 +29,42 @@ typedef to_t_nc_cuda_t<nv_bfloat16> to_bf16_nc_cuda_t;
 to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type);
 to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
 to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type);
+
+template <typename src_t, typename dest_t>
+ __host__ __device__ inline dest_t ggml_cuda_convert_val(src_t x) {
+    if constexpr (std::is_same_v<src_t, dest_t>) {
+        return x;
+    } else {
+        return float(x);
+    }
+}
+
+template<>
+__host__ __device__ inline float ggml_cuda_convert_val(nv_bfloat16 x) {
+    return __bfloat162float(x);
+}
+
+template<>
+__host__ __device__ inline nv_bfloat16 ggml_cuda_convert_val(float x) {
+    return __float2bfloat16(x);
+}
+
+template<>
+__host__ __device__ inline half ggml_cuda_convert_val(nv_bfloat16 x) {
+    return half(__bfloat162float(x));
+}
+
+template<>
+__host__ __device__ inline nv_bfloat16 ggml_cuda_convert_val(half x) {
+    return __float2bfloat16(float(x));
+}
+
+template<>
+__host__ __device__ inline int ggml_cuda_convert_val(nv_bfloat16 x) {
+    return int(__bfloat162float(x));
+}
+
+template<>
+__host__ __device__ inline nv_bfloat16 ggml_cuda_convert_val(int x) {
+    return __float2bfloat16(float(x));
+}
diff --git a/ggml/src/ggml-cuda/cpy-utils.cuh b/ggml/src/ggml-cuda/cpy-utils.cuh
index 410c12b7ba56b..603af215f27d9 100644
--- a/ggml/src/ggml-cuda/cpy-utils.cuh
+++ b/ggml/src/ggml-cuda/cpy-utils.cuh
@@ -1,13 +1,14 @@
 #pragma once
 
 #include "ggml-common.h"
+#include "convert.cuh"
 
 template <typename src_t, typename dst_t>
 static __device__ __forceinline__ void convert_flt(const src_t * src, dst_t * dst) {
     if constexpr (std::is_same_v<src_t, dst_t>) {
         *dst = *src;
     } else {
-        *dst = float(*src);
+        *dst = ggml_cuda_convert_val<src_t, dst_t>(*src);
     }
 }
 
diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu
index f77b2629a19b0..24daf7ded464d 100644
--- a/ggml/src/ggml-cuda/getrows.cu
+++ b/ggml/src/ggml-cuda/getrows.cu
@@ -1,5 +1,6 @@
 #include "getrows.cuh"
 #include "dequantize.cuh"
+#include "convert.cuh"
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static __global__ void k_get_rows(
@@ -34,8 +35,8 @@ static __global__ void k_get_rows(
         dfloat2 v;
         dequantize_kernel(src0_row, ib, iqs, v);
 
-        dst_row[iybs + iqs + 0]        = float(v.x);
-        dst_row[iybs + iqs + y_offset] = float(v.y);
+        dst_row[iybs + iqs + 0]        = ggml_cuda_convert_val<dfloat, dst_t>(v.x);
+        dst_row[iybs + iqs + y_offset] = ggml_cuda_convert_val<dfloat, dst_t>(v.y);
 }
 
 template <typename src0_t, typename dst_t>
@@ -62,7 +63,7 @@ static __global__ void k_get_rows_float(
     dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
     const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03);
 
-    dst_row[i00] = float(src0_row[i00]);
+    dst_row[i00] = ggml_cuda_convert_val<src0_t, dst_t>(src0_row[i00]);
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu
index 1ad4bc75ba614..6628313ad035f 100644
--- a/ggml/src/ggml-cuda/mmvf.cu
+++ b/ggml/src/ggml-cuda/mmvf.cu
@@ -1,5 +1,6 @@
 #include "ggml.h"
 #include "common.cuh"
+#include "convert.cuh"
 #include "mmvf.cuh"
 
 template <typename T, typename type_acc, int ncols_dst, int block_size>
@@ -93,8 +94,8 @@ static __global__ void mul_mat_vec_f(
 #pragma unroll
             for (int j = 0; j < ncols_dst; ++j) {
                 const float2 tmpy = y2[j*stride_col_y2 + col2];
-                sumf[j] += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
-                sumf[j] += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
+                sumf[j] += ggml_cuda_convert_val<nv_bfloat16, float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
+                sumf[j] += ggml_cuda_convert_val<nv_bfloat16, float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
             }
         }
     } else {
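Note: the core of this first patch is the ggml_cuda_convert_val helper added to convert.cuh. On the older ROCm releases named in the subject, the bf16 type does not provide the conversion operators that a plain float(x) cast relies on, so conversions are funneled through a template whose bf16 cases call the conversion intrinsics explicitly. A minimal self-contained sketch of the same pattern, compiled under nvcc against CUDA's cuda_bf16.h rather than the ggml headers (the names here are illustrative, not the ggml ones):

```cpp
#include <cuda_bf16.h>
#include <type_traits>

// Generic path: identical types pass through, everything else goes via float.
template <typename src_t, typename dest_t>
__host__ __device__ inline dest_t convert_val(src_t x) {
    if constexpr (std::is_same_v<src_t, dest_t>) {
        return x;
    } else {
        return float(x); // needs src_t -> float and float -> dest_t conversions
    }
}

// The bf16 cases bypass the generic path and use the intrinsics directly,
// so no conversion operator on the bf16 type is required.
template <>
__host__ __device__ inline float convert_val(nv_bfloat16 x) {
    return __bfloat162float(x);
}

template <>
__host__ __device__ inline nv_bfloat16 convert_val(float x) {
    return __float2bfloat16(x);
}
```

With this shape, call sites have to spell out both template arguments, e.g. convert_val<nv_bfloat16, float>(x); patch 4 below reworks the argument order so that only the destination type has to be named.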
From e9ff641affbd447f6c8e99375825ed20f8120836 Mon Sep 17 00:00:00 2001
From: Carl Philipp Klemm
Date: Thu, 14 Aug 2025 11:45:33 +0200
Subject: [PATCH 2/4] HIP: Cleanup hipification header

Switch over to hip_bf16 from the legacy hip_bfloat16 header.
Simplify the RDNA3 define.
Lower the switchover point to the new hipBLAS API to ROCm 6.5, as that
version is already used for the ROCm 7.0 previews.
---
 ggml/src/ggml-cuda/vendors/hip.h | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index ec1b59caafc9a..6e9c67aca096e 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -4,7 +4,7 @@
 #include <hip/hip_runtime.h>
 #include <hipblas/hipblas.h>
 #include <hip/hip_fp16.h>
-#include <hip/hip_bfloat16.h>
+#include <hip/hip_bf16.h>
 
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
@@ -135,7 +135,7 @@
 #define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
 #define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
 
-#if HIP_VERSION >= 70000000
+#if HIP_VERSION >= 60500000
 #define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F
@@ -147,7 +147,7 @@
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
 #define cublasComputeType_t hipblasDatatype_t
 #define cudaDataType_t hipblasDatatype_t
-#endif // HIP_VERSION >= 7000000
+#endif // HIP_VERSION >= 60500000
 
 #if !defined(__HIP_PLATFORM_AMD__)
 #error "The HIP backend supports only AMD targets"
@@ -179,8 +179,7 @@
 #define RDNA4
 #endif
 
-#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
-    defined(__gfx1150__) || defined(__gfx1151__)
+#if defined(__GFX11__)
 #define RDNA3
 #endif
 
@@ -197,8 +196,8 @@
 #define __has_builtin(x) 0
 #endif
 
-typedef hip_bfloat16 nv_bfloat16;
-typedef short2 nv_bfloat162; // FIXME there is no 2x BF16 type being defined in bfloat16.h, ad-hoc compilation fix
+typedef __hip_bfloat16 nv_bfloat16;
+typedef __hip_bfloat162 nv_bfloat162;
 
 typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
 typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
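Note: the 60500000 cutoff above is easy to misread. HIP encodes HIP_VERSION as major*10000000 + minor*100000 + patch (see HIP's hip_version.h), so the ROCm 6.5 cutoff is 60500000. A tiny compile-time check of the threshold logic; the hip_version helper below is illustrative only, not a HIP API:

```cpp
// Sketch: verify the HIP_VERSION cutoff arithmetic used in hip.h.
// hip_version() mirrors HIP's version encoding; it is not a HIP header function.
constexpr long hip_version(long major, long minor, long patch) {
    return major * 10000000 + minor * 100000 + patch;
}

static_assert(hip_version(6, 5, 0) == 60500000, "ROCm 6.5 is the cutoff");
static_assert(hip_version(7, 0, 0) >  60500000, "ROCm 7.0 previews take the new hipBLAS API path");
static_assert(hip_version(6, 4, 1) <  60500000, "older ROCm keeps the hipblasDatatype_t path");
```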
From 173df5e4a8b4639f8b8dc093feb275b96aab22e2 Mon Sep 17 00:00:00 2001
From: Johannes Gäßler
Date: Thu, 14 Aug 2025 12:37:50 +0200
Subject: [PATCH 3/4] convert_val -> cast

---
 ggml/src/ggml-cuda/convert.cu    |  6 +++---
 ggml/src/ggml-cuda/convert.cuh   | 14 +++++++-------
 ggml/src/ggml-cuda/cpy-utils.cuh |  6 +-----
 ggml/src/ggml-cuda/getrows.cu    |  6 +++---
 ggml/src/ggml-cuda/mmvf.cu       |  4 ++--
 5 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
index 7690812a649de..dbd9f608f134d 100644
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -31,8 +31,8 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y,
     dequantize_kernel(vx, ib, iqs, v);
 
     const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs;
-    y[iy0 + 0]        = ggml_cuda_convert_val<dfloat, dst_t>(v.x);
-    y[iy0 + y_offset] = ggml_cuda_convert_val<dfloat, dst_t>(v.y);
+    y[iy0 + 0]        = ggml_cuda_cast<dfloat, dst_t>(v.x);
+    y[iy0 + y_offset] = ggml_cuda_cast<dfloat, dst_t>(v.y);
 }
 
 template <bool need_check>
@@ -630,7 +630,7 @@ static __global__ void convert_unary(
     const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
     const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
 
-    y[iy] = ggml_cuda_convert_val<src_t, dst_t>(x[ix]);
+    y[iy] = ggml_cuda_cast<src_t, dst_t>(x[ix]);
 }
 
 template <typename src_t, typename dst_t>
diff --git a/ggml/src/ggml-cuda/convert.cuh b/ggml/src/ggml-cuda/convert.cuh
index 35da9b39a0af9..0a812dd06787e 100644
--- a/ggml/src/ggml-cuda/convert.cuh
+++ b/ggml/src/ggml-cuda/convert.cuh
@@ -31,7 +31,7 @@ to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
 to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type);
 
 template <typename src_t, typename dest_t>
- __host__ __device__ inline dest_t ggml_cuda_convert_val(src_t x) {
+ __host__ __device__ inline dest_t ggml_cuda_cast(src_t x) {
     if constexpr (std::is_same_v<src_t, dest_t>) {
         return x;
     } else {
@@ -40,31 +40,31 @@ template <typename src_t, typename dest_t>
 }
 
 template<>
-__host__ __device__ inline float ggml_cuda_convert_val(nv_bfloat16 x) {
+__host__ __device__ inline float ggml_cuda_cast(nv_bfloat16 x) {
     return __bfloat162float(x);
 }
 
 template<>
-__host__ __device__ inline nv_bfloat16 ggml_cuda_convert_val(float x) {
+__host__ __device__ inline nv_bfloat16 ggml_cuda_cast(float x) {
     return __float2bfloat16(x);
 }
 
 template<>
-__host__ __device__ inline half ggml_cuda_convert_val(nv_bfloat16 x) {
+__host__ __device__ inline half ggml_cuda_cast(nv_bfloat16 x) {
     return half(__bfloat162float(x));
 }
 
 template<>
-__host__ __device__ inline nv_bfloat16 ggml_cuda_convert_val(half x) {
+__host__ __device__ inline nv_bfloat16 ggml_cuda_cast(half x) {
     return __float2bfloat16(float(x));
 }
 
 template<>
-__host__ __device__ inline int ggml_cuda_convert_val(nv_bfloat16 x) {
+__host__ __device__ inline int ggml_cuda_cast(nv_bfloat16 x) {
     return int(__bfloat162float(x));
 }
 
 template<>
-__host__ __device__ inline nv_bfloat16 ggml_cuda_convert_val(int x) {
+__host__ __device__ inline nv_bfloat16 ggml_cuda_cast(int x) {
     return __float2bfloat16(float(x));
 }
diff --git a/ggml/src/ggml-cuda/cpy-utils.cuh b/ggml/src/ggml-cuda/cpy-utils.cuh
index 603af215f27d9..a893a5e73a091 100644
--- a/ggml/src/ggml-cuda/cpy-utils.cuh
+++ b/ggml/src/ggml-cuda/cpy-utils.cuh
@@ -5,11 +5,7 @@
 
 template <typename src_t, typename dst_t>
 static __device__ __forceinline__ void convert_flt(const src_t * src, dst_t * dst) {
-    if constexpr (std::is_same_v<src_t, dst_t>) {
-        *dst = *src;
-    } else {
-        *dst = ggml_cuda_convert_val<src_t, dst_t>(*src);
-    }
+    *dst = ggml_cuda_cast<src_t, dst_t>(*src);
 }
 
 static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) {
diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu
index 24daf7ded464d..f90a371fac66b 100644
--- a/ggml/src/ggml-cuda/getrows.cu
+++ b/ggml/src/ggml-cuda/getrows.cu
@@ -35,8 +35,8 @@ static __global__ void k_get_rows(
         dfloat2 v;
         dequantize_kernel(src0_row, ib, iqs, v);
 
-        dst_row[iybs + iqs + 0]        = ggml_cuda_convert_val<dfloat, dst_t>(v.x);
-        dst_row[iybs + iqs + y_offset] = ggml_cuda_convert_val<dfloat, dst_t>(v.y);
+        dst_row[iybs + iqs + 0]        = ggml_cuda_cast<dfloat, dst_t>(v.x);
+        dst_row[iybs + iqs + y_offset] = ggml_cuda_cast<dfloat, dst_t>(v.y);
 }
 
 template <typename src0_t, typename dst_t>
@@ -63,7 +63,7 @@ static __global__ void k_get_rows_float(
     dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
     const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03);
 
-    dst_row[i00] = ggml_cuda_convert_val<src0_t, dst_t>(src0_row[i00]);
+    dst_row[i00] = ggml_cuda_cast<src0_t, dst_t>(src0_row[i00]);
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu
index 6628313ad035f..95c1cef32b4f4 100644
--- a/ggml/src/ggml-cuda/mmvf.cu
+++ b/ggml/src/ggml-cuda/mmvf.cu
@@ -94,8 +94,8 @@ static __global__ void mul_mat_vec_f(
 #pragma unroll
             for (int j = 0; j < ncols_dst; ++j) {
                 const float2 tmpy = y2[j*stride_col_y2 + col2];
-                sumf[j] += ggml_cuda_convert_val<nv_bfloat16, float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
-                sumf[j] += ggml_cuda_convert_val<nv_bfloat16, float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
+                sumf[j] += ggml_cuda_cast<nv_bfloat16, float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
+                sumf[j] += ggml_cuda_cast<nv_bfloat16, float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
             }
         }
     } else {
From 6dcde5ac4c50c06c552cdf55d7466f607d4e1f1e Mon Sep 17 00:00:00 2001
From: Johannes Gäßler
Date: Thu, 14 Aug 2025 12:59:36 +0200
Subject: [PATCH 4/4] swap arg order, consolidate

---
 ggml/src/ggml-cuda/convert.cu    |  6 ++---
 ggml/src/ggml-cuda/convert.cuh   | 40 ++++++--------------------------
 ggml/src/ggml-cuda/cpy-utils.cuh |  7 +-----
 ggml/src/ggml-cuda/getrows.cu    |  6 ++---
 ggml/src/ggml-cuda/mmvf.cu       |  4 ++--
 ggml/src/ggml-cuda/set-rows.cu   |  9 +------
 6 files changed, 17 insertions(+), 55 deletions(-)

diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
index dbd9f608f134d..8f0efdcc1260b 100644
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -31,8 +31,8 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y,
     dequantize_kernel(vx, ib, iqs, v);
 
     const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs;
-    y[iy0 + 0]        = ggml_cuda_cast<dfloat, dst_t>(v.x);
-    y[iy0 + y_offset] = ggml_cuda_cast<dfloat, dst_t>(v.y);
+    y[iy0 + 0]        = ggml_cuda_cast<dst_t>(v.x);
+    y[iy0 + y_offset] = ggml_cuda_cast<dst_t>(v.y);
 }
 
 template <bool need_check>
@@ -630,7 +630,7 @@ static __global__ void convert_unary(
     const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
     const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
 
-    y[iy] = ggml_cuda_cast<src_t, dst_t>(x[ix]);
+    y[iy] = ggml_cuda_cast<dst_t>(x[ix]);
 }
 
 template <typename src_t, typename dst_t>
diff --git a/ggml/src/ggml-cuda/convert.cuh b/ggml/src/ggml-cuda/convert.cuh
index 0a812dd06787e..c62e8a1b1040a 100644
--- a/ggml/src/ggml-cuda/convert.cuh
+++ b/ggml/src/ggml-cuda/convert.cuh
@@ -30,41 +30,15 @@ to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type);
 to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
 to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type);
 
-template <typename src_t, typename dest_t>
- __host__ __device__ inline dest_t ggml_cuda_cast(src_t x) {
-    if constexpr (std::is_same_v<src_t, dest_t>) {
+template <typename dst_t, typename src_t>
+ __host__ __device__ inline dst_t ggml_cuda_cast(src_t x) {
+    if constexpr (std::is_same_v<dst_t, src_t>) {
         return x;
+    } else if constexpr(std::is_same_v<dst_t, nv_bfloat16>) {
+        return __float2bfloat16(float(x));
+    } else if constexpr(std::is_same_v<src_t, nv_bfloat16>) {
+        return __bfloat162float(x);
     } else {
         return float(x);
     }
 }
-
-template<>
-__host__ __device__ inline float ggml_cuda_cast(nv_bfloat16 x) {
-    return __bfloat162float(x);
-}
-
-template<>
-__host__ __device__ inline nv_bfloat16 ggml_cuda_cast(float x) {
-    return __float2bfloat16(x);
-}
-
-template<>
-__host__ __device__ inline half ggml_cuda_cast(nv_bfloat16 x) {
-    return half(__bfloat162float(x));
-}
-
-template<>
-__host__ __device__ inline nv_bfloat16 ggml_cuda_cast(half x) {
-    return __float2bfloat16(float(x));
-}
-
-template<>
-__host__ __device__ inline int ggml_cuda_cast(nv_bfloat16 x) {
-    return int(__bfloat162float(x));
-}
-
-template<>
-__host__ __device__ inline nv_bfloat16 ggml_cuda_cast(int x) {
-    return __float2bfloat16(float(x));
-}
diff --git a/ggml/src/ggml-cuda/cpy-utils.cuh b/ggml/src/ggml-cuda/cpy-utils.cuh
index a893a5e73a091..e621cb9811ab6 100644
--- a/ggml/src/ggml-cuda/cpy-utils.cuh
+++ b/ggml/src/ggml-cuda/cpy-utils.cuh
@@ -3,11 +3,6 @@
 #include "ggml-common.h"
 #include "convert.cuh"
 
-template <typename src_t, typename dst_t>
-static __device__ __forceinline__ void convert_flt(const src_t * src, dst_t * dst) {
-    *dst = ggml_cuda_cast<src_t, dst_t>(*src);
-}
-
 static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) {
     if (x <= val[0]) return 0;
     if (x >= val[n-1]) return n-1;
@@ -218,5 +213,5 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
 
 template <typename src_t, typename dst_t>
 static __device__ void cpy_1_flt(const char * cxi, char * cdsti) {
-    convert_flt((const src_t *)cxi, (dst_t *)cdsti);
+    *(dst_t *) cdsti = ggml_cuda_cast<dst_t>(*(const src_t *) cxi);
 }
diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu
index f90a371fac66b..68d3254fbe472 100644
--- a/ggml/src/ggml-cuda/getrows.cu
+++ b/ggml/src/ggml-cuda/getrows.cu
@@ -35,8 +35,8 @@ static __global__ void k_get_rows(
         dfloat2 v;
         dequantize_kernel(src0_row, ib, iqs, v);
 
-        dst_row[iybs + iqs + 0]        = ggml_cuda_cast<dfloat, dst_t>(v.x);
-        dst_row[iybs + iqs + y_offset] = ggml_cuda_cast<dfloat, dst_t>(v.y);
+        dst_row[iybs + iqs + 0]        = ggml_cuda_cast<dst_t>(v.x);
+        dst_row[iybs + iqs + y_offset] = ggml_cuda_cast<dst_t>(v.y);
 }
 
 template <typename src0_t, typename dst_t>
@@ -63,7 +63,7 @@ static __global__ void k_get_rows_float(
     dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
     const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03);
 
-    dst_row[i00] = ggml_cuda_cast<src0_t, dst_t>(src0_row[i00]);
+    dst_row[i00] = ggml_cuda_cast<dst_t>(src0_row[i00]);
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu
index 95c1cef32b4f4..16100b680456a 100644
--- a/ggml/src/ggml-cuda/mmvf.cu
+++ b/ggml/src/ggml-cuda/mmvf.cu
@@ -94,8 +94,8 @@ static __global__ void mul_mat_vec_f(
 #pragma unroll
             for (int j = 0; j < ncols_dst; ++j) {
                 const float2 tmpy = y2[j*stride_col_y2 + col2];
-                sumf[j] += ggml_cuda_cast<nv_bfloat16, float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
-                sumf[j] += ggml_cuda_cast<nv_bfloat16, float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
+                sumf[j] += ggml_cuda_cast<float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
+                sumf[j] += ggml_cuda_cast<float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
             }
        }
    } else {
diff --git a/ggml/src/ggml-cuda/set-rows.cu b/ggml/src/ggml-cuda/set-rows.cu
index 07983436459d4..b4115a43c2a32 100644
--- a/ggml/src/ggml-cuda/set-rows.cu
+++ b/ggml/src/ggml-cuda/set-rows.cu
@@ -3,11 +3,6 @@
 
 typedef void (*set_rows_kernel_t)(const char * src, char * dst);
 
-template <typename src_t, typename dst_t>
-__device__ __forceinline__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {
-    convert_flt(src_f, dst_f);
-}
-
 // Generic quantized set_rows kernel template
 template <typename block_type, int qk>
 static __global__ void k_set_rows_quant(
@@ -117,9 +112,7 @@ static __global__ void k_set_rows(
     const src_t * src0_row = src0 + i01*s01 + i02*s02 + i03*s03;
     dst_t * dst_row_ptr    = dst + dst_row*s1 + i02*s2 + i03*s3;
 
-    const src_t* src_elem = src0_row + i00;
-    dst_t* dst_elem = dst_row_ptr + i00;
-    set_rows_1(src_elem, dst_elem);
+    dst_row_ptr[i00] = ggml_cuda_cast<dst_t>(src0_row[i00]);
 
     GGML_UNUSED(ne10);
     GGML_UNUSED(ne13);
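Note: after patch 4 the whole conversion surface is the single ggml_cuda_cast template. Putting dst_t first lets call sites name only the destination type, as in ggml_cuda_cast<dst_t>(x), with src_t deduced from the argument, and the former six bf16 specializations collapse into two if-constexpr branches that route through float. A host-compilable sketch of the resulting behaviour (a re-declaration for illustration only, assuming CUDA's cuda_bf16.h and cuda_fp16.h are on the include path; the real definition lives in convert.cuh):

```cpp
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cstdio>
#include <type_traits>

// Same shape as the consolidated helper: destination type first, source deduced.
template <typename dst_t, typename src_t>
inline dst_t cast_sketch(src_t x) {
    if constexpr (std::is_same_v<dst_t, src_t>) {
        return x;                              // identity, covers bf16 -> bf16 too
    } else if constexpr (std::is_same_v<dst_t, nv_bfloat16>) {
        return __float2bfloat16(float(x));     // anything -> bf16, via float
    } else if constexpr (std::is_same_v<src_t, nv_bfloat16>) {
        return __bfloat162float(x);            // bf16 -> anything, via float
    } else {
        return float(x);                       // generic numeric path
    }
}

int main() {
    const nv_bfloat16 b = cast_sketch<nv_bfloat16>(1.5f); // float -> bf16
    const float       f = cast_sketch<float>(b);          // bf16  -> float
    const half        h = cast_sketch<half>(b);           // bf16  -> half (through float)
    std::printf("%g %g\n", f, float(h));
    return 0;
}
```

Note the branch order: the identity case is checked first, so a same-type bf16 cast never touches the conversion intrinsics at all.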