JeffreyXiang · ZJLi2013 · Apr 8, 2026
diff --git a/setup.py b/setup.py
@@ -115,9 +115,7 @@
         ],
         extra_compile_args={
             "cxx": cxx_flags,
-            "nvcc": nvcc_flags + [
-                # The following definitions must be undefined
-                # since we need half-precision operation.
+            "nvcc": nvcc_flags if IS_HIP else nvcc_flags + [
                 "--extended-lambda",
                 "-U__CUDA_NO_HALF_OPERATORS__",
                 "-U__CUDA_NO_HALF_CONVERSIONS__",

diff --git a/src/atlas.cu b/src/atlas.cu
@@ -322,11 +322,7 @@ static void get_chart_connectivity(
     ));
     CUDA_CHECK(cudaFree(cu_raw_lengths));
 
-    #if CUDART_VERSION >= 12090
-        auto reduce_op = ::cuda::std::plus();
-    #else
-        auto reduce_op = cub::Sum();
-    #endif
+    auto reduce_op = cub::Sum();
 
 
     // 1.3 Reduce By Key (Aggregate duplicate chart pairs by summing lengths)

diff --git a/src/clean_up.cu b/src/clean_up.cu
@@ -227,13 +227,24 @@ static __global__ void select_first_in_each_group_kernel(
 }
 
 
+#if defined(__HIP_PLATFORM_AMD__)
+#include <rocprim/types/tuple.hpp>
+struct int3_decomposer
+{
+    __host__ __device__ ::rocprim::tuple<int&, int&, int&> operator()(int3& key) const
+    {
+        return ::rocprim::tuple<int&, int&, int&>{key.x, key.y, key.z};
+    }
+};
+#else
 struct int3_decomposer
 {
     __host__ __device__ ::cuda::std::tuple<int&, int&, int&> operator()(int3& key) const
     {
         return {key.x, key.y, key.z};
     }
 };
+#endif
 
 
 void CuMesh::remove_duplicate_faces() {

diff --git a/src/dtypes.cuh b/src/dtypes.cuh
@@ -13,24 +13,24 @@ namespace cumesh {
 struct __align__(16) Vec3f {
     float x, y, z;
 
-    __device__ __forceinline__ Vec3f();
-    __device__ __forceinline__ Vec3f(float x, float y, float z);
-    __device__ __forceinline__ Vec3f(float3 v);
-    __device__ __forceinline__ Vec3f operator+(const Vec3f& o) const;
-    __device__ __forceinline__ Vec3f& operator+=(const Vec3f& o);
-    __device__ __forceinline__ Vec3f operator-(const Vec3f& o) const;
-    __device__ __forceinline__ Vec3f& operator-=(const Vec3f& o);
-    __device__ __forceinline__ Vec3f operator*(float s) const;
-    __device__ __forceinline__ Vec3f& operator*=(float s);
-    __device__ __forceinline__ Vec3f operator/(float s) const;
-    __device__ __forceinline__ Vec3f& operator/=(float s);
-    __device__ __forceinline__ float dot(const Vec3f& o) const;
-    __device__ __forceinline__ float norm() const;
-    __device__ __forceinline__ float norm2() const;
-    __device__ __forceinline__ Vec3f normalized() const;
-    __device__ __forceinline__ void normalize();
-    __device__ __forceinline__ Vec3f cross(const Vec3f& o) const;
-    __device__ __forceinline__ Vec3f slerp(const Vec3f& o, float t) const;
+    __host__ __device__ __forceinline__ Vec3f();
+    __host__ __device__ __forceinline__ Vec3f(float x, float y, float z);
+    __host__ __device__ __forceinline__ Vec3f(float3 v);
+    __host__ __device__ __forceinline__ Vec3f operator+(const Vec3f& o) const;
+    __host__ __device__ __forceinline__ Vec3f& operator+=(const Vec3f& o);
+    __host__ __device__ __forceinline__ Vec3f operator-(const Vec3f& o) const;
+    __host__ __device__ __forceinline__ Vec3f& operator-=(const Vec3f& o);
+    __host__ __device__ __forceinline__ Vec3f operator*(float s) const;
+    __host__ __device__ __forceinline__ Vec3f& operator*=(float s);
+    __host__ __device__ __forceinline__ Vec3f operator/(float s) const;
+    __host__ __device__ __forceinline__ Vec3f& operator/=(float s);
+    __host__ __device__ __forceinline__ float dot(const Vec3f& o) const;
+    __host__ __device__ __forceinline__ float norm() const;
+    __host__ __device__ __forceinline__ float norm2() const;
+    __host__ __device__ __forceinline__ Vec3f normalized() const;
+    __host__ __device__ __forceinline__ void normalize();
+    __host__ __device__ __forceinline__ Vec3f cross(const Vec3f& o) const;
+    __host__ __device__ __forceinline__ Vec3f slerp(const Vec3f& o, float t) const;
 };
 
 
@@ -43,124 +43,124 @@ struct __align__(16) QEM
     // e = [ 00, 01, 02, 03, 11, 12, 13, 22, 23, 33 ]
     float e[10];
 
-    __device__ __forceinline__ QEM();
-    __device__ __forceinline__ QEM operator+(const QEM& o) const;
-    __device__ __forceinline__ QEM& operator+=(const QEM& o);
-    __device__ __forceinline__ QEM operator-(const QEM& o) const;
-    __device__ __forceinline__ QEM& operator-=(const QEM& o);
-    __device__ __forceinline__ void zero();
-    __device__ __forceinline__ void add_plane(float4 p);
-    __device__ __forceinline__ float evaluate(const Vec3f& p) const;
-    __device__ __forceinline__ bool solve_optimal(float3 &out, float &err) const;
+    __host__ __device__ __forceinline__ QEM();
+    __host__ __device__ __forceinline__ QEM operator+(const QEM& o) const;
+    __host__ __device__ __forceinline__ QEM& operator+=(const QEM& o);
+    __host__ __device__ __forceinline__ QEM operator-(const QEM& o) const;
+    __host__ __device__ __forceinline__ QEM& operator-=(const QEM& o);
+    __host__ __device__ __forceinline__ void zero();
+    __host__ __device__ __forceinline__ void add_plane(float4 p);
+    __host__ __device__ __forceinline__ float evaluate(const Vec3f& p) const;
+    __host__ __device__ __forceinline__ bool solve_optimal(float3 &out, float &err) const;
 };
 
 
-__device__ __forceinline__ Vec3f::Vec3f() {
+__host__ __device__ __forceinline__ Vec3f::Vec3f() {
     x = 0.0f;
     y = 0.0f;
     z = 0.0f;
 }
 
-__device__ __forceinline__ Vec3f::Vec3f(float x, float y, float z) {
+__host__ __device__ __forceinline__ Vec3f::Vec3f(float x, float y, float z) {
     this->x = x;
     this->y = y;
     this->z = z;
 }
 
-__device__ __forceinline__ Vec3f::Vec3f(float3 v) {
+__host__ __device__ __forceinline__ Vec3f::Vec3f(float3 v) {
     x = v.x;
     y = v.y;
     z = v.z;
 }
 
 
-__device__ __forceinline__ Vec3f Vec3f::operator+(const Vec3f& o) const {
+__host__ __device__ __forceinline__ Vec3f Vec3f::operator+(const Vec3f& o) const {
     return Vec3f(x + o.x, y + o.y, z + o.z);
 }
 
 
-__device__ __forceinline__ Vec3f& Vec3f::operator+=(const Vec3f& o) {
+__host__ __device__ __forceinline__ Vec3f& Vec3f::operator+=(const Vec3f& o) {
     x += o.x;
     y += o.y;
     z += o.z;
     return *this;
 }
 
 
-__device__ __forceinline__ Vec3f Vec3f::operator-(const Vec3f& o) const {
+__host__ __device__ __forceinline__ Vec3f Vec3f::operator-(const Vec3f& o) const {
     return Vec3f(x - o.x, y - o.y, z - o.z);
 }
 
 
-__device__ __forceinline__ Vec3f& Vec3f::operator-=(const Vec3f& o) {
+__host__ __device__ __forceinline__ Vec3f& Vec3f::operator-=(const Vec3f& o) {
     x -= o.x;
     y -= o.y;
     z -= o.z;
     return *this;
 }
 
 
-__device__ __forceinline__ Vec3f Vec3f::operator*(float s) const {
+__host__ __device__ __forceinline__ Vec3f Vec3f::operator*(float s) const {
     return Vec3f(x * s, y * s, z * s);
 }
 
 
-__device__ __forceinline__ Vec3f& Vec3f::operator*=(float s) {
+__host__ __device__ __forceinline__ Vec3f& Vec3f::operator*=(float s) {
     x *= s;
     y *= s;
     z *= s;
     return *this;
 }
 
 
-__device__ __forceinline__ Vec3f Vec3f::operator/(float s) const {
+__host__ __device__ __forceinline__ Vec3f Vec3f::operator/(float s) const {
     return Vec3f(x / s, y / s, z / s);
 }
 
 
-__device__ __forceinline__ Vec3f& Vec3f::operator/=(float s) {
+__host__ __device__ __forceinline__ Vec3f& Vec3f::operator/=(float s) {
     x /= s;
     y /= s;
     z /= s;
     return *this;
 }
 
 
-__device__ __forceinline__ float Vec3f::dot(const Vec3f& o) const {
+__host__ __device__ __forceinline__ float Vec3f::dot(const Vec3f& o) const {
     return x * o.x + y * o.y + z * o.z;
 }
 
 
-__device__ __forceinline__ float Vec3f::norm() const {
+__host__ __device__ __forceinline__ float Vec3f::norm() const {
     return sqrtf(x * x + y * y + z * z);
 }
 
 
-__device__ __forceinline__ float Vec3f::norm2() const {
+__host__ __device__ __forceinline__ float Vec3f::norm2() const {
     return x * x + y * y + z * z;
 }
 
 
-__device__ __forceinline__ Vec3f Vec3f::normalized() const {
+__host__ __device__ __forceinline__ Vec3f Vec3f::normalized() const {
     float inv_norm = rsqrtf(x * x + y * y + z * z);
     return Vec3f(x * inv_norm, y * inv_norm, z * inv_norm);
 }
 
 
-__device__ __forceinline__ void Vec3f::normalize() {
+__host__ __device__ __forceinline__ void Vec3f::normalize() {
     float inv_norm = rsqrtf(x * x + y * y + z * z);
     x *= inv_norm;
     y *= inv_norm;
     z *= inv_norm;
 }
 
 
-__device__ __forceinline__ Vec3f Vec3f::cross(const Vec3f& o) const {
+__host__ __device__ __forceinline__ Vec3f Vec3f::cross(const Vec3f& o) const {
     return Vec3f(y * o.z - z * o.y, z * o.x - x * o.z, x * o.y - y * o.x);
 }
 
 
-__device__ __forceinline__ Vec3f Vec3f::slerp(const Vec3f& o, float t) const {
+__host__ __device__ __forceinline__ Vec3f Vec3f::slerp(const Vec3f& o, float t) const {
     float dot_prod = this->dot(o);
     dot_prod = fmaxf(fminf(dot_prod, 1.0f), -1.0f); // Clamp to [-1, 1]
     float theta = acosf(dot_prod) * t;
@@ -169,48 +169,48 @@ __device__ __forceinline__ Vec3f Vec3f::slerp(const Vec3f& o, float t) const {
 }
 
 
-__device__ __forceinline__ QEM::QEM() {
+__host__ __device__ __forceinline__ QEM::QEM() {
     zero();
 }
 
 
-__device__ __forceinline__ QEM QEM::operator+(const QEM& o) const {
+__host__ __device__ __forceinline__ QEM QEM::operator+(const QEM& o) const {
     QEM res;
     #pragma unroll
     for (int i = 0; i < 10; ++i) res.e[i] = e[i] + o.e[i];
     return res;
 }
 
 
-__device__ __forceinline__ QEM& QEM::operator+=(const QEM& o) {
+__host__ __device__ __forceinline__ QEM& QEM::operator+=(const QEM& o) {
     #pragma unroll
     for (int i = 0; i < 10; ++i) e[i] += o.e[i];
     return *this;
 }
 
 
-__device__ __forceinline__ QEM QEM::operator-(const QEM& o) const {
+__host__ __device__ __forceinline__ QEM QEM::operator-(const QEM& o) const {
     QEM res;
     #pragma unroll
     for (int i = 0; i < 10; ++i) res.e[i] = e[i] - o.e[i];
     return res;
 }
 
 
-__device__ __forceinline__ QEM& QEM::operator-=(const QEM& o) {
+__host__ __device__ __forceinline__ QEM& QEM::operator-=(const QEM& o) {
     #pragma unroll
     for (int i = 0; i < 10; ++i) e[i] -= o.e[i];
     return *this;
 }
 
-__device__ __forceinline__ void QEM::zero() {
+__host__ __device__ __forceinline__ void QEM::zero() {
     #pragma unroll
     for (int i = 0; i < 10; ++i) e[i] = 0.0f;
 }
 
 
 // Add plane p = (a,b,c,d) as outer product p * p^T
-__device__ __forceinline__ void QEM::add_plane(float4 p) {
+__host__ __device__ __forceinline__ void QEM::add_plane(float4 p) {
     // upper triangle indices mapping:
     // (0,0)->e[0]
     // (0,1)->e[1]
@@ -237,7 +237,7 @@ __device__ __forceinline__ void QEM::add_plane(float4 p) {
 
 
 // Evaluate v^T * Q * v for v = (x,y,z,1)
-__device__ __forceinline__ float QEM::evaluate(const Vec3f& p) const {
+__host__ __device__ __forceinline__ float QEM::evaluate(const Vec3f& p) const {
     // compute v = [x,y,z,1]
     float x = p.x, y = p.y, z = p.z, w = 1.0f;
     // expand symmetric multiplication using stored upper triangular
@@ -271,7 +271,7 @@ __device__ __forceinline__ float QEM::evaluate(const Vec3f& p) const {
 // Solve the linear system: A * [x y z]^T = -b, where
 // A = top-left 3x3 of Q, b = [e03, e13, e23] (note signs)
 // Return true if solved (matrix invertible), false otherwise. err returns the error at the solution.
-__device__ __forceinline__ bool QEM::solve_optimal(float3 &out, float &err) const {
+__host__ __device__ __forceinline__ bool QEM::solve_optimal(float3 &out, float &err) const {
     // Build A (symmetric)
     float A00 = e[0];
     float A01 = e[1];