From b12363ef20b953a7904c31b346da892b38b77bc5 Mon Sep 17 00:00:00 2001 From: Andy Luo Date: Mon, 27 Apr 2026 16:33:58 +0000 Subject: [PATCH] [ROCm] Add HIP support for AMD Instinct GPUs (MI300X) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Port all CUDA-specific code to work on both CUDA and ROCm via HIP: - Headers: #ifdef __HIP_PLATFORM_AMD__ guards for cuda.h → hip/hip_runtime.h, cub/cub.cuh → hipcub/hipcub.hpp - API: cudaMalloc/Free/Memcpy → hipMalloc/Free/Memcpy (guarded) - CUB → hipcub namespace mapping - ::cuda::std::tuple → ::rocprim::tuple (for DeviceRadixSort decomposer) - ::cuda::std::plus → hipcub::Sum() - Vec3f: add default __host__ __device__ constructor for hipcub compatibility - ATen/cuda → ATen/hip (in cubvh submodule) - setup.py: default arch gfx942, gate CUDA-only flags behind IS_HIP check Tested on AMD MI300X (gfx942) with ROCm 7.0.2 + PyTorch 2.9.1. All code remains cross-compilable for CUDA. Signed-off-by: Andy Luo --- setup.py | 6 +- src/atlas.cu | 234 +++--- src/atlas.hip | 1223 +++++++++++++++++++++++++++ src/clean_up.cu | 356 ++++---- src/clean_up.hip | 1237 ++++++++++++++++++++++++++++ src/connectivity.cu | 214 ++--- src/connectivity.hip | 1095 ++++++++++++++++++++++++ src/cumesh.h | 4 + src/cumesh.hip | 143 ++++ src/cumesh_hip.h | 509 ++++++++++++ src/dtypes.cuh | 16 +- src/dtypes_hip.cuh | 329 ++++++++ src/ext_hip.cpp | 69 ++ src/geometry.cu | 10 +- src/geometry.hip | 136 +++ src/hash/hash.cu | 4 + src/hash/hash.hip | 451 ++++++++++ src/io.cu | 10 +- src/io.hip | 251 ++++++ src/remesh/simple_dual_contour.cu | 6 +- src/remesh/simple_dual_contour.hip | 228 +++++ src/remesh/svox2vert.cu | 26 +- src/remesh/svox2vert.hip | 236 ++++++ src/shared.h | 64 +- src/shared.hip | 69 ++ src/shared_hip.h | 294 +++++++ src/simplify.cu | 40 +- src/simplify.hip | 588 +++++++++++++ src/utils.h | 26 +- src/utils_hip.h | 110 +++ 30 files changed, 7508 insertions(+), 476 deletions(-) create mode 100644 src/atlas.hip create mode 100644 src/clean_up.hip create mode 100644 src/connectivity.hip create mode 100644 src/cumesh.hip create mode 100644 src/cumesh_hip.h create mode 100644 src/dtypes_hip.cuh create mode 100644 src/ext_hip.cpp create mode 100644 src/geometry.hip create mode 100644 src/hash/hash.hip create mode 100644 src/io.hip create mode 100644 src/remesh/simple_dual_contour.hip create mode 100644 src/remesh/svox2vert.hip create mode 100644 src/shared.hip create mode 100644 src/shared_hip.h create mode 100644 src/simplify.hip create mode 100644 src/utils_hip.h diff --git a/setup.py b/setup.py index 8849b10..d9a2d60 100644 --- a/setup.py +++ b/setup.py @@ -58,7 +58,7 @@ # CUDA / ROCm specific # ------------------------------------------------- if IS_HIP: - archs = os.getenv("GPU_ARCHS", "native").split(";") + archs = os.getenv("GPU_ARCHS", "gfx942").split(";") nvcc_flags += [f"--offload-arch={arch}" for arch in archs] else: # CUDA only @@ -115,14 +115,14 @@ ], extra_compile_args={ "cxx": cxx_flags, - "nvcc": nvcc_flags + [ + "nvcc": nvcc_flags + ([] if IS_HIP else [ # The following definitions must be undefined # since we need half-precision operation. 
"--extended-lambda", "-U__CUDA_NO_HALF_OPERATORS__", "-U__CUDA_NO_HALF_CONVERSIONS__", "-U__CUDA_NO_HALF2_OPERATORS__", - ], + ]), }, ), diff --git a/src/atlas.cu b/src/atlas.cu index 2d78d8c..87dbf0d 100644 --- a/src/atlas.cu +++ b/src/atlas.cu @@ -1,7 +1,11 @@ #include "cumesh.h" #include "dtypes.cuh" #include "shared.h" +#ifdef __HIP_PLATFORM_AMD__ +#include +#else #include +#endif namespace cumesh { @@ -286,8 +290,8 @@ static void get_chart_connectivity( mesh.atlas_chart_adj.resize(M); mesh.atlas_chart_adj_length.resize(M); float *cu_raw_lengths, *cu_sorted_lengths; - CUDA_CHECK(cudaMalloc(&cu_raw_lengths, M * sizeof(float))); - CUDA_CHECK(cudaMalloc(&cu_sorted_lengths, M * sizeof(float))); + CUDA_CHECK(hipMalloc(&cu_raw_lengths, M * sizeof(float))); + CUDA_CHECK(hipMalloc(&cu_sorted_lengths, M * sizeof(float))); init_chart_adj_kernel<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( mesh.vertices.ptr, @@ -298,12 +302,12 @@ static void get_chart_connectivity( mesh.atlas_chart_adj.ptr, cu_raw_lengths ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // 1.2 Sort size_t temp_storage_bytes = 0; mesh.temp_storage.resize(M * sizeof(uint64_t)); - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( nullptr, temp_storage_bytes, mesh.atlas_chart_adj.ptr, reinterpret_cast(mesh.temp_storage.ptr), @@ -312,7 +316,7 @@ static void get_chart_connectivity( M )); mesh.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( mesh.cub_temp_storage.ptr, temp_storage_bytes, mesh.atlas_chart_adj.ptr, reinterpret_cast(mesh.temp_storage.ptr), @@ -320,20 +324,16 @@ static void get_chart_connectivity( cu_sorted_lengths, M )); - CUDA_CHECK(cudaFree(cu_raw_lengths)); + CUDA_CHECK(hipFree(cu_raw_lengths)); - #if CUDART_VERSION >= 12090 - auto reduce_op = ::cuda::std::plus(); - #else - auto reduce_op = cub::Sum(); - #endif + auto reduce_op = hipcub::Sum(); // 1.3 Reduce By Key (Aggregate duplicate chart pairs by summing lengths) int* cu_num_chart_adjs; - CUDA_CHECK(cudaMalloc(&cu_num_chart_adjs, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num_chart_adjs, sizeof(int))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceReduce::ReduceByKey( + CUDA_CHECK(hipcub::DeviceReduce::ReduceByKey( nullptr, temp_storage_bytes, reinterpret_cast(mesh.temp_storage.ptr), mesh.atlas_chart_adj.ptr, @@ -344,7 +344,7 @@ static void get_chart_connectivity( M )); mesh.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceReduce::ReduceByKey( + CUDA_CHECK(hipcub::DeviceReduce::ReduceByKey( mesh.cub_temp_storage.ptr, temp_storage_bytes, reinterpret_cast(mesh.temp_storage.ptr), mesh.atlas_chart_adj.ptr, @@ -354,15 +354,15 @@ static void get_chart_connectivity( reduce_op, M )); - CUDA_CHECK(cudaMemcpy(&mesh.atlas_chart_adj.size, cu_num_chart_adjs, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipMemcpy(&mesh.atlas_chart_adj.size, cu_num_chart_adjs, sizeof(int), hipMemcpyDeviceToHost)); mesh.atlas_chart_adj_length.size = mesh.atlas_chart_adj.size; - CUDA_CHECK(cudaFree(cu_sorted_lengths)); - CUDA_CHECK(cudaFree(cu_num_chart_adjs)); + CUDA_CHECK(hipFree(cu_sorted_lengths)); + CUDA_CHECK(hipFree(cu_num_chart_adjs)); // Remove invalid edge (UINT64_MAX) if present // Since we sorted, invalid edges are at the end. 
uint64_t last_key; if (mesh.atlas_chart_adj.size > 0) { - CUDA_CHECK(cudaMemcpy(&last_key, mesh.atlas_chart_adj.ptr + mesh.atlas_chart_adj.size - 1, sizeof(uint64_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipMemcpy(&last_key, mesh.atlas_chart_adj.ptr + mesh.atlas_chart_adj.size - 1, sizeof(uint64_t), hipMemcpyDeviceToHost)); if (last_key == UINT64_MAX) { mesh.atlas_chart_adj.size -= 1; mesh.atlas_chart_adj_length.size -= 1; @@ -388,18 +388,18 @@ static void get_chart_connectivity( mesh.atlas_chart2edge_cnt.ptr, mesh.atlas_chart_perims.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // 2.2 Prepare CSR format for chart-edge connectivity mesh.atlas_chart2edge_offset.resize(C + 1); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, mesh.atlas_chart2edge_cnt.ptr, mesh.atlas_chart2edge_offset.ptr, C + 1 )); mesh.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( mesh.cub_temp_storage.ptr, temp_storage_bytes, mesh.atlas_chart2edge_cnt.ptr, mesh.atlas_chart2edge_offset.ptr, @@ -415,7 +415,7 @@ static void get_chart_connectivity( mesh.atlas_chart2edge_offset.ptr, mesh.atlas_chart2edge_cnt.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } @@ -489,106 +489,106 @@ void compute_chart_normal_cones( int* sorted_chart_ids; int* faces_ids; int* argsorted_faces_ids; - CUDA_CHECK(cudaMalloc(&sorted_chart_ids, F * sizeof(int))); - CUDA_CHECK(cudaMalloc(&faces_ids, F * sizeof(int))); - CUDA_CHECK(cudaMalloc(&argsorted_faces_ids, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&sorted_chart_ids, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&faces_ids, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&argsorted_faces_ids, F * sizeof(int))); arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( faces_ids, F ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); size_t temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( nullptr, temp_storage_bytes, mesh.atlas_chart_ids.ptr, sorted_chart_ids, faces_ids, argsorted_faces_ids, F )); mesh.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( mesh.cub_temp_storage.ptr, temp_storage_bytes, mesh.atlas_chart_ids.ptr, sorted_chart_ids, faces_ids, argsorted_faces_ids, F )); - CUDA_CHECK(cudaFree(faces_ids)); + CUDA_CHECK(hipFree(faces_ids)); // 2. 
Get CSR format for chart-face assignment int* cu_chart_size; int* cu_num_charts; int* cu_unique_chart_ids; - CUDA_CHECK(cudaMalloc(&cu_chart_size, (C + 1) * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_num_charts, sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_unique_chart_ids, (C + 1) * sizeof(int))); - CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( + CUDA_CHECK(hipMalloc(&cu_chart_size, (C + 1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num_charts, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_unique_chart_ids, (C + 1) * sizeof(int))); + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( nullptr, temp_storage_bytes, sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_charts, F )); mesh.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( mesh.cub_temp_storage.ptr, temp_storage_bytes, sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_charts, F )); - CUDA_CHECK(cudaFree(cu_num_charts)); - CUDA_CHECK(cudaFree(cu_unique_chart_ids)); + CUDA_CHECK(hipFree(cu_num_charts)); + CUDA_CHECK(hipFree(cu_unique_chart_ids)); int* cu_chart_offsets; - CUDA_CHECK(cudaMalloc(&cu_chart_offsets, (C + 1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_chart_offsets, (C + 1) * sizeof(int))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, cu_chart_size, cu_chart_offsets, C + 1 )); mesh.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( mesh.cub_temp_storage.ptr, temp_storage_bytes, cu_chart_size, cu_chart_offsets, C + 1 )); - CUDA_CHECK(cudaFree(cu_chart_size)); + CUDA_CHECK(hipFree(cu_chart_size)); // 3. 
Compute chart normals and areas float* cu_sorted_face_areas; - CUDA_CHECK(cudaMalloc(&cu_sorted_face_areas, F * sizeof(float))); + CUDA_CHECK(hipMalloc(&cu_sorted_face_areas, F * sizeof(float))); index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( mesh.face_areas.ptr, argsorted_faces_ids, F, cu_sorted_face_areas ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); mesh.atlas_chart_areas.resize(C); - CUDA_CHECK(cub::DeviceSegmentedReduce::Sum( + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( nullptr, temp_storage_bytes, cu_sorted_face_areas, mesh.atlas_chart_areas.ptr, C, cu_chart_offsets, cu_chart_offsets + 1 )); mesh.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSegmentedReduce::Sum( + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( mesh.cub_temp_storage.ptr, temp_storage_bytes, cu_sorted_face_areas, mesh.atlas_chart_areas.ptr, C, cu_chart_offsets, cu_chart_offsets + 1 )); - CUDA_CHECK(cudaFree(cu_sorted_face_areas)); + CUDA_CHECK(hipFree(cu_sorted_face_areas)); float3* cu_sorted_face_normals; - CUDA_CHECK(cudaMalloc(&cu_sorted_face_normals, F * sizeof(float3))); + CUDA_CHECK(hipMalloc(&cu_sorted_face_normals, F * sizeof(float3))); index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( mesh.face_normals.ptr, argsorted_faces_ids, F, cu_sorted_face_normals ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(argsorted_faces_ids)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(argsorted_faces_ids)); float3* cu_chart_normals; - CUDA_CHECK(cudaMalloc(&cu_chart_normals, C * sizeof(float3))); - CUDA_CHECK(cub::DeviceSegmentedReduce::Reduce( + CUDA_CHECK(hipMalloc(&cu_chart_normals, C * sizeof(float3))); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Reduce( nullptr, temp_storage_bytes, cu_sorted_face_normals, cu_chart_normals, C, @@ -597,7 +597,7 @@ void compute_chart_normal_cones( make_float3(0.0f, 0.0f, 0.0f) )); mesh.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSegmentedReduce::Reduce( + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Reduce( mesh.cub_temp_storage.ptr, temp_storage_bytes, cu_sorted_face_normals, cu_chart_normals, C, @@ -609,11 +609,11 @@ void compute_chart_normal_cones( cu_chart_normals, C ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // 4. Compute normal difference float* cu_normal_diff; - CUDA_CHECK(cudaMalloc(&cu_normal_diff, F * sizeof(float))); + CUDA_CHECK(hipMalloc(&cu_normal_diff, F * sizeof(float))); normal_diff_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( cu_chart_normals, cu_sorted_face_normals, @@ -621,29 +621,29 @@ void compute_chart_normal_cones( F, cu_normal_diff ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_sorted_face_normals)); - CUDA_CHECK(cudaFree(sorted_chart_ids)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_sorted_face_normals)); + CUDA_CHECK(hipFree(sorted_chart_ids)); // 5. 
Compute new cone half angles float* cu_new_cone_half_angles; - CUDA_CHECK(cudaMalloc(&cu_new_cone_half_angles, C * sizeof(float))); + CUDA_CHECK(hipMalloc(&cu_new_cone_half_angles, C * sizeof(float))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceSegmentedReduce::Max( + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Max( nullptr, temp_storage_bytes, cu_normal_diff, cu_new_cone_half_angles, C, cu_chart_offsets, cu_chart_offsets + 1 )); mesh.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSegmentedReduce::Max( + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Max( mesh.cub_temp_storage.ptr, temp_storage_bytes, cu_normal_diff, cu_new_cone_half_angles, C, cu_chart_offsets, cu_chart_offsets + 1 )); - CUDA_CHECK(cudaFree(cu_chart_offsets)); - CUDA_CHECK(cudaFree(cu_normal_diff)); + CUDA_CHECK(hipFree(cu_chart_offsets)); + CUDA_CHECK(hipFree(cu_normal_diff)); // 6. Update chart normal cones mesh.atlas_chart_normal_cones.resize(C); @@ -653,9 +653,9 @@ void compute_chart_normal_cones( cu_new_cone_half_angles, C ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_chart_normals)); - CUDA_CHECK(cudaFree(cu_new_cone_half_angles)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_chart_normals)); + CUDA_CHECK(hipFree(cu_new_cone_half_angles)); } @@ -832,13 +832,13 @@ static void reassign_chart_ids( reinterpret_cast<int*>(mesh.temp_storage.ptr), F ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); int* cu_end_flag; int h_end_flag; - CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_end_flag, sizeof(int))); do { h_end_flag = 1; - CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(hipMemcpy(cu_end_flag, &h_end_flag, sizeof(int), hipMemcpyHostToDevice)); // Hook hook_edges_if_same_chart_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( @@ -848,17 +848,17 @@ static void reassign_chart_ids( reinterpret_cast<int*>(mesh.temp_storage.ptr), cu_end_flag ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Compress compress_components_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( reinterpret_cast<int*>(mesh.temp_storage.ptr), F ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipMemcpy(&h_end_flag, cu_end_flag, sizeof(int), hipMemcpyDeviceToHost)); } while (h_end_flag == 0); - CUDA_CHECK(cudaFree(cu_end_flag)); + CUDA_CHECK(hipFree(cu_end_flag)); swap_buffers(mesh.atlas_chart_ids, mesh.temp_storage); mesh.atlas_num_charts = compress_ids(mesh.atlas_chart_ids.ptr, F, mesh.cub_temp_storage); @@ -940,67 +940,67 @@ void construct_chart_mesh( int* cu_sorted_chart_ids; int* cu_face_idx; int* cu_sorted_face_idx; - CUDA_CHECK(cudaMalloc(&cu_sorted_chart_ids, F * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_face_idx, F * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_sorted_face_idx, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_sorted_chart_ids, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_face_idx, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_sorted_face_idx, F * sizeof(int))); arange_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( cu_face_idx, F ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); size_t temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( nullptr, temp_storage_bytes, mesh.atlas_chart_ids.ptr, cu_sorted_chart_ids, cu_face_idx, cu_sorted_face_idx, F ));
mesh.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( mesh.cub_temp_storage.ptr, temp_storage_bytes, mesh.atlas_chart_ids.ptr, cu_sorted_chart_ids, cu_face_idx, cu_sorted_face_idx, F )); - CUDA_CHECK(cudaFree(cu_face_idx)); + CUDA_CHECK(hipFree(cu_face_idx)); // 2. RLE for chart size int* cu_chart_size; int* cu_num_chart; int* cu_unique_chart_ids; - CUDA_CHECK(cudaMalloc(&cu_chart_size, (mesh.atlas_num_charts + 1) * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_num_chart, sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_unique_chart_ids, mesh.atlas_num_charts * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_chart_size, (mesh.atlas_num_charts + 1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num_chart, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_unique_chart_ids, mesh.atlas_num_charts * sizeof(int))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( nullptr, temp_storage_bytes, cu_sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_chart, F )); mesh.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( mesh.cub_temp_storage.ptr, temp_storage_bytes, cu_sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_chart, F )); - CUDA_CHECK(cudaFree(cu_unique_chart_ids)); - CUDA_CHECK(cudaFree(cu_num_chart)); + CUDA_CHECK(hipFree(cu_unique_chart_ids)); + CUDA_CHECK(hipFree(cu_num_chart)); // 3. Exclusive scan for chart face offset temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, cu_chart_size, mesh.atlas_chart_faces_offset.ptr, mesh.atlas_num_charts + 1 )); mesh.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( mesh.cub_temp_storage.ptr, temp_storage_bytes, cu_chart_size, mesh.atlas_chart_faces_offset.ptr, mesh.atlas_num_charts + 1 )); - CUDA_CHECK(cudaFree(cu_chart_size)); + CUDA_CHECK(hipFree(cu_chart_size)); // 4. Expand chart ids and vertex ids uint64_t* cu_pack; - CUDA_CHECK(cudaMalloc(&cu_pack, 3 * F * sizeof(uint64_t))); + CUDA_CHECK(hipMalloc(&cu_pack, 3 * F * sizeof(uint64_t))); expand_chart_ids_and_vertex_ids_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( cu_sorted_chart_ids, cu_sorted_face_idx, @@ -1008,12 +1008,12 @@ void construct_chart_mesh( F, cu_pack ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_sorted_chart_ids)); - CUDA_CHECK(cudaFree(cu_sorted_face_idx)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_sorted_chart_ids)); + CUDA_CHECK(hipFree(cu_sorted_face_idx)); // 5. 
Compress pair to construct all maps uint64_t* cu_inverse_pack; - CUDA_CHECK(cudaMalloc(&cu_inverse_pack, 3 * F * sizeof(uint64_t))); + CUDA_CHECK(hipMalloc(&cu_inverse_pack, 3 * F * sizeof(uint64_t))); int new_num_vertices = compress_ids( cu_pack, 3 * F, @@ -1028,15 +1028,15 @@ void construct_chart_mesh( mesh.atlas_chart_vertex_map.ptr, mesh.atlas_chart_vertex_offset.ptr ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_inverse_pack)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_inverse_pack)); unpack_faces_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( cu_pack, F, mesh.atlas_chart_faces.ptr ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_pack)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_pack)); } @@ -1066,15 +1066,15 @@ void CuMesh::compute_charts( this->atlas_chart_ids.ptr, F ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Main Iteration: Collapse and Refine int* cu_end_flag; int h_end_flag; - CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_end_flag, sizeof(int))); for (int i = 0; i < global_iterations; i++) { while (true) { h_end_flag = 1; - CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(hipMemcpy(cu_end_flag, &h_end_flag, sizeof(int), hipMemcpyHostToDevice)); // 1. Compute chart connectivity get_chart_connectivity(*this); @@ -1097,7 +1097,7 @@ void CuMesh::compute_charts( E, this->edge_collapse_costs.ptr ); - CUDA_CHECK(cudaGetLastError());CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(hipGetLastError());CUDA_CHECK(hipDeviceSynchronize()); // 4. Propagate costs size_t C = this->atlas_num_charts; @@ -1109,7 +1109,7 @@ void CuMesh::compute_charts( C, this->propagated_costs.ptr ); - CUDA_CHECK(cudaGetLastError());CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(hipGetLastError());CUDA_CHECK(hipDeviceSynchronize()); // 5. Collapse edges this->vertices_map.resize(C); // store collapse map @@ -1127,10 +1127,10 @@ void CuMesh::compute_charts( this->atlas_chart_normal_cones.ptr, cu_end_flag ); - CUDA_CHECK(cudaGetLastError());CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(hipGetLastError());CUDA_CHECK(hipDeviceSynchronize()); // End of iteration - CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipMemcpy(&h_end_flag, cu_end_flag, sizeof(int), hipMemcpyDeviceToHost)); if (h_end_flag == 1) break; // 6. 
Compress chart ids @@ -1142,7 +1142,7 @@ void CuMesh::compute_charts( F, reinterpret_cast<int*>(this->temp_storage.ptr) ); - CUDA_CHECK(cudaGetLastError());CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(hipGetLastError());CUDA_CHECK(hipDeviceSynchronize()); swap_buffers(this->atlas_chart_ids, this->temp_storage); } @@ -1163,7 +1163,7 @@ void CuMesh::compute_charts( this->atlas_chart_ids.ptr, reinterpret_cast<int*>(this->temp_storage.ptr) ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); swap_buffers(this->atlas_chart_ids, this->temp_storage); this->atlas_num_charts = compress_ids(this->atlas_chart_ids.ptr, F, this->cub_temp_storage); } @@ -1171,7 +1171,7 @@ void CuMesh::compute_charts( // After refinement, the chart may become disconnected, so we need to re-assign chart ids reassign_chart_ids(*this); } - CUDA_CHECK(cudaFree(cu_end_flag)); + CUDA_CHECK(hipFree(cu_end_flag)); // Finalizing: calculate vmap, chart face and chart face offset construct_chart_mesh(*this); @@ -1180,39 +1180,39 @@ std::tuple<int, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> CuMesh::read_atlas_charts() { auto chart_ids = torch::empty({ static_cast<int64_t>(this->faces.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA)); - CUDA_CHECK(cudaMemcpy( + CUDA_CHECK(hipMemcpy( chart_ids.data_ptr<int>(), this->atlas_chart_ids.ptr, this->faces.size * sizeof(int), - cudaMemcpyDeviceToDevice + hipMemcpyDeviceToDevice )); auto vertex_map = torch::empty({ static_cast<int64_t>(this->atlas_chart_vertex_map.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA)); - CUDA_CHECK(cudaMemcpy( + CUDA_CHECK(hipMemcpy( vertex_map.data_ptr<int>(), this->atlas_chart_vertex_map.ptr, this->atlas_chart_vertex_map.size * sizeof(int), - cudaMemcpyDeviceToDevice + hipMemcpyDeviceToDevice )); auto chart_faces = torch::empty({ static_cast<int64_t>(this->atlas_chart_faces.size), 3 }, torch::dtype(torch::kInt32).device(torch::kCUDA)); - CUDA_CHECK(cudaMemcpy( + CUDA_CHECK(hipMemcpy( chart_faces.data_ptr<int>(), this->atlas_chart_faces.ptr, this->atlas_chart_faces.size * 3 * sizeof(int), - cudaMemcpyDeviceToDevice + hipMemcpyDeviceToDevice )); auto chart_vertex_offset = torch::empty({ static_cast<int64_t>(this->atlas_chart_vertex_offset.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA)); - CUDA_CHECK(cudaMemcpy( + CUDA_CHECK(hipMemcpy( chart_vertex_offset.data_ptr<int>(), this->atlas_chart_vertex_offset.ptr, this->atlas_chart_vertex_offset.size * sizeof(int), - cudaMemcpyDeviceToDevice + hipMemcpyDeviceToDevice )); auto chart_face_offset = torch::empty({ static_cast<int64_t>(this->atlas_chart_faces_offset.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA)); - CUDA_CHECK(cudaMemcpy( + CUDA_CHECK(hipMemcpy( chart_face_offset.data_ptr<int>(), this->atlas_chart_faces_offset.ptr, this->atlas_chart_faces_offset.size * sizeof(int), - cudaMemcpyDeviceToDevice + hipMemcpyDeviceToDevice )); return std::make_tuple(this->atlas_num_charts, chart_ids, vertex_map, chart_faces, chart_vertex_offset, chart_face_offset); } diff --git a/src/atlas.hip b/src/atlas.hip new file mode 100644 index 0000000..5aa28d5 --- /dev/null +++ b/src/atlas.hip @@ -0,0 +1,1223 @@ +// !!! This is a file automatically generated by hipify!!! +#include "hip/hip_runtime.h" +#include "cumesh_hip.h" +#include "dtypes_hip.cuh" +#include "shared_hip.h" +#ifdef __HIP_PLATFORM_AMD__ +#include <hipcub/hipcub.hpp> +#else +#include <cub/cub.cuh> +#endif + + +namespace cumesh { + + +/* +Fast mesh parameterization / UV unwrapping using GPU + +Three main steps: +1.
Split the mesh into charts + - Treat each chart as a node in a graph + - Use a parallel edge collapse algorithm to merge charts based on normal cone deviation +2. Parameterize each chart using Least Squares Conformal Maps (LSCM) +3. Pack the charts into a texture atlas +*/ + + +__device__ inline uint64_t pack_key_value_positive(int key, float value) { + unsigned int v = __float_as_uint(value); + return (static_cast<uint64_t>(v) << 32) | + static_cast<uint64_t>(key); +} + + +__device__ inline void unpack_key_value_positive(uint64_t key_value, int& key, float& value) { + key = static_cast<int>(key_value & 0xffffffffu); + value = __uint_as_float(static_cast<unsigned int>(key_value >> 32)); +} + + +// static __global__ void init_normal_cones_kernel( +// const float3* face_normals, +// const int F, +// float4* chart_normal_cones +// ) { +// const int tid = blockIdx.x * blockDim.x + threadIdx.x; +// if (tid >= F) return; +// +// float3 n = face_normals[tid]; +// chart_normal_cones[tid] = make_float4(n.x, n.y, n.z, 0.0f); // half angle = 0 +// } + + +static __global__ void init_chart_adj_kernel( + const float3* vertices, + const int3* faces, + const int2* face_adj, + const int* chart_ids, + const size_t M, + uint64_t* chart_adj, + float* length +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= M) return; + + int f0 = face_adj[tid].x; + int f1 = face_adj[tid].y; + + int c0 = chart_ids[f0]; + int c1 = chart_ids[f1]; + + if (c0 == c1) { + chart_adj[tid] = UINT64_MAX; + length[tid] = 0.0f; + return; + } + + int min_c = min(c0, c1); + int max_c = max(c0, c1); + chart_adj[tid] = (static_cast<uint64_t>(min_c) << 32) | static_cast<uint64_t>(max_c); + + int3 tri0 = faces[f0]; + int3 tri1 = faces[f1]; + + int t0_indices[3] = {tri0.x, tri0.y, tri0.z}; + int common_v_indices[2]; + int found_count = 0; + + #pragma unroll + for (int i = 0; i < 3; ++i) { + int v = t0_indices[i]; + if (v == tri1.x || v == tri1.y || v == tri1.z) { + if (found_count < 2) { + common_v_indices[found_count] = v; + } + found_count++; + } + } + + if (found_count >= 2) { + float3 p0 = vertices[common_v_indices[0]]; + float3 p1 = vertices[common_v_indices[1]]; + + float dx = p0.x - p1.x; + float dy = p0.y - p1.y; + float dz = p0.z - p1.z; + + length[tid] = sqrtf(dx * dx + dy * dy + dz * dz); + } else { + length[tid] = 0.0f; + } +} + + +static __global__ void get_chart_edge_cnt_kernel( + const uint64_t* chart_adj, + const float* chart_adj_length, + const int E, + int* chart2edge_cnt, + float* chart_perim +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= E) return; + + // get edge + uint64_t c = chart_adj[tid]; + float l = chart_adj_length[tid]; + int c0 = int(c >> 32); + int c1 = int(c & 0xFFFFFFFF); + + // count vertex adjacent edge number + atomicAdd(&chart2edge_cnt[c0], 1); + atomicAdd(&chart2edge_cnt[c1], 1); + atomicAdd(&chart_perim[c0], l); + atomicAdd(&chart_perim[c1], l); +} + + +static __global__ void get_chart_edge_adjacency_kernel( + const uint64_t* chart_adj, + const int E, + int* chart2edge, + int* chart2edge_offset, + int* chart2edge_cnt +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= E) return; + + // get edge + uint64_t c = chart_adj[tid]; + int c0 = int(c >> 32); + int c1 = int(c & 0xFFFFFFFF); + + // assign connectivity + chart2edge[chart2edge_offset[c0] + atomicAdd(&chart2edge_cnt[c0], 1)] = tid; + chart2edge[chart2edge_offset[c1] + atomicAdd(&chart2edge_cnt[c1], 1)] = tid; +} + + +static __global__ void compute_chart_adjacency_cost_kernel( + const uint64_t* chart_adj, + const float4*
chart_normal_cones, + const float* chart_adj_length, + const float* chart_perims, + const float* chart_areas, + float area_penalty_weight, + float perimeter_area_ratio_weight, + const int E, + float* chart_adj_costs +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= E) return; + + uint64_t adj = chart_adj[tid]; + int c0 = static_cast<int>(adj >> 32); + int c1 = static_cast<int>(adj & 0xFFFFFFFF); + + float4 cone0 = chart_normal_cones[c0]; + float4 cone1 = chart_normal_cones[c1]; + Vec3f axis0(cone0.x, cone0.y, cone0.z); + Vec3f axis1(cone1.x, cone1.y, cone1.z); + float half_angle0 = cone0.w; + float half_angle1 = cone1.w; + float cos_angle = axis0.dot(axis1); + float axis_angle = acosf(fmaxf(fminf(cos_angle, 1.0f), -1.0f)); + float new_cone_low = fminf(-half_angle0, axis_angle - half_angle1); + float new_cone_high = fmaxf(half_angle0, axis_angle + half_angle1); + float new_half_angle = (new_cone_high - new_cone_low) * 0.5f; + float cost = new_half_angle; + + // Chart area penalty + float new_area = (chart_areas[c0] + chart_areas[c1]); + cost += area_penalty_weight * new_area; + + // Perimeter-area ratio penalty + float new_perim = chart_perims[c0] + chart_perims[c1] - 2 * chart_adj_length[tid]; + cost += perimeter_area_ratio_weight * (new_perim * new_perim / new_area); + + chart_adj_costs[tid] = cost; +} + + +static __global__ void propagate_cost_kernel( + const int* chart2edge, + const int* chart2edge_offset, + const float* edge_collapse_costs, + const int num_charts, + uint64_t* propagated_costs +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= num_charts) return; + + // get edge with minimum cost + int min_eid = -1; + float min_cost = FLT_MAX; + for (int e = chart2edge_offset[tid]; e < chart2edge_offset[tid+1]; e++) { + int eid = chart2edge[e]; + float cost = edge_collapse_costs[eid]; + if (cost < min_cost || (cost == min_cost && eid < min_eid)) { + min_eid = eid; + min_cost = cost; + } + } + + uint64_t cost = pack_key_value_positive(min_eid, min_cost); + propagated_costs[tid] = cost; +} + + +static __global__ void collapse_edges_kernel( + uint64_t* chart_adj, + const float* edge_collapse_costs, + const uint64_t* propagated_costs, + const float collapse_thresh, + const int E, + int* chart_map, + float4* chart_normal_cones, + int* end_flag +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= E) return; + + float cost = edge_collapse_costs[tid]; + if (cost > collapse_thresh) return; + + // get edge + uint64_t c = chart_adj[tid]; + int c0 = int(c >> 32); + int c1 = int(c & 0xFFFFFFFF); + + // check if this edge has the minimum cost among neighboring charts + uint64_t pack = pack_key_value_positive(tid, cost); + if (propagated_costs[c0] < pack || propagated_costs[c1] < pack) return; + + // collapse edge + chart_map[c1] = c0; + + // update cone + float4 cone0 = chart_normal_cones[c0]; + float4 cone1 = chart_normal_cones[c1]; + Vec3f axis0(cone0.x, cone0.y, cone0.z); + Vec3f axis1(cone1.x, cone1.y, cone1.z); + float half_angle0 = cone0.w; + float half_angle1 = cone1.w; + float cos_angle = axis0.dot(axis1); + float axis_angle = acosf(fmaxf(fminf(cos_angle, 1.0f), -1.0f)); + float new_cone_low = fminf(-half_angle0, axis_angle - half_angle1); + float new_cone_high = fmaxf(half_angle0, axis_angle + half_angle1); + float new_half_angle = (new_cone_high - new_cone_low) * 0.5f; + Vec3f new_axis; + if (axis_angle < 1e-3f) { + new_axis = axis0; + } else { + float new_axis_angle = (new_cone_high + new_cone_low) * 0.5f; + new_axis = axis0 *
cosf(new_axis_angle) + (axis1 - axis0 * cos_angle).normalized() * sinf(new_axis_angle); + new_axis.normalize(); + } + chart_normal_cones[c0] = make_float4(new_axis.x, new_axis.y, new_axis.z, new_half_angle); + + // not end of iteration + *end_flag = 0; +} + + +static void get_chart_connectivity( + CuMesh& mesh +) { + size_t M = mesh.manifold_face_adj.size; + + // 1. Get chart adjacency + // 1.1 Initialize chart adjacency and edge lengths + mesh.atlas_chart_adj.resize(M); + mesh.atlas_chart_adj_length.resize(M); + float *cu_raw_lengths, *cu_sorted_lengths; + CUDA_CHECK(hipMalloc(&cu_raw_lengths, M * sizeof(float))); + CUDA_CHECK(hipMalloc(&cu_sorted_lengths, M * sizeof(float))); + + hipLaunchKernelGGL(( init_chart_adj_kernel), dim3((M + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + mesh.vertices.ptr, + mesh.faces.ptr, + mesh.manifold_face_adj.ptr, + mesh.atlas_chart_ids.ptr, + M, + mesh.atlas_chart_adj.ptr, + cu_raw_lengths + ); + CUDA_CHECK(hipGetLastError()); + + // 1.2 Sort + size_t temp_storage_bytes = 0; + mesh.temp_storage.resize(M * sizeof(uint64_t)); + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, + mesh.atlas_chart_adj.ptr, + reinterpret_cast<uint64_t*>(mesh.temp_storage.ptr), + cu_raw_lengths, + cu_sorted_lengths, + M + )); + mesh.cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + mesh.cub_temp_storage.ptr, temp_storage_bytes, + mesh.atlas_chart_adj.ptr, + reinterpret_cast<uint64_t*>(mesh.temp_storage.ptr), + cu_raw_lengths, + cu_sorted_lengths, + M + )); + CUDA_CHECK(hipFree(cu_raw_lengths)); + + auto reduce_op = hipcub::Sum(); + + + // 1.3 Reduce By Key (Aggregate duplicate chart pairs by summing lengths) + int* cu_num_chart_adjs; + CUDA_CHECK(hipMalloc(&cu_num_chart_adjs, sizeof(int))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceReduce::ReduceByKey( + nullptr, temp_storage_bytes, + reinterpret_cast<uint64_t*>(mesh.temp_storage.ptr), + mesh.atlas_chart_adj.ptr, + cu_sorted_lengths, + mesh.atlas_chart_adj_length.ptr, + cu_num_chart_adjs, + reduce_op, + M + )); + mesh.cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceReduce::ReduceByKey( + mesh.cub_temp_storage.ptr, temp_storage_bytes, + reinterpret_cast<uint64_t*>(mesh.temp_storage.ptr), + mesh.atlas_chart_adj.ptr, + cu_sorted_lengths, + mesh.atlas_chart_adj_length.ptr, + cu_num_chart_adjs, + reduce_op, + M + )); + CUDA_CHECK(hipMemcpy(&mesh.atlas_chart_adj.size, cu_num_chart_adjs, sizeof(int), hipMemcpyDeviceToHost)); + mesh.atlas_chart_adj_length.size = mesh.atlas_chart_adj.size; + CUDA_CHECK(hipFree(cu_sorted_lengths)); + CUDA_CHECK(hipFree(cu_num_chart_adjs)); + // Remove invalid edge (UINT64_MAX) if present + // Since we sorted, invalid edges are at the end. + uint64_t last_key; + if (mesh.atlas_chart_adj.size > 0) { + CUDA_CHECK(hipMemcpy(&last_key, mesh.atlas_chart_adj.ptr + mesh.atlas_chart_adj.size - 1, sizeof(uint64_t), hipMemcpyDeviceToHost)); + if (last_key == UINT64_MAX) { + mesh.atlas_chart_adj.size -= 1; + mesh.atlas_chart_adj_length.size -= 1; + } + } + // Early stop if no valid edges + if (mesh.atlas_chart_adj.size == 0) { + return; + } + + // 2.
Get chart-edge connectivity + size_t E = mesh.atlas_chart_adj.size; + size_t C = mesh.atlas_num_charts; + // 2.1 Count edge number for each chart, along with perim + mesh.atlas_chart2edge_cnt.resize(C); + mesh.atlas_chart2edge_cnt.zero(); + mesh.atlas_chart_perims.resize(C); + mesh.atlas_chart_perims.zero(); + hipLaunchKernelGGL(( get_chart_edge_cnt_kernel), dim3((E + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + mesh.atlas_chart_adj.ptr, + mesh.atlas_chart_adj_length.ptr, + E, + mesh.atlas_chart2edge_cnt.ptr, + mesh.atlas_chart_perims.ptr + ); + CUDA_CHECK(hipGetLastError()); + // 2.2 Prepare CSR format for chart-edge connectivity + mesh.atlas_chart2edge_offset.resize(C + 1); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, + mesh.atlas_chart2edge_cnt.ptr, + mesh.atlas_chart2edge_offset.ptr, + C + 1 + )); + mesh.cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + mesh.cub_temp_storage.ptr, temp_storage_bytes, + mesh.atlas_chart2edge_cnt.ptr, + mesh.atlas_chart2edge_offset.ptr, + C + 1 + )); + // 2.3 Fill CSR format for chart-edge connectivity + mesh.atlas_chart2edge.resize(2 * E); // each edge connects two charts + mesh.atlas_chart2edge_cnt.zero(); + hipLaunchKernelGGL(( get_chart_edge_adjacency_kernel), dim3((E + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + mesh.atlas_chart_adj.ptr, + E, + mesh.atlas_chart2edge.ptr, + mesh.atlas_chart2edge_offset.ptr, + mesh.atlas_chart2edge_cnt.ptr + ); + CUDA_CHECK(hipGetLastError()); +} + + +struct Float3Add +{ + __host__ __device__ + float3 operator()(const float3 &a, const float3 &b) const + { + return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); + } +}; + + +static __global__ void normalize_kernel( + float3* chart_normals, + const int num_charts +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= num_charts) return; + + float3 n = chart_normals[tid]; + float norm = sqrtf(n.x * n.x + n.y * n.y + n.z * n.z); + if (norm > 0.0f) { + n.x /= norm; + n.y /= norm; + n.z /= norm; + } + chart_normals[tid] = n; +} + + +static __global__ void normal_diff_kernel( + const float3* chart_normals, + const float3* sorted_face_normals, + const int* sorted_chart_ids, + const size_t F, + float* normal_diff +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= F) return; + + int c = sorted_chart_ids[tid]; + Vec3f n(chart_normals[c]); + Vec3f fn(sorted_face_normals[tid]); + normal_diff[tid] = acosf(fmaxf(fminf(n.dot(fn), 1.0f), -1.0f)); +} + + +static __global__ void update_normal_cones_kernel( + float4* chart_normal_cones, + const float3* chart_normals, + const float* new_cone_half_angles, + const int num_charts +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= num_charts) return; + + float3 n = chart_normals[tid]; + float half_angle = new_cone_half_angles[tid]; + chart_normal_cones[tid] = make_float4(n.x, n.y, n.z, half_angle); +} + + +void compute_chart_normal_cones( + CuMesh& mesh +) { + size_t C = mesh.atlas_num_charts; + size_t F = mesh.faces.size; + + // 1. 
Sort faces by chart id + int* sorted_chart_ids; + int* faces_ids; + int* argsorted_faces_ids; + CUDA_CHECK(hipMalloc(&sorted_chart_ids, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&faces_ids, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&argsorted_faces_ids, F * sizeof(int))); + hipLaunchKernelGGL(( arange_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + faces_ids, + F + ); + CUDA_CHECK(hipGetLastError()); + size_t temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, + mesh.atlas_chart_ids.ptr, sorted_chart_ids, + faces_ids, argsorted_faces_ids, + F + )); + mesh.cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + mesh.cub_temp_storage.ptr, temp_storage_bytes, + mesh.atlas_chart_ids.ptr, sorted_chart_ids, + faces_ids, argsorted_faces_ids, + F + )); + CUDA_CHECK(hipFree(faces_ids)); + + // 2. Get CSR format for chart-face assignment + int* cu_chart_size; + int* cu_num_charts; + int* cu_unique_chart_ids; + CUDA_CHECK(hipMalloc(&cu_chart_size, (C + 1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num_charts, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_unique_chart_ids, (C + 1) * sizeof(int))); + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( + nullptr, temp_storage_bytes, + sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_charts, + F + )); + mesh.cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( + mesh.cub_temp_storage.ptr, temp_storage_bytes, + sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_charts, + F + )); + CUDA_CHECK(hipFree(cu_num_charts)); + CUDA_CHECK(hipFree(cu_unique_chart_ids)); + + int* cu_chart_offsets; + CUDA_CHECK(hipMalloc(&cu_chart_offsets, (C + 1) * sizeof(int))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, + cu_chart_size, cu_chart_offsets, + C + 1 + )); + mesh.cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + mesh.cub_temp_storage.ptr, temp_storage_bytes, + cu_chart_size, cu_chart_offsets, + C + 1 + )); + CUDA_CHECK(hipFree(cu_chart_size)); + + // 3. 
Compute chart normals and areas + float* cu_sorted_face_areas; + CUDA_CHECK(hipMalloc(&cu_sorted_face_areas, F * sizeof(float))); + hipLaunchKernelGGL(( index_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + mesh.face_areas.ptr, + argsorted_faces_ids, + F, + cu_sorted_face_areas + ); + CUDA_CHECK(hipGetLastError()); + mesh.atlas_chart_areas.resize(C); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( + nullptr, temp_storage_bytes, + cu_sorted_face_areas, mesh.atlas_chart_areas.ptr, + C, + cu_chart_offsets, cu_chart_offsets + 1 + )); + mesh.cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( + mesh.cub_temp_storage.ptr, temp_storage_bytes, + cu_sorted_face_areas, mesh.atlas_chart_areas.ptr, + C, + cu_chart_offsets, cu_chart_offsets + 1 + )); + CUDA_CHECK(hipFree(cu_sorted_face_areas)); + + float3* cu_sorted_face_normals; + CUDA_CHECK(hipMalloc(&cu_sorted_face_normals, F * sizeof(float3))); + hipLaunchKernelGGL(( index_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + mesh.face_normals.ptr, + argsorted_faces_ids, + F, + cu_sorted_face_normals + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(argsorted_faces_ids)); + float3* cu_chart_normals; + CUDA_CHECK(hipMalloc(&cu_chart_normals, C * sizeof(float3))); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Reduce( + nullptr, temp_storage_bytes, + cu_sorted_face_normals, cu_chart_normals, + C, + cu_chart_offsets, cu_chart_offsets + 1, + Float3Add(), + make_float3(0.0f, 0.0f, 0.0f) + )); + mesh.cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Reduce( + mesh.cub_temp_storage.ptr, temp_storage_bytes, + cu_sorted_face_normals, cu_chart_normals, + C, + cu_chart_offsets, cu_chart_offsets + 1, + Float3Add(), + make_float3(0.0f, 0.0f, 0.0f) + )); + hipLaunchKernelGGL(( normalize_kernel), dim3((C + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_chart_normals, + C + ); + CUDA_CHECK(hipGetLastError()); + + // 4. Compute normal difference + float* cu_normal_diff; + CUDA_CHECK(hipMalloc(&cu_normal_diff, F * sizeof(float))); + hipLaunchKernelGGL(( normal_diff_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_chart_normals, + cu_sorted_face_normals, + sorted_chart_ids, + F, + cu_normal_diff + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_sorted_face_normals)); + CUDA_CHECK(hipFree(sorted_chart_ids)); + + // 5. Compute new cone half angles + float* cu_new_cone_half_angles; + CUDA_CHECK(hipMalloc(&cu_new_cone_half_angles, C * sizeof(float))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Max( + nullptr, temp_storage_bytes, + cu_normal_diff, cu_new_cone_half_angles, + C, + cu_chart_offsets, cu_chart_offsets + 1 + )); + mesh.cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Max( + mesh.cub_temp_storage.ptr, temp_storage_bytes, + cu_normal_diff, cu_new_cone_half_angles, + C, + cu_chart_offsets, cu_chart_offsets + 1 + )); + CUDA_CHECK(hipFree(cu_chart_offsets)); + CUDA_CHECK(hipFree(cu_normal_diff)); + + // 6. 
Update chart normal cones + mesh.atlas_chart_normal_cones.resize(C); + hipLaunchKernelGGL(( update_normal_cones_kernel), dim3((C + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + mesh.atlas_chart_normal_cones.ptr, + cu_chart_normals, + cu_new_cone_half_angles, + C + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_chart_normals)); + CUDA_CHECK(hipFree(cu_new_cone_half_angles)); +} + + +static __global__ void refine_charts_kernel( + const float4* chart_normal_cones, + const float3* face_normals, + const float3* vertices, + const uint64_t* edges, + const int3* face2edge, + const int* edge2face, + const int* edge2face_offset, + const size_t F, + const float lambda_smooth, + const int* chart_ids, // Read-only (Input) + int* pong_chart_ids // Write-only (Output) +) { + const int fid = blockIdx.x * blockDim.x + threadIdx.x; + if (fid >= F) return; + + // 1. Load current face data + int current_c = chart_ids[fid]; + Vec3f n(face_normals[fid]); + + // local register cache for candidate list (triangle has at most 3 neighbors, plus self, max 4 candidates) + int candidates[4]; + float smooth_scores[4]; + int num_candidates = 0; + + // init: add self to candidate list + candidates[0] = current_c; + smooth_scores[0] = 0.0f; + num_candidates = 1; + + // 2. Iterate over 3 edges to aggregate smooth scores + int eids[3] = { face2edge[fid].x, face2edge[fid].y, face2edge[fid].z }; + + #pragma unroll + for (int i = 0; i < 3; i++) { + int eid = eids[i]; + + // calculate edge length (as smooth weight) + // logic: if I add the neighbor's Chart, I can eliminate this edge as a boundary cost + int v0_idx = int(edges[eid] >> 32); + int v1_idx = int(edges[eid] & 0xFFFFFFFF); + Vec3f v0 = Vec3f(vertices[v0_idx]); + Vec3f v1 = Vec3f(vertices[v1_idx]); + float edge_len = (v1 - v0).norm(); + + int start = edge2face_offset[eid]; + int end = edge2face_offset[eid + 1]; + + // Process edge neighbors + for (int j = start; j < end; j++) { + int neighbor_fid = edge2face[j]; + if (neighbor_fid == fid) continue; + + int neighbor_c = chart_ids[neighbor_fid]; // Read from Input buffer + + int idx = -1; + for (int k = 0; k < num_candidates; ++k) { + if (candidates[k] == neighbor_c) { + idx = k; + break; + } + } + + if (idx == -1 && num_candidates < 4) { + idx = num_candidates++; + candidates[idx] = neighbor_c; + smooth_scores[idx] = 0.0f; + } + + if (idx != -1) { + smooth_scores[idx] += edge_len; + } + } + } + + // 3. Evaluate candidates and pick best + int best_c = current_c; + float best_total_score = -1e9f; + + for (int i = 0; i < num_candidates; ++i) { + int c = candidates[i]; + + // A. Geom score + float4 cone = chart_normal_cones[c]; + Vec3f axis(cone.x, cone.y, cone.z); + float geo_sim = axis.dot(n); // [-1, 1] + + // if invalid cone, skip + if (geo_sim <= 0.0f) continue; + + // B. Smooth score + float smooth_sim = smooth_scores[i] * lambda_smooth; + + float total_score = geo_sim + smooth_sim; + + if (c == current_c) { + if (best_total_score == -1e9f) { + best_total_score = total_score; + best_c = c; + } + } + + // C. 
Compare with best + float diff = total_score - best_total_score; + const float epsilon = 1e-5f; // dampening factor + + if (diff > epsilon) { + // new best is significantly better than current best + best_total_score = total_score; + best_c = c; + } + else if (abs(diff) <= epsilon) { + // scores are very close, break tie by choosing smaller ID + if (c < best_c) { + best_total_score = total_score; + best_c = c; + } + } + } + + // Write back to Output buffer + pong_chart_ids[fid] = best_c; +} + + +__global__ void hook_edges_if_same_chart_kernel( + const int2* adj, + const int* chart_ids, + const int M, + int* conn_comp_ids, + int* end_flag +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= M) return; + + // get adjacent faces + int f0 = adj[tid].x; + int f1 = adj[tid].y; + int c0 = chart_ids[f0]; + int c1 = chart_ids[f1]; + if (c0 != c1) return; + + // union + // find roots + int root0 = conn_comp_ids[f0]; + while (root0 != conn_comp_ids[root0]) { + root0 = conn_comp_ids[root0]; + } + int root1 = conn_comp_ids[f1]; + while (root1 != conn_comp_ids[root1]) { + root1 = conn_comp_ids[root1]; + } + + if (root0 == root1) return; + + int high = max(root0, root1); + int low = min(root0, root1); + atomicMin(&conn_comp_ids[high], low); + *end_flag = 0; +} + + +static void reassign_chart_ids( + CuMesh& mesh +) { + size_t F = mesh.faces.size; + size_t M = mesh.manifold_face_adj.size; + + mesh.temp_storage.resize(F * sizeof(int)); // Use as parent for DSU + hipLaunchKernelGGL(( arange_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + reinterpret_cast<int*>(mesh.temp_storage.ptr), + F + ); + CUDA_CHECK(hipGetLastError()); + + int* cu_end_flag; int h_end_flag; + CUDA_CHECK(hipMalloc(&cu_end_flag, sizeof(int))); + do { + h_end_flag = 1; + CUDA_CHECK(hipMemcpy(cu_end_flag, &h_end_flag, sizeof(int), hipMemcpyHostToDevice)); + + // Hook + hipLaunchKernelGGL(( hook_edges_if_same_chart_kernel), dim3((M+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + mesh.manifold_face_adj.ptr, + mesh.atlas_chart_ids.ptr, + M, + reinterpret_cast<int*>(mesh.temp_storage.ptr), + cu_end_flag + ); + CUDA_CHECK(hipGetLastError()); + + // Compress + hipLaunchKernelGGL(( compress_components_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + reinterpret_cast<int*>(mesh.temp_storage.ptr), + F + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipMemcpy(&h_end_flag, cu_end_flag, sizeof(int), hipMemcpyDeviceToHost)); + } while (h_end_flag == 0); + CUDA_CHECK(hipFree(cu_end_flag)); + + swap_buffers(mesh.atlas_chart_ids, mesh.temp_storage); + mesh.atlas_num_charts = compress_ids(mesh.atlas_chart_ids.ptr, F, mesh.cub_temp_storage); +} + + +static __global__ void expand_chart_ids_and_vertex_ids_kernel( + const int* sorted_chart_ids, + const int* sorted_face_idx, + const int3* faces, + const size_t F, + uint64_t* pack +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= F) return; + + int c = sorted_chart_ids[tid]; + int f = sorted_face_idx[tid]; + int3 face = faces[f]; + int v0 = face.x; + int v1 = face.y; + int v2 = face.z; + + pack[3 * tid + 0] = (uint64_t(c) << 32) | v0; + pack[3 * tid + 1] = (uint64_t(c) << 32) | v1; + pack[3 * tid + 2] = (uint64_t(c) << 32) | v2; +} + + +static __global__ void unpack_faces_kernel( + const uint64_t* pack, + const size_t F, + int3* faces +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= F) return; + int3 face; + face.x = int(pack[3 * tid + 0]); + face.y = int(pack[3 * tid + 1]); + face.z = int(pack[3 * tid
+ 2]); + faces[tid] = face; +} + + +static __global__ void unpack_vertex_ids_kernel( + const uint64_t* pack, + const size_t N, + int* vertex_ids, + int* vertex_offsets +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= N) return; + vertex_ids[tid] = int(pack[tid] & 0xFFFFFFFF); + + int cur_c = int(pack[tid] >> 32); + if (tid == 0) { + vertex_offsets[0] = 0; + } + else { + int prev_c = int(pack[tid - 1] >> 32); + if (cur_c != prev_c) { + vertex_offsets[cur_c] = tid; + } + } + if (tid == N - 1) { + vertex_offsets[cur_c + 1] = N; + } +} + + +void construct_chart_mesh( + CuMesh& mesh +) { + size_t F = mesh.faces.size; + + // 1. Sort faces by chart id + mesh.atlas_chart_faces.resize(F); + mesh.atlas_chart_faces_offset.resize(mesh.atlas_num_charts + 1); + int* cu_sorted_chart_ids; + int* cu_face_idx; + int* cu_sorted_face_idx; + CUDA_CHECK(hipMalloc(&cu_sorted_chart_ids, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_face_idx, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_sorted_face_idx, F * sizeof(int))); + hipLaunchKernelGGL(( arange_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_face_idx, + F + ); + CUDA_CHECK(hipGetLastError()); + size_t temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, + mesh.atlas_chart_ids.ptr, cu_sorted_chart_ids, + cu_face_idx, cu_sorted_face_idx, + F + )); + mesh.cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + mesh.cub_temp_storage.ptr, temp_storage_bytes, + mesh.atlas_chart_ids.ptr, cu_sorted_chart_ids, + cu_face_idx, cu_sorted_face_idx, + F + )); + CUDA_CHECK(hipFree(cu_face_idx)); + // 2. RLE for chart size + int* cu_chart_size; + int* cu_num_chart; + int* cu_unique_chart_ids; + CUDA_CHECK(hipMalloc(&cu_chart_size, (mesh.atlas_num_charts + 1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num_chart, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_unique_chart_ids, mesh.atlas_num_charts * sizeof(int))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( + nullptr, temp_storage_bytes, + cu_sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_chart, + F + )); + mesh.cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( + mesh.cub_temp_storage.ptr, temp_storage_bytes, + cu_sorted_chart_ids, cu_unique_chart_ids, cu_chart_size, cu_num_chart, + F + )); + CUDA_CHECK(hipFree(cu_unique_chart_ids)); + CUDA_CHECK(hipFree(cu_num_chart)); + // 3. Exclusive scan for chart face offset + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, + cu_chart_size, mesh.atlas_chart_faces_offset.ptr, + mesh.atlas_num_charts + 1 + )); + mesh.cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + mesh.cub_temp_storage.ptr, temp_storage_bytes, + cu_chart_size, mesh.atlas_chart_faces_offset.ptr, + mesh.atlas_num_charts + 1 + )); + CUDA_CHECK(hipFree(cu_chart_size)); + // 4. Expand chart ids and vertex ids + uint64_t* cu_pack; + CUDA_CHECK(hipMalloc(&cu_pack, 3 * F * sizeof(uint64_t))); + hipLaunchKernelGGL(( expand_chart_ids_and_vertex_ids_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_sorted_chart_ids, + cu_sorted_face_idx, + mesh.faces.ptr, + F, + cu_pack + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_sorted_chart_ids)); + CUDA_CHECK(hipFree(cu_sorted_face_idx)); + // 5. 
Compress pair to construct all maps + uint64_t* cu_inverse_pack; + CUDA_CHECK(hipMalloc(&cu_inverse_pack, 3 * F * sizeof(uint64_t))); + int new_num_vertices = compress_ids( + cu_pack, + 3 * F, + mesh.cub_temp_storage, + cu_inverse_pack + ); + mesh.atlas_chart_vertex_map.resize(new_num_vertices); + mesh.atlas_chart_vertex_offset.resize(mesh.atlas_num_charts + 1); + hipLaunchKernelGGL(( unpack_vertex_ids_kernel), dim3((new_num_vertices + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_inverse_pack, + new_num_vertices, + mesh.atlas_chart_vertex_map.ptr, + mesh.atlas_chart_vertex_offset.ptr + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_inverse_pack)); + hipLaunchKernelGGL(( unpack_faces_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_pack, + F, + mesh.atlas_chart_faces.ptr + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_pack)); +} + + +void CuMesh::compute_charts( + float threshold_cone_half_angle_rad, + int refine_iterations, + int global_iterations, + float smooth_strength, + float area_penalty_weight, + float perimeter_area_ratio_weight +) { + if (this->manifold_face_adj.is_empty()) { + this->get_manifold_face_adjacency(); + } + if (this->face_normals.is_empty()) { + this->compute_face_normals(); + } + if (this->face_areas.is_empty()) { + this->compute_face_areas(); + } + + // Initialize chart id + size_t F = this->faces.size; + this->atlas_chart_ids.resize(F); + this->atlas_num_charts = F; + hipLaunchKernelGGL(( arange_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->atlas_chart_ids.ptr, + F + ); + CUDA_CHECK(hipGetLastError()); + + // Main Iteration: Collapse and Refine + int* cu_end_flag; int h_end_flag; + CUDA_CHECK(hipMalloc(&cu_end_flag, sizeof(int))); + for (int i = 0; i < global_iterations; i++) { + while (true) { + h_end_flag = 1; + CUDA_CHECK(hipMemcpy(cu_end_flag, &h_end_flag, sizeof(int), hipMemcpyHostToDevice)); + + // 1. Compute chart connectivity + get_chart_connectivity(*this); + if (this->atlas_chart_adj.size == 0) break; + + // 2. Compute normal cones + compute_chart_normal_cones(*this); + + // 3. Compute chart adjacency cost + size_t E = this->atlas_chart_adj.size; + this->edge_collapse_costs.resize(E); + hipLaunchKernelGGL(( compute_chart_adjacency_cost_kernel), dim3((E + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->atlas_chart_adj.ptr, + this->atlas_chart_normal_cones.ptr, + this->atlas_chart_adj_length.ptr, + this->atlas_chart_perims.ptr, + this->atlas_chart_areas.ptr, + area_penalty_weight, + perimeter_area_ratio_weight, + E, + this->edge_collapse_costs.ptr + ); + CUDA_CHECK(hipGetLastError());CUDA_CHECK(hipDeviceSynchronize()); + + // 4. Propagate costs + size_t C = this->atlas_num_charts; + this->propagated_costs.resize(C); + hipLaunchKernelGGL(( propagate_cost_kernel), dim3((C + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->atlas_chart2edge.ptr, + this->atlas_chart2edge_offset.ptr, + this->edge_collapse_costs.ptr, + C, + this->propagated_costs.ptr + ); + CUDA_CHECK(hipGetLastError());CUDA_CHECK(hipDeviceSynchronize()); + + // 5. 
// 5. Collapse edges + this->vertices_map.resize(C); // store collapse map + hipLaunchKernelGGL(( arange_kernel), dim3((C + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->vertices_map.ptr, + C + ); + hipLaunchKernelGGL(( collapse_edges_kernel), dim3((E + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->atlas_chart_adj.ptr, + this->edge_collapse_costs.ptr, + this->propagated_costs.ptr, + threshold_cone_half_angle_rad, + E, + this->vertices_map.ptr, + this->atlas_chart_normal_cones.ptr, + cu_end_flag + ); + CUDA_CHECK(hipGetLastError()); CUDA_CHECK(hipDeviceSynchronize()); + + // End of iteration + CUDA_CHECK(hipMemcpy(&h_end_flag, cu_end_flag, sizeof(int), hipMemcpyDeviceToHost)); + if (h_end_flag == 1) break; + + // 6. Compress chart ids + this->atlas_num_charts = compress_ids(this->vertices_map.ptr, C, this->cub_temp_storage); + this->temp_storage.resize(F * sizeof(int)); + hipLaunchKernelGGL(( index_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->vertices_map.ptr, + this->atlas_chart_ids.ptr, + F, + reinterpret_cast<int*>(this->temp_storage.ptr) + ); + CUDA_CHECK(hipGetLastError()); CUDA_CHECK(hipDeviceSynchronize()); + swap_buffers(this->atlas_chart_ids, this->temp_storage); + } + + // Refine charts + for (int j = 0; j < refine_iterations; j++) { + compute_chart_normal_cones(*this); + this->temp_storage.resize(F * sizeof(int)); + hipLaunchKernelGGL(( refine_charts_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->atlas_chart_normal_cones.ptr, + this->face_normals.ptr, + this->vertices.ptr, + this->edges.ptr, + this->face2edge.ptr, + this->edge2face.ptr, + this->edge2face_offset.ptr, + F, + smooth_strength, + this->atlas_chart_ids.ptr, + reinterpret_cast<int*>(this->temp_storage.ptr) + ); + CUDA_CHECK(hipGetLastError()); + swap_buffers(this->atlas_chart_ids, this->temp_storage); + this->atlas_num_charts = compress_ids(this->atlas_chart_ids.ptr, F, this->cub_temp_storage); + } + + // After refinement, charts may become disconnected, so we need to re-assign chart ids + reassign_chart_ids(*this); + } + CUDA_CHECK(hipFree(cu_end_flag)); + + // Finalizing: calculate vmap, chart face and chart face offset + construct_chart_mesh(*this); +} + + +std::tuple<int, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> CuMesh::read_atlas_charts() { + auto chart_ids = torch::empty({ static_cast<int64_t>(this->faces.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA)); + CUDA_CHECK(hipMemcpy( + chart_ids.data_ptr<int>(), + this->atlas_chart_ids.ptr, + this->faces.size * sizeof(int), + hipMemcpyDeviceToDevice + )); + auto vertex_map = torch::empty({ static_cast<int64_t>(this->atlas_chart_vertex_map.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA)); + CUDA_CHECK(hipMemcpy( + vertex_map.data_ptr<int>(), + this->atlas_chart_vertex_map.ptr, + this->atlas_chart_vertex_map.size * sizeof(int), + hipMemcpyDeviceToDevice + )); + auto chart_faces = torch::empty({ static_cast<int64_t>(this->atlas_chart_faces.size), 3 }, torch::dtype(torch::kInt32).device(torch::kCUDA)); + CUDA_CHECK(hipMemcpy( + chart_faces.data_ptr<int>(), + this->atlas_chart_faces.ptr, + this->atlas_chart_faces.size * 3 * sizeof(int), + hipMemcpyDeviceToDevice + )); + auto chart_vertex_offset = torch::empty({ static_cast<int64_t>(this->atlas_chart_vertex_offset.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA)); + CUDA_CHECK(hipMemcpy( + chart_vertex_offset.data_ptr<int>(), + this->atlas_chart_vertex_offset.ptr, + this->atlas_chart_vertex_offset.size * sizeof(int), + hipMemcpyDeviceToDevice + )); + auto chart_face_offset = torch::empty({ static_cast<int64_t>(this->atlas_chart_faces_offset.size) }, torch::dtype(torch::kInt32).device(torch::kCUDA)); + CUDA_CHECK(hipMemcpy( + chart_face_offset.data_ptr<int>(), + this->atlas_chart_faces_offset.ptr, + this->atlas_chart_faces_offset.size * sizeof(int), + hipMemcpyDeviceToDevice + )); + return std::make_tuple(this->atlas_num_charts, chart_ids, vertex_map, chart_faces, chart_vertex_offset, chart_face_offset); +} + + +} // namespace cumesh \ No newline at end of file
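(Editorial note) Every cub/hipcub device-wide call in this patch uses the two-phase temp-storage idiom: the first call passes a null temp pointer and only reports the required scratch size; the caller then allocates and repeats the same call to do the work. A minimal self-contained sketch of the pattern, assuming hipcub and raw device pointers (this helper is hypothetical, not part of the patch):

    #include <hipcub/hipcub.hpp>

    // Hypothetical helper, for illustration only.
    void sort_ints_two_phase(int* d_keys_in, int* d_keys_out, int num_items) {
        void* d_temp_storage = nullptr;
        size_t temp_storage_bytes = 0;
        // Phase 1: with a null temp pointer nothing executes; the call only
        // writes the scratch size it needs into temp_storage_bytes.
        hipcub::DeviceRadixSort::SortKeys(nullptr, temp_storage_bytes,
                                          d_keys_in, d_keys_out, num_items);
        (void)hipMalloc(&d_temp_storage, temp_storage_bytes);
        // Phase 2: the identical call with real scratch storage does the sort.
        hipcub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
                                          d_keys_in, d_keys_out, num_items);
        (void)hipFree(d_temp_storage);
    }

CuMesh amortizes the allocation by keeping cub_temp_storage alive across calls and only growing it, which is why every call site in the patch appears twice.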
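(Editorial note) Both remove_faces variants are stream compaction via DeviceSelect::Flagged, which copies item i iff flags[i] is non-zero and writes the surviving count to device memory, e.g. (assuming hipcub):

    // keeps faces whose flag != 0; *d_num_kept receives the new face count
    hipcub::DeviceSelect::Flagged(d_temp, temp_bytes, d_faces, d_flags, d_out, d_num_kept, F);

The count then has to be copied back to the host (the hipMemcpy below) before faces.resize() can run, which makes each removal a synchronization point.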
@@ -66,25 +75,25 @@ void CuMesh::_remove_faces(uint8_t* face_mask) { size_t F = this->faces.size; size_t temp_storage_bytes = 0; int *cu_new_num_faces; int3 *cu_new_faces; - CUDA_CHECK(cudaMalloc(&cu_new_num_faces, sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_new_faces, F * sizeof(int3))); - CUDA_CHECK(cub::DeviceSelect::Flagged( + CUDA_CHECK(hipMalloc(&cu_new_num_faces, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_new_faces, F * sizeof(int3))); + CUDA_CHECK(hipcub::DeviceSelect::Flagged( nullptr, temp_storage_bytes, this->faces.ptr, face_mask, cu_new_faces, cu_new_num_faces, F )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSelect::Flagged( + CUDA_CHECK(hipcub::DeviceSelect::Flagged( this->cub_temp_storage.ptr, temp_storage_bytes, this->faces.ptr, face_mask, cu_new_faces, cu_new_num_faces, F )); int new_num_faces; - CUDA_CHECK(cudaMemcpy(&new_num_faces, cu_new_num_faces, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipMemcpy(&new_num_faces, cu_new_num_faces, sizeof(int), hipMemcpyDeviceToHost)); this->faces.resize(new_num_faces); - CUDA_CHECK(cudaMemcpy(this->faces.ptr, cu_new_faces, new_num_faces * sizeof(int3), cudaMemcpyDeviceToDevice)); - CUDA_CHECK(cudaFree(cu_new_num_faces)); - CUDA_CHECK(cudaFree(cu_new_faces)); + CUDA_CHECK(hipMemcpy(this->faces.ptr, cu_new_faces, new_num_faces * sizeof(int3), hipMemcpyDeviceToDevice)); + CUDA_CHECK(hipFree(cu_new_num_faces)); + CUDA_CHECK(hipFree(cu_new_faces)); this->remove_unreferenced_vertices(); } @@ -139,28 +148,28 @@ void CuMesh::remove_unreferenced_vertices() { // Mark referenced vertices int* cu_vertex_is_referenced; - CUDA_CHECK(cudaMalloc(&cu_vertex_is_referenced, (V+1) * sizeof(int))); - CUDA_CHECK(cudaMemset(cu_vertex_is_referenced, 0, (V+1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_vertex_is_referenced, (V+1) * sizeof(int))); + CUDA_CHECK(hipMemset(cu_vertex_is_referenced, 0, (V+1) * sizeof(int))); set_vertex_is_referenced<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->faces.ptr, F, cu_vertex_is_referenced ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Get vertices map size_t temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, cu_vertex_is_referenced, V+1 )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_vertex_is_referenced, V+1 )); int new_num_vertices; - CUDA_CHECK(cudaMemcpy(&new_num_vertices, cu_vertex_is_referenced + V, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipMemcpy(&new_num_vertices, cu_vertex_is_referenced + V, sizeof(int), hipMemcpyDeviceToHost)); // Compress vertices this->temp_storage.resize(new_num_vertices * sizeof(float3)); @@ -170,7 +179,7 @@ void CuMesh::remove_unreferenced_vertices() { V, reinterpret_cast<float3*>(this->temp_storage.ptr) ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); swap_buffers(this->temp_storage, this->vertices); // Update faces @@ -179,8 +188,8 @@ void CuMesh::remove_unreferenced_vertices() { F, this->faces.ptr ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_vertex_is_referenced)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_vertex_is_referenced)); // Delete all cached info since mesh has changed this->clear_cache(); @@ -229,10 +238,17 @@ static __global__ void select_first_in_each_group_kernel( struct int3_decomposer { +#ifdef __HIP_PLATFORM_AMD__ + __host__ __device__ ::rocprim::tuple<int&, int&, int&> operator()(int3& key) const + { + return ::rocprim::tie(key.x, key.y, key.z); + } +#else __host__ __device__ ::cuda::std::tuple<int&, int&, int&> operator()(int3& key) const { return {key.x, key.y, key.z}; } +#endif }; @@ -242,29 +258,29 @@ void CuMesh::remove_duplicate_faces() { // Create a temporary sorted copy of faces for duplicate detection // Do NOT modify the original faces to preserve vertex order and normals int3 *cu_sorted_faces; - CUDA_CHECK(cudaMalloc(&cu_sorted_faces, F * sizeof(int3))); - CUDA_CHECK(cudaMemcpy(cu_sorted_faces, this->faces.ptr, F * sizeof(int3), cudaMemcpyDeviceToDevice)); + CUDA_CHECK(hipMalloc(&cu_sorted_faces, F * sizeof(int3))); + CUDA_CHECK(hipMemcpy(cu_sorted_faces, this->faces.ptr, F * sizeof(int3), hipMemcpyDeviceToDevice)); // Sort vertices within each face (in the temporary copy) sort_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_sorted_faces, F ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Sort all faces globally by their sorted vertex indices size_t temp_storage_bytes = 0; int *cu_sorted_face_indices; - CUDA_CHECK(cudaMalloc(&cu_sorted_face_indices, F * sizeof(int))); +
CUDA_CHECK(hipMalloc(&cu_sorted_face_indices, F * sizeof(int))); arange_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_sorted_face_indices, F); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); int *cu_sorted_indices_output; int3 *cu_sorted_faces_output; - CUDA_CHECK(cudaMalloc(&cu_sorted_indices_output, F * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_sorted_faces_output, F * sizeof(int3))); + CUDA_CHECK(hipMalloc(&cu_sorted_indices_output, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_sorted_faces_output, F * sizeof(int3))); - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( nullptr, temp_storage_bytes, cu_sorted_faces, cu_sorted_faces_output, cu_sorted_face_indices, cu_sorted_indices_output, @@ -272,45 +288,45 @@ void CuMesh::remove_duplicate_faces() { int3_decomposer{} )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( this->cub_temp_storage.ptr, temp_storage_bytes, cu_sorted_faces, cu_sorted_faces_output, cu_sorted_face_indices, cu_sorted_indices_output, F, int3_decomposer{} )); - CUDA_CHECK(cudaFree(cu_sorted_faces)); - CUDA_CHECK(cudaFree(cu_sorted_face_indices)); + CUDA_CHECK(hipFree(cu_sorted_faces)); + CUDA_CHECK(hipFree(cu_sorted_face_indices)); // Select first in each group of duplicate faces (based on sorted faces) uint8_t* cu_face_mask_sorted; - CUDA_CHECK(cudaMalloc(&cu_face_mask_sorted, F * sizeof(uint8_t))); + CUDA_CHECK(hipMalloc(&cu_face_mask_sorted, F * sizeof(uint8_t))); select_first_in_each_group_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_sorted_faces_output, F, cu_face_mask_sorted ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_sorted_faces_output)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_sorted_faces_output)); // Map the mask back to original face order using scatter // scatter: output[indices[i]] = values[i] // This maps: cu_face_mask_original[original_idx] = cu_face_mask_sorted[sorted_position] uint8_t* cu_face_mask_original; - CUDA_CHECK(cudaMalloc(&cu_face_mask_original, F * sizeof(uint8_t))); + CUDA_CHECK(hipMalloc(&cu_face_mask_original, F * sizeof(uint8_t))); scatter_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_sorted_indices_output, // indices: sorted_position -> original_idx cu_face_mask_sorted, // values: mask at sorted_position F, cu_face_mask_original // output: mask at original position ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_face_mask_sorted)); - CUDA_CHECK(cudaFree(cu_sorted_indices_output)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_face_mask_sorted)); + CUDA_CHECK(hipFree(cu_sorted_indices_output)); // Select faces to keep (preserving original vertex order) this->_remove_faces(cu_face_mask_original); - CUDA_CHECK(cudaFree(cu_face_mask_original)); + CUDA_CHECK(hipFree(cu_face_mask_original)); } @@ -355,7 +371,7 @@ void CuMesh::remove_degenerate_faces(float abs_thresh, float rel_thresh) { size_t F = this->faces.size; uint8_t* cu_face_mask; - CUDA_CHECK(cudaMalloc(&cu_face_mask, F * sizeof(uint8_t))); + CUDA_CHECK(hipMalloc(&cu_face_mask, F * sizeof(uint8_t))); mark_degenerate_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->vertices.ptr, this->faces.ptr, @@ -363,10 +379,10 @@ void CuMesh::remove_degenerate_faces(float abs_thresh, float rel_thresh) { F, cu_face_mask ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); this->_remove_faces(cu_face_mask); - 
CUDA_CHECK(cudaFree(cu_face_mask)); + CUDA_CHECK(hipFree(cu_face_mask)); } @@ -450,7 +466,7 @@ void CuMesh::fill_holes(float max_hole_perimeter) { // Compute loop boundary lengths float* cu_loop_boundary_lengths; - CUDA_CHECK(cudaMalloc(&cu_loop_boundary_lengths, E * sizeof(float))); + CUDA_CHECK(hipMalloc(&cu_loop_boundary_lengths, E * sizeof(float))); compute_loop_boundary_lengths<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->vertices.ptr, this->edges.ptr, @@ -458,13 +474,13 @@ void CuMesh::fill_holes(float max_hole_perimeter) { E, cu_loop_boundary_lengths ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Segment sum size_t temp_storage_bytes = 0; float *cu_bound_loop_perimeters; - CUDA_CHECK(cudaMalloc(&cu_bound_loop_perimeters, L * sizeof(float))); - CUDA_CHECK(cub::DeviceSegmentedReduce::Sum( + CUDA_CHECK(hipMalloc(&cu_bound_loop_perimeters, L * sizeof(float))); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( nullptr, temp_storage_bytes, cu_loop_boundary_lengths, cu_bound_loop_perimeters, L, @@ -472,18 +488,18 @@ void CuMesh::fill_holes(float max_hole_perimeter) { this->loop_boundaries_offset.ptr + 1 )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSegmentedReduce::Sum( + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_loop_boundary_lengths, cu_bound_loop_perimeters, L, this->loop_boundaries_offset.ptr, this->loop_boundaries_offset.ptr + 1 )); - CUDA_CHECK(cudaFree(cu_loop_boundary_lengths)); + CUDA_CHECK(hipFree(cu_loop_boundary_lengths)); // Mask small loops uint8_t* cu_bound_loop_mask; - CUDA_CHECK(cudaMalloc(&cu_bound_loop_mask, L * sizeof(uint8_t))); + CUDA_CHECK(hipMalloc(&cu_bound_loop_mask, L * sizeof(uint8_t))); compare_kernel<<<(L+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_bound_loop_perimeters, max_hole_perimeter, @@ -491,62 +507,62 @@ void CuMesh::fill_holes(float max_hole_perimeter) { LessThanOp(), cu_bound_loop_mask ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_bound_loop_perimeters)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_bound_loop_perimeters)); // Compress bound loops size int* cu_bound_loops_cnt; - CUDA_CHECK(cudaMalloc(&cu_bound_loops_cnt, L * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_bound_loops_cnt, L * sizeof(int))); diff_kernel<<<(L+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->loop_boundaries_offset.ptr, L, cu_bound_loops_cnt ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); int *cu_new_loop_boundaries_cnt, *cu_new_num_bound_loops; - CUDA_CHECK(cudaMalloc(&cu_new_loop_boundaries_cnt, (L+1) * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_new_num_bound_loops, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_new_loop_boundaries_cnt, (L+1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_new_num_bound_loops, sizeof(int))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceSelect::Flagged( + CUDA_CHECK(hipcub::DeviceSelect::Flagged( nullptr, temp_storage_bytes, cu_bound_loops_cnt, cu_bound_loop_mask, cu_new_loop_boundaries_cnt, cu_new_num_bound_loops, L )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSelect::Flagged( + CUDA_CHECK(hipcub::DeviceSelect::Flagged( this->cub_temp_storage.ptr, temp_storage_bytes, cu_bound_loops_cnt, cu_bound_loop_mask, cu_new_loop_boundaries_cnt, cu_new_num_bound_loops, L )); int new_num_bound_loops; - CUDA_CHECK(cudaMemcpy(&new_num_bound_loops, cu_new_num_bound_loops, sizeof(int), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaFree(cu_bound_loops_cnt)); - 
CUDA_CHECK(cudaFree(cu_new_num_bound_loops)); + CUDA_CHECK(hipMemcpy(&new_num_bound_loops, cu_new_num_bound_loops, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_bound_loops_cnt)); + CUDA_CHECK(hipFree(cu_new_num_bound_loops)); if (new_num_bound_loops == 0) { - CUDA_CHECK(cudaFree(cu_new_loop_boundaries_cnt)); - CUDA_CHECK(cudaFree(cu_bound_loop_mask)); + CUDA_CHECK(hipFree(cu_new_loop_boundaries_cnt)); + CUDA_CHECK(hipFree(cu_bound_loop_mask)); return; } // Get loop ids of loop boundaries int* cu_loop_bound_loop_ids; - CUDA_CHECK(cudaMalloc(&cu_loop_bound_loop_ids, E * sizeof(int))); - CUDA_CHECK(cudaMemset(cu_loop_bound_loop_ids, 0, E * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_loop_bound_loop_ids, E * sizeof(int))); + CUDA_CHECK(hipMemset(cu_loop_bound_loop_ids, 0, E * sizeof(int))); if (L > 1) { set_flag_kernel<<<(L-1+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->loop_boundaries_offset.ptr + 1, L - 1, cu_loop_bound_loop_ids ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::InclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::InclusiveSum( nullptr, temp_storage_bytes, cu_loop_bound_loop_ids, E )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::InclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::InclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_loop_bound_loop_ids, E @@ -554,71 +570,71 @@ void CuMesh::fill_holes(float max_hole_perimeter) { // Mask loop boundaries uint8_t* cu_loop_boundary_mask; - CUDA_CHECK(cudaMalloc(&cu_loop_boundary_mask, E * sizeof(uint8_t))); + CUDA_CHECK(hipMalloc(&cu_loop_boundary_mask, E * sizeof(uint8_t))); index_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_bound_loop_mask, cu_loop_bound_loop_ids, E, cu_loop_boundary_mask ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_bound_loop_mask)); - CUDA_CHECK(cudaFree(cu_loop_bound_loop_ids)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_bound_loop_mask)); + CUDA_CHECK(hipFree(cu_loop_bound_loop_ids)); // Compress loop boundaries int *cu_new_loop_boundaries, *cu_new_num_loop_boundaries; - CUDA_CHECK(cudaMalloc(&cu_new_loop_boundaries, E * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_new_num_loop_boundaries, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_new_loop_boundaries, E * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_new_num_loop_boundaries, sizeof(int))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceSelect::Flagged( + CUDA_CHECK(hipcub::DeviceSelect::Flagged( nullptr, temp_storage_bytes, this->loop_boundaries.ptr, cu_loop_boundary_mask, cu_new_loop_boundaries, cu_new_num_loop_boundaries, E )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSelect::Flagged( + CUDA_CHECK(hipcub::DeviceSelect::Flagged( this->cub_temp_storage.ptr, temp_storage_bytes, this->loop_boundaries.ptr, cu_loop_boundary_mask, cu_new_loop_boundaries, cu_new_num_loop_boundaries, E )); int new_num_loop_boundaries; - CUDA_CHECK(cudaMemcpy(&new_num_loop_boundaries, cu_new_num_loop_boundaries, sizeof(int), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaFree(cu_new_num_loop_boundaries)); - CUDA_CHECK(cudaFree(cu_loop_boundary_mask)); + CUDA_CHECK(hipMemcpy(&new_num_loop_boundaries, cu_new_num_loop_boundaries, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_new_num_loop_boundaries)); + CUDA_CHECK(hipFree(cu_loop_boundary_mask)); // Reconstruct new bound loops int* cu_new_loop_boundaries_offset; - CUDA_CHECK(cudaMalloc(&cu_new_loop_boundaries_offset, 
(new_num_loop_boundaries+1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_new_loop_boundaries_offset, (new_num_loop_boundaries+1) * sizeof(int))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, cu_new_loop_boundaries_cnt, cu_new_loop_boundaries_offset, new_num_bound_loops + 1 )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_new_loop_boundaries_cnt, cu_new_loop_boundaries_offset, new_num_bound_loops + 1 )); int* cu_new_loop_bound_loop_ids; - CUDA_CHECK(cudaMalloc(&cu_new_loop_bound_loop_ids, new_num_loop_boundaries * sizeof(int))); - CUDA_CHECK(cudaMemset(cu_new_loop_bound_loop_ids, 0, new_num_loop_boundaries * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_new_loop_bound_loop_ids, new_num_loop_boundaries * sizeof(int))); + CUDA_CHECK(hipMemset(cu_new_loop_bound_loop_ids, 0, new_num_loop_boundaries * sizeof(int))); if (new_num_bound_loops > 1) { set_flag_kernel<<<(new_num_bound_loops-1+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_new_loop_boundaries_offset+1, new_num_bound_loops-1, cu_new_loop_bound_loop_ids ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::InclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::InclusiveSum( nullptr, temp_storage_bytes, cu_new_loop_bound_loop_ids, new_num_loop_boundaries )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::InclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::InclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_new_loop_bound_loop_ids, new_num_loop_boundaries @@ -626,7 +642,7 @@ void CuMesh::fill_holes(float max_hole_perimeter) { // Calculate new vertex positions as average of loop vertices Vec3f* cu_new_loop_bound_centers; - CUDA_CHECK(cudaMalloc(&cu_new_loop_bound_centers, new_num_loop_boundaries * sizeof(Vec3f))); + CUDA_CHECK(hipMalloc(&cu_new_loop_bound_centers, new_num_loop_boundaries * sizeof(Vec3f))); compute_loop_boundary_midpoints<<<(new_num_loop_boundaries+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->vertices.ptr, this->edges.ptr, @@ -634,11 +650,11 @@ void CuMesh::fill_holes(float max_hole_perimeter) { new_num_loop_boundaries, cu_new_loop_bound_centers ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); Vec3f* cu_new_vertices; - CUDA_CHECK(cudaMalloc(&cu_new_vertices, new_num_bound_loops * sizeof(Vec3f))); + CUDA_CHECK(hipMalloc(&cu_new_vertices, new_num_bound_loops * sizeof(Vec3f))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceSegmentedReduce::Sum( + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( nullptr, temp_storage_bytes, cu_new_loop_bound_centers, cu_new_vertices, new_num_bound_loops, @@ -646,22 +662,22 @@ void CuMesh::fill_holes(float max_hole_perimeter) { cu_new_loop_boundaries_offset + 1 )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSegmentedReduce::Sum( + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_new_loop_bound_centers, cu_new_vertices, new_num_bound_loops, cu_new_loop_boundaries_offset, cu_new_loop_boundaries_offset + 1 )); - CUDA_CHECK(cudaFree(cu_new_loop_bound_centers)); - CUDA_CHECK(cudaFree(cu_new_loop_boundaries_offset)); + CUDA_CHECK(hipFree(cu_new_loop_bound_centers)); + CUDA_CHECK(hipFree(cu_new_loop_boundaries_offset)); 
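// (Editorial note, not part of the patch) The new hole-filling vertex is the
// mean of the hole's edge midpoints: the segmented reduce above computes
//   center[k] = sum over edges e in loop k of midpoint(e)
// and inplace_div_kernel below divides by cu_new_loop_boundaries_cnt[k], the
// number of boundary edges in kept loop k; e.g. counts [4, 3] turn the summed
// centers into true per-loop averages center[k] / counts[k].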
inplace_div_kernel<<<(new_num_bound_loops+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_new_vertices, cu_new_loop_boundaries_cnt, new_num_bound_loops ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_new_loop_boundaries_cnt)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_new_loop_boundaries_cnt)); // Update mesh this->vertices.extend(new_num_bound_loops); @@ -671,8 +687,8 @@ void CuMesh::fill_holes(float max_hole_perimeter) { new_num_bound_loops, this->vertices.ptr + V ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_new_vertices)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_new_vertices)); connect_new_vertices_kernel<<<(new_num_loop_boundaries+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->edges.ptr, cu_new_loop_boundaries, @@ -681,9 +697,9 @@ void CuMesh::fill_holes(float max_hole_perimeter) { V, this->faces.ptr + F ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_new_loop_boundaries)); - CUDA_CHECK(cudaFree(cu_new_loop_bound_loop_ids)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_new_loop_boundaries)); + CUDA_CHECK(hipFree(cu_new_loop_bound_loop_ids)); // Delete all cached info since mesh has changed this->clear_cache(); @@ -772,25 +788,25 @@ void CuMesh::repair_non_manifold_edges(){ // Construct vertex adjacency pairs with manifold edges int2* cu_vertex_adj_pairs; - CUDA_CHECK(cudaMalloc(&cu_vertex_adj_pairs, 2*M*sizeof(int2))); + CUDA_CHECK(hipMalloc(&cu_vertex_adj_pairs, 2*M*sizeof(int2))); construct_vertex_adj_pairs_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->manifold_face_adj.ptr, this->faces.ptr, cu_vertex_adj_pairs, M ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Iterative Hook and Compress int* cu_vertex_ids; - CUDA_CHECK(cudaMalloc(&cu_vertex_ids, 3 * F * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_vertex_ids, 3 * F * sizeof(int))); arange_kernel<<<(3*F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_vertex_ids, 3 * F); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); int* cu_end_flag; int h_end_flag; - CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_end_flag, sizeof(int))); do { h_end_flag = 1; - CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(hipMemcpy(cu_end_flag, &h_end_flag, sizeof(int), hipMemcpyHostToDevice)); // Hook hook_edges_kernel<<<(2*M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( @@ -799,25 +815,25 @@ void CuMesh::repair_non_manifold_edges(){ cu_vertex_ids, cu_end_flag ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Compress compress_components_kernel<<<(3*F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_vertex_ids, 3 * F ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipMemcpy(&h_end_flag, cu_end_flag, sizeof(int), hipMemcpyDeviceToHost)); } while (h_end_flag == 0); - CUDA_CHECK(cudaFree(cu_end_flag)); - CUDA_CHECK(cudaFree(cu_vertex_adj_pairs)); + CUDA_CHECK(hipFree(cu_end_flag)); + CUDA_CHECK(hipFree(cu_vertex_adj_pairs)); // Construct new faces int* cu_new_vertices_ids; - CUDA_CHECK(cudaMalloc(&cu_new_vertices_ids, 3 * F * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_new_vertices_ids, 3 * F * sizeof(int))); int new_V = compress_ids(cu_vertex_ids, 3 * F, this->cub_temp_storage, cu_new_vertices_ids); float3* cu_new_vertices; - CUDA_CHECK(cudaMalloc(&cu_new_vertices, new_V * sizeof(float3))); + 
CUDA_CHECK(hipMalloc(&cu_new_vertices, new_V * sizeof(float3))); index_vertice_kernel<<<(new_V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_new_vertices_ids, this->faces.ptr, @@ -825,15 +841,15 @@ void CuMesh::repair_non_manifold_edges(){ new_V, cu_new_vertices ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_new_vertices_ids)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_new_vertices_ids)); this->vertices.resize(new_V); - CUDA_CHECK(cudaMemcpy(this->vertices.ptr, cu_new_vertices, new_V * sizeof(float3), cudaMemcpyDeviceToDevice)); - CUDA_CHECK(cudaFree(cu_new_vertices)); + CUDA_CHECK(hipMemcpy(this->vertices.ptr, cu_new_vertices, new_V * sizeof(float3), hipMemcpyDeviceToDevice)); + CUDA_CHECK(hipFree(cu_new_vertices)); this->faces.resize(F); copy_T_to_T3_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_vertex_ids, F, this->faces.ptr); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_vertex_ids)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_vertex_ids)); // Delete all cached info since mesh has changed this->clear_cache(); @@ -886,8 +902,8 @@ void CuMesh::remove_non_manifold_faces() { // Initialize face mask (1 = keep all faces initially) uint8_t* cu_face_keep_mask; - CUDA_CHECK(cudaMalloc(&cu_face_keep_mask, F * sizeof(uint8_t))); - CUDA_CHECK(cudaMemset(cu_face_keep_mask, 1, F * sizeof(uint8_t))); + CUDA_CHECK(hipMalloc(&cu_face_keep_mask, F * sizeof(uint8_t))); + CUDA_CHECK(hipMemset(cu_face_keep_mask, 1, F * sizeof(uint8_t))); // Mark faces on non-manifold edges for removal mark_non_manifold_faces_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( @@ -897,11 +913,11 @@ void CuMesh::remove_non_manifold_faces() { E, cu_face_keep_mask ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Remove marked faces this->_remove_faces(cu_face_keep_mask); - CUDA_CHECK(cudaFree(cu_face_keep_mask)); + CUDA_CHECK(hipFree(cu_face_keep_mask)); // Clear cache since mesh has changed this->clear_cache(); @@ -930,16 +946,16 @@ void CuMesh::remove_small_connected_components(float min_area) { size_t temp_storage_bytes = 0; int *cu_sorted_conn_comp_ids; float *cu_sorted_face_areas; - CUDA_CHECK(cudaMalloc(&cu_sorted_conn_comp_ids, F * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_sorted_face_areas, F * sizeof(float))); - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipMalloc(&cu_sorted_conn_comp_ids, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_sorted_face_areas, F * sizeof(float))); + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( nullptr, temp_storage_bytes, this->conn_comp_ids.ptr, cu_sorted_conn_comp_ids, this->face_areas.ptr, cu_sorted_face_areas, F )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( this->cub_temp_storage.ptr, temp_storage_bytes, this->conn_comp_ids.ptr, cu_sorted_conn_comp_ids, this->face_areas.ptr, cu_sorted_face_areas, @@ -950,48 +966,48 @@ void CuMesh::remove_small_connected_components(float min_area) { int* cu_conn_comp_num_faces; int* cu_num_conn_comps; int* cu_unique_conn_comp_ids; // Not needed, but we need to pass a valid pointer. 
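// (Editorial note, not part of the patch) Encode + ExclusiveSum below is the
// usual recipe for turning sorted ids into CSR-style segment offsets, e.g.
//   sorted ids   : [0, 0, 0, 2, 2, 5]
//   Encode       -> unique = [0, 2, 5], counts = [3, 2, 1], num_runs = 3
//   ExclusiveSum -> offsets = [0, 3, 5, 6]
// so the faces of component k occupy [offsets[k], offsets[k+1]) in the sorted
// arrays, which is exactly the layout DeviceSegmentedReduce::Sum consumes in
// step 3.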
- CUDA_CHECK(cudaMalloc(&cu_conn_comp_num_faces, (this->num_conn_comps + 1) * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_num_conn_comps, sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_unique_conn_comp_ids, (this->num_conn_comps + 1) * sizeof(int))); - CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( + CUDA_CHECK(hipMalloc(&cu_conn_comp_num_faces, (this->num_conn_comps + 1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num_conn_comps, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_unique_conn_comp_ids, (this->num_conn_comps + 1) * sizeof(int))); + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( nullptr, temp_storage_bytes, cu_sorted_conn_comp_ids, cu_unique_conn_comp_ids, cu_conn_comp_num_faces, cu_num_conn_comps, F )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( this->cub_temp_storage.ptr, temp_storage_bytes, cu_sorted_conn_comp_ids, cu_unique_conn_comp_ids, cu_conn_comp_num_faces, cu_num_conn_comps, F )); int num_conn_comps; - CUDA_CHECK(cudaMemcpy(&num_conn_comps, cu_num_conn_comps, sizeof(int), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaFree(cu_num_conn_comps)); - CUDA_CHECK(cudaFree(cu_sorted_conn_comp_ids)); - CUDA_CHECK(cudaFree(cu_unique_conn_comp_ids)); + CUDA_CHECK(hipMemcpy(&num_conn_comps, cu_num_conn_comps, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_num_conn_comps)); + CUDA_CHECK(hipFree(cu_sorted_conn_comp_ids)); + CUDA_CHECK(hipFree(cu_unique_conn_comp_ids)); // 3. Compute the total area for each connected component via segmented reduction. int* cu_conn_comp_offsets; - CUDA_CHECK(cudaMalloc(&cu_conn_comp_offsets, (num_conn_comps + 1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_conn_comp_offsets, (num_conn_comps + 1) * sizeof(int))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, cu_conn_comp_num_faces, cu_conn_comp_offsets, num_conn_comps + 1 )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_conn_comp_num_faces, cu_conn_comp_offsets, num_conn_comps + 1 )); - CUDA_CHECK(cudaFree(cu_conn_comp_num_faces)); + CUDA_CHECK(hipFree(cu_conn_comp_num_faces)); float *cu_conn_comp_areas; - CUDA_CHECK(cudaMalloc(&cu_conn_comp_areas, num_conn_comps * sizeof(float))); - CUDA_CHECK(cub::DeviceSegmentedReduce::Sum( + CUDA_CHECK(hipMalloc(&cu_conn_comp_areas, num_conn_comps * sizeof(float))); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( nullptr, temp_storage_bytes, cu_sorted_face_areas, cu_conn_comp_areas, num_conn_comps, @@ -999,19 +1015,19 @@ void CuMesh::remove_small_connected_components(float min_area) { cu_conn_comp_offsets + 1 )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSegmentedReduce::Sum( + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_sorted_face_areas, cu_conn_comp_areas, num_conn_comps, cu_conn_comp_offsets, cu_conn_comp_offsets + 1 )); - CUDA_CHECK(cudaFree(cu_sorted_face_areas)); - CUDA_CHECK(cudaFree(cu_conn_comp_offsets)); + CUDA_CHECK(hipFree(cu_sorted_face_areas)); + CUDA_CHECK(hipFree(cu_conn_comp_offsets)); // 4. Create a "keep" mask for components with area >= min_area. 
uint8_t* cu_comp_keep_mask; - CUDA_CHECK(cudaMalloc(&cu_comp_keep_mask, num_conn_comps * sizeof(uint8_t))); + CUDA_CHECK(hipMalloc(&cu_comp_keep_mask, num_conn_comps * sizeof(uint8_t))); compare_kernel<<<(num_conn_comps+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_conn_comp_areas, min_area, @@ -1019,12 +1035,12 @@ void CuMesh::remove_small_connected_components(float min_area) { GreaterThanOrEqualToOp(), cu_comp_keep_mask ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_conn_comp_areas)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_conn_comp_areas)); // 5. Propagate the component "keep" mask to every face. uint8_t* cu_face_keep_mask; - CUDA_CHECK(cudaMalloc(&cu_face_keep_mask, F * sizeof(uint8_t))); + CUDA_CHECK(hipMalloc(&cu_face_keep_mask, F * sizeof(uint8_t))); // Use an index_kernel (gather operation) index_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( cu_comp_keep_mask, // Source array @@ -1032,12 +1048,12 @@ void CuMesh::remove_small_connected_components(float min_area) { F, cu_face_keep_mask // Destination array ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_comp_keep_mask)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_comp_keep_mask)); // 6. Select the faces to keep and update the mesh. this->_remove_faces(cu_face_keep_mask); - CUDA_CHECK(cudaFree(cu_face_keep_mask)); + CUDA_CHECK(hipFree(cu_face_keep_mask)); } @@ -1164,25 +1180,25 @@ void CuMesh::unify_face_orientations() { // 1. Compute the flipped flag for each edge. uint8_t* cu_flipped; - CUDA_CHECK(cudaMalloc(&cu_flipped, this->manifold_face_adj.size * sizeof(uint8_t))); + CUDA_CHECK(hipMalloc(&cu_flipped, this->manifold_face_adj.size * sizeof(uint8_t))); get_flip_flags_kernel<<<(this->manifold_face_adj.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->manifold_face_adj.ptr, this->faces.ptr, this->manifold_face_adj.size, cu_flipped ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // 2. Hook edges with flipped flag. int* conn_comp_with_flip; - CUDA_CHECK(cudaMalloc(&conn_comp_with_flip, this->faces.size * sizeof(int))); + CUDA_CHECK(hipMalloc(&conn_comp_with_flip, this->faces.size * sizeof(int))); arange_kernel<<<(this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(conn_comp_with_flip, this->faces.size, 2); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); int* cu_end_flag; int h_end_flag; - CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_end_flag, sizeof(int))); do { h_end_flag = 1; - CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(hipMemcpy(cu_end_flag, &h_end_flag, sizeof(int), hipMemcpyHostToDevice)); // Hook hook_edges_with_orientation_kernel<<<(this->manifold_face_adj.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( @@ -1192,17 +1208,17 @@ void CuMesh::unify_face_orientations() { conn_comp_with_flip, cu_end_flag ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Compress compress_components_with_orientation_kernel<<<(this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( conn_comp_with_flip, this->faces.size ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipMemcpy(&h_end_flag, cu_end_flag, sizeof(int), hipMemcpyDeviceToHost)); } while (h_end_flag == 0); - CUDA_CHECK(cudaFree(cu_end_flag)); + CUDA_CHECK(hipFree(cu_end_flag)); // 3. Flip the orientation of the faces. 
inplace_flip_faces_with_flags_kernel<<<(this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( @@ -1210,9 +1226,9 @@ void CuMesh::unify_face_orientations() { conn_comp_with_flip, this->faces.size ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_flipped)); - CUDA_CHECK(cudaFree(conn_comp_with_flip)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_flipped)); + CUDA_CHECK(hipFree(conn_comp_with_flip)); } diff --git a/src/clean_up.hip b/src/clean_up.hip new file mode 100644 index 0000000..8bd58ef --- /dev/null +++ b/src/clean_up.hip @@ -0,0 +1,1237 @@ +// !!! This is a file automatically generated by hipify!!! +#include "hip/hip_runtime.h" +#include "cumesh_hip.h" +#include "dtypes_hip.cuh" +#include "shared_hip.h" +#ifdef __HIP_PLATFORM_AMD__ +#include <hipcub/hipcub.hpp> +#else +#include <cub/cub.cuh> +#endif +#ifdef __HIP_PLATFORM_AMD__ +#include <rocprim/rocprim.hpp> +#else +#include <cuda/std/tuple> +#endif + + +namespace cumesh { + + +static __global__ void copy_vec3f_to_float3_kernel( + const Vec3f* vec3f, + const size_t N, + float3* output +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= N) return; + output[tid] = make_float3(vec3f[tid].x, vec3f[tid].y, vec3f[tid].z); +} + + +template <typename T, typename U> +static __global__ void copy_T_to_T3_kernel( + const T* input, + const size_t N, + U* output +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= N) return; + output[tid] = { input[3 * tid], input[3 * tid + 1], input[3 * tid + 2] }; +} + + +void CuMesh::remove_faces(torch::Tensor& face_mask) { + size_t F = this->faces.size; + + size_t temp_storage_bytes = 0; + int *cu_new_num_faces; + int3 *cu_new_faces; + CUDA_CHECK(hipMalloc(&cu_new_num_faces, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_new_faces, F * sizeof(int3))); + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + nullptr, temp_storage_bytes, + this->faces.ptr, face_mask.data_ptr<bool>(), cu_new_faces, cu_new_num_faces, + F + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + this->cub_temp_storage.ptr, temp_storage_bytes, + this->faces.ptr, face_mask.data_ptr<bool>(), cu_new_faces, cu_new_num_faces, + F + )); + int new_num_faces; + CUDA_CHECK(hipMemcpy(&new_num_faces, cu_new_num_faces, sizeof(int), hipMemcpyDeviceToHost)); + this->faces.resize(new_num_faces); + CUDA_CHECK(hipMemcpy(this->faces.ptr, cu_new_faces, new_num_faces * sizeof(int3), hipMemcpyDeviceToDevice)); + CUDA_CHECK(hipFree(cu_new_num_faces)); + CUDA_CHECK(hipFree(cu_new_faces)); + + this->remove_unreferenced_vertices(); +} + + +void CuMesh::_remove_faces(uint8_t* face_mask) { + size_t F = this->faces.size; + + size_t temp_storage_bytes = 0; + int *cu_new_num_faces; + int3 *cu_new_faces; + CUDA_CHECK(hipMalloc(&cu_new_num_faces, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_new_faces, F * sizeof(int3))); + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + nullptr, temp_storage_bytes, + this->faces.ptr, face_mask, cu_new_faces, cu_new_num_faces, + F + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + this->cub_temp_storage.ptr, temp_storage_bytes, + this->faces.ptr, face_mask, cu_new_faces, cu_new_num_faces, + F + )); + int new_num_faces; + CUDA_CHECK(hipMemcpy(&new_num_faces, cu_new_num_faces, sizeof(int), hipMemcpyDeviceToHost)); + this->faces.resize(new_num_faces); + CUDA_CHECK(hipMemcpy(this->faces.ptr, cu_new_faces, new_num_faces * sizeof(int3), hipMemcpyDeviceToDevice)); + CUDA_CHECK(hipFree(cu_new_num_faces)); + CUDA_CHECK(hipFree(cu_new_faces)); + + this->remove_unreferenced_vertices(); +} + +
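// (Editorial note, not part of the patch) hipify rewrites every triple-chevron
// launch in the .cu sources into the portable hipLaunchKernelGGL macro used
// throughout this file; the two spellings below launch identically (grid and
// block are dim3, followed by dynamic shared memory bytes and a stream):
//
//   my_kernel<<<grid, block, 0, 0>>>(arg0, arg1);
//   hipLaunchKernelGGL(my_kernel, grid, block, 0, 0, arg0, arg1);
//
// The doubled parenthesis in hipLaunchKernelGGL(( name), ...) seen throughout
// is how the hipify tool wraps the kernel name; it is syntactically harmless.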
+static __global__ void set_vertex_is_referenced( + const int3* faces, + const size_t F, + int* vertex_is_referenced +) { + const int fid = blockIdx.x * blockDim.x + threadIdx.x; + if (fid >= F) return; + int3 face = faces[fid]; + vertex_is_referenced[face.x] = 1; + vertex_is_referenced[face.y] = 1; + vertex_is_referenced[face.z] = 1; +} + + +static __global__ void compress_vertices_kernel( + const int* vertices_map, + const float3* old_vertices, + const int V, + float3* new_vertices +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= V) return; + int new_id = vertices_map[tid]; + int is_kept = vertices_map[tid + 1] == new_id + 1; + if (is_kept) { + new_vertices[new_id] = old_vertices[tid]; + } +} + + +static __global__ void remap_faces_kernel( + const int* vertices_map, + const int F, + int3* faces +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= F) return; + faces[tid].x = vertices_map[faces[tid].x]; + faces[tid].y = vertices_map[faces[tid].y]; + faces[tid].z = vertices_map[faces[tid].z]; +} + + +void CuMesh::remove_unreferenced_vertices() { + size_t V = this->vertices.size; + size_t F = this->faces.size; + + // Mark referenced vertices + int* cu_vertex_is_referenced; + CUDA_CHECK(hipMalloc(&cu_vertex_is_referenced, (V+1) * sizeof(int))); + CUDA_CHECK(hipMemset(cu_vertex_is_referenced, 0, (V+1) * sizeof(int))); + hipLaunchKernelGGL(( set_vertex_is_referenced), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->faces.ptr, + F, + cu_vertex_is_referenced + ); + CUDA_CHECK(hipGetLastError()); + + // Get vertices map + size_t temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, + cu_vertex_is_referenced, V+1 + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_vertex_is_referenced, V+1 + )); + int new_num_vertices; + CUDA_CHECK(hipMemcpy(&new_num_vertices, cu_vertex_is_referenced + V, sizeof(int), hipMemcpyDeviceToHost)); + + // Compress vertices + this->temp_storage.resize(new_num_vertices * sizeof(float3)); + hipLaunchKernelGGL(( compress_vertices_kernel), dim3((V+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_vertex_is_referenced, + this->vertices.ptr, + V, + reinterpret_cast<float3*>(this->temp_storage.ptr) + ); + CUDA_CHECK(hipGetLastError()); + swap_buffers(this->temp_storage, this->vertices); + + // Update faces + hipLaunchKernelGGL(( remap_faces_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_vertex_is_referenced, + F, + this->faces.ptr + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_vertex_is_referenced)); + + // Delete all cached info since mesh has changed + this->clear_cache(); +} + + +static __global__ void sort_faces_kernel( + int3* faces, + const size_t F +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= F) return; + + int3 face = faces[tid]; + int tmp; + + // bubble sort 3 elements (x, y, z) + if (face.x > face.y) { tmp = face.x; face.x = face.y; face.y = tmp; } + if (face.y > face.z) { tmp = face.y; face.y = face.z; face.z = tmp; } + if (face.x > face.y) { tmp = face.x; face.x = face.y; face.y = tmp; } + + faces[tid] = face; +} + +
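// (Editorial note, not part of the patch) The three compare-swaps in
// sort_faces_kernel form the minimal sorting network for 3 elements, putting
// each face's vertex ids into canonical ascending order, e.g.
//   (7, 2, 5) -> (2, 5, 7)   and   (5, 7, 2) -> (2, 5, 7)
// so duplicate faces become bytewise identical regardless of winding. The
// global radix sort in remove_duplicate_faces then groups duplicates
// adjacently (using the int3_decomposer below to present the struct as one
// wide key), and the next kernel keeps only the first face of each run.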
+static __global__ void select_first_in_each_group_kernel( + const int3* faces, + const size_t F, + uint8_t* face_mask +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= F) return; + if (tid == 0) { + face_mask[tid] = 1; + } else { + int3 face = faces[tid]; + int3 prev_face = faces[tid-1]; + if (face.x == prev_face.x && face.y == prev_face.y && face.z == prev_face.z) { + face_mask[tid] = 0; + } else { + face_mask[tid] = 1; + } + } +} + + +struct int3_decomposer +{ +#ifdef __HIP_PLATFORM_AMD__ + __host__ __device__ ::rocprim::tuple<int&, int&, int&> operator()(int3& key) const + { + return ::rocprim::tie(key.x, key.y, key.z); + } +#else + __host__ __device__ ::cuda::std::tuple<int&, int&, int&> operator()(int3& key) const + { + return {key.x, key.y, key.z}; + } +#endif +}; + + +void CuMesh::remove_duplicate_faces() { + size_t F = this->faces.size; + + // Create a temporary sorted copy of faces for duplicate detection + // Do NOT modify the original faces to preserve vertex order and normals + int3 *cu_sorted_faces; + CUDA_CHECK(hipMalloc(&cu_sorted_faces, F * sizeof(int3))); + CUDA_CHECK(hipMemcpy(cu_sorted_faces, this->faces.ptr, F * sizeof(int3), hipMemcpyDeviceToDevice)); + + // Sort vertices within each face (in the temporary copy) + hipLaunchKernelGGL(( sort_faces_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_sorted_faces, + F + ); + CUDA_CHECK(hipGetLastError()); + + // Sort all faces globally by their sorted vertex indices + size_t temp_storage_bytes = 0; + int *cu_sorted_face_indices; + CUDA_CHECK(hipMalloc(&cu_sorted_face_indices, F * sizeof(int))); + hipLaunchKernelGGL(( arange_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, cu_sorted_face_indices, F); + CUDA_CHECK(hipGetLastError()); + + int *cu_sorted_indices_output; + int3 *cu_sorted_faces_output; + CUDA_CHECK(hipMalloc(&cu_sorted_indices_output, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_sorted_faces_output, F * sizeof(int3))); + + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, + cu_sorted_faces, cu_sorted_faces_output, + cu_sorted_face_indices, cu_sorted_indices_output, + F, + int3_decomposer{} + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_sorted_faces, cu_sorted_faces_output, + cu_sorted_face_indices, cu_sorted_indices_output, + F, + int3_decomposer{} + )); + CUDA_CHECK(hipFree(cu_sorted_faces)); + CUDA_CHECK(hipFree(cu_sorted_face_indices)); + + // Select first in each group of duplicate faces (based on sorted faces) + uint8_t* cu_face_mask_sorted; + CUDA_CHECK(hipMalloc(&cu_face_mask_sorted, F * sizeof(uint8_t))); + hipLaunchKernelGGL(( select_first_in_each_group_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_sorted_faces_output, + F, + cu_face_mask_sorted + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_sorted_faces_output)); + + // Map the mask back to original face order using scatter + // scatter: output[indices[i]] = values[i] + // This maps: cu_face_mask_original[original_idx] = cu_face_mask_sorted[sorted_position] + uint8_t* cu_face_mask_original; + CUDA_CHECK(hipMalloc(&cu_face_mask_original, F * sizeof(uint8_t))); + hipLaunchKernelGGL(( scatter_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_sorted_indices_output, // indices: sorted_position -> original_idx + cu_face_mask_sorted, // values: mask at sorted_position + F, + cu_face_mask_original // output: mask at original position + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_face_mask_sorted)); + CUDA_CHECK(hipFree(cu_sorted_indices_output)); + + // Select faces to keep (preserving original vertex order) + this->_remove_faces(cu_face_mask_original); +
CUDA_CHECK(hipFree(cu_face_mask_original)); +} + + +static __global__ void mark_degenerate_faces_kernel( + const float3* vertices, + const int3* faces, + const float abs_thresh, + const float rel_thresh, + const size_t F, + uint8_t* face_mask +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= F) return; + int3 face = faces[tid]; + + // 1. Check if any vertex is duplicated + if (face.x == face.y || face.y == face.z || face.z == face.x) { + face_mask[tid] = 0; + return; + } + + // 2. Check if slim or zero area + Vec3f v0 = Vec3f(vertices[face.x]); + Vec3f v1 = Vec3f(vertices[face.y]); + Vec3f v2 = Vec3f(vertices[face.z]); + Vec3f e0 = v1 - v0; + Vec3f e1 = v2 - v1; + Vec3f e2 = v0 - v2; + float max_edge_len = fmaxf(fmaxf(e0.norm(), e1.norm()), e2.norm()); + float area = e0.cross(e1).norm() / 2.0f; + float thresh = fminf(rel_thresh * max_edge_len * max_edge_len, abs_thresh); + if (area < thresh) { + face_mask[tid] = 0; + return; + } + + face_mask[tid] = 1; +} + + +void CuMesh::remove_degenerate_faces(float abs_thresh, float rel_thresh) { + size_t F = this->faces.size; + + uint8_t* cu_face_mask; + CUDA_CHECK(hipMalloc(&cu_face_mask, F * sizeof(uint8_t))); + hipLaunchKernelGGL(( mark_degenerate_faces_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->vertices.ptr, + this->faces.ptr, + abs_thresh, rel_thresh, + F, + cu_face_mask + ); + CUDA_CHECK(hipGetLastError()); + + this->_remove_faces(cu_face_mask); + CUDA_CHECK(hipFree(cu_face_mask)); +} + + +static __global__ void compute_loop_boundary_lengths( + const float3* vertices, + const uint64_t* edges, + const int* loop_boundaries, + const size_t E, + float* loop_boundary_lengths +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= E) return; + uint64_t edge = edges[loop_boundaries[tid]]; + int e0 = int(edge & 0xFFFFFFFF); + int e1 = int(edge >> 32); + Vec3f v0 = Vec3f(vertices[e0]); + Vec3f v1 = Vec3f(vertices[e1]); + loop_boundary_lengths[tid] = (v1 - v0).norm(); +} + + +static __global__ void compute_loop_boundary_midpoints( + const float3* vertices, + const uint64_t* edges, + const int* loop_boundaries, + const size_t E, + Vec3f* loop_boundary_midpoints +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= E) return; + uint64_t edge = edges[loop_boundaries[tid]]; + int e0 = int(edge & 0xFFFFFFFF); + int e1 = int(edge >> 32); + Vec3f v0 = Vec3f(vertices[e0]); + Vec3f v1 = Vec3f(vertices[e1]); + loop_boundary_midpoints[tid] = (v0 + v1) * 0.5f; +} + + +static __global__ void connect_new_vertices_kernel( + const uint64_t* edges, + const int* loop_boundaries, + const int* loop_bound_loop_ids, + const size_t L, + const size_t V, + int3* faces +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= L) return; + int loop_id = loop_bound_loop_ids[tid]; + int loop_boundary = loop_boundaries[tid]; + uint64_t e = edges[loop_boundary]; + int e0 = int(e & 0xFFFFFFFF); + int e1 = int(e >> 32); + int new_v_id = loop_id + V; + faces[tid] = {e0, e1, new_v_id}; +} + + +struct LessThanOp { + __device__ bool operator()(float a, float b) const { + return a < b; + } +}; + + +void CuMesh::fill_holes(float max_hole_perimeter) { + if (this->loop_boundaries.is_empty() || this->loop_boundaries_offset.is_empty()) { + this->get_boundary_loops(); + } + + size_t V = this->vertices.size; + size_t F = this->faces.size; + size_t L = this->num_bound_loops; + size_t E = this->loop_boundaries.size; + + // Early return if no boundary loops + if (L == 0 || E == 0) { + 
return; + } + + // Compute loop boundary lengths + float* cu_loop_boundary_lengths; + CUDA_CHECK(hipMalloc(&cu_loop_boundary_lengths, E * sizeof(float))); + hipLaunchKernelGGL(( compute_loop_boundary_lengths), dim3((E+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->vertices.ptr, + this->edges.ptr, + this->loop_boundaries.ptr, + E, + cu_loop_boundary_lengths + ); + CUDA_CHECK(hipGetLastError()); + + // Segment sum + size_t temp_storage_bytes = 0; + float *cu_bound_loop_perimeters; + CUDA_CHECK(hipMalloc(&cu_bound_loop_perimeters, L * sizeof(float))); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( + nullptr, temp_storage_bytes, + cu_loop_boundary_lengths, cu_bound_loop_perimeters, + L, + this->loop_boundaries_offset.ptr, + this->loop_boundaries_offset.ptr + 1 + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_loop_boundary_lengths, cu_bound_loop_perimeters, + L, + this->loop_boundaries_offset.ptr, + this->loop_boundaries_offset.ptr + 1 + )); + CUDA_CHECK(hipFree(cu_loop_boundary_lengths)); + + // Mask small loops + uint8_t* cu_bound_loop_mask; + CUDA_CHECK(hipMalloc(&cu_bound_loop_mask, L * sizeof(uint8_t))); + hipLaunchKernelGGL(( compare_kernel), dim3((L+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_bound_loop_perimeters, + max_hole_perimeter, + L, + LessThanOp(), + cu_bound_loop_mask + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_bound_loop_perimeters)); + + // Compress bound loops size + int* cu_bound_loops_cnt; + CUDA_CHECK(hipMalloc(&cu_bound_loops_cnt, L * sizeof(int))); + hipLaunchKernelGGL(( diff_kernel), dim3((L+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->loop_boundaries_offset.ptr, + L, + cu_bound_loops_cnt + ); + CUDA_CHECK(hipGetLastError()); + int *cu_new_loop_boundaries_cnt, *cu_new_num_bound_loops; + CUDA_CHECK(hipMalloc(&cu_new_loop_boundaries_cnt, (L+1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_new_num_bound_loops, sizeof(int))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + nullptr, temp_storage_bytes, + cu_bound_loops_cnt, cu_bound_loop_mask, cu_new_loop_boundaries_cnt, cu_new_num_bound_loops, + L + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_bound_loops_cnt, cu_bound_loop_mask, cu_new_loop_boundaries_cnt, cu_new_num_bound_loops, + L + )); + int new_num_bound_loops; + CUDA_CHECK(hipMemcpy(&new_num_bound_loops, cu_new_num_bound_loops, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_bound_loops_cnt)); + CUDA_CHECK(hipFree(cu_new_num_bound_loops)); + if (new_num_bound_loops == 0) { + CUDA_CHECK(hipFree(cu_new_loop_boundaries_cnt)); + CUDA_CHECK(hipFree(cu_bound_loop_mask)); + return; + } + + // Get loop ids of loop boundaries + int* cu_loop_bound_loop_ids; + CUDA_CHECK(hipMalloc(&cu_loop_bound_loop_ids, E * sizeof(int))); + CUDA_CHECK(hipMemset(cu_loop_bound_loop_ids, 0, E * sizeof(int))); + if (L > 1) { + hipLaunchKernelGGL(( set_flag_kernel), dim3((L-1+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->loop_boundaries_offset.ptr + 1, L - 1, + cu_loop_bound_loop_ids + ); + CUDA_CHECK(hipGetLastError()); + } + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::InclusiveSum( + nullptr, temp_storage_bytes, + cu_loop_bound_loop_ids, + E + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::InclusiveSum( + 
this->cub_temp_storage.ptr, temp_storage_bytes, + cu_loop_bound_loop_ids, + E + )); + + // Mask loop boundaries + uint8_t* cu_loop_boundary_mask; + CUDA_CHECK(hipMalloc(&cu_loop_boundary_mask, E * sizeof(uint8_t))); + hipLaunchKernelGGL(( index_kernel), dim3((E+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_bound_loop_mask, + cu_loop_bound_loop_ids, + E, + cu_loop_boundary_mask + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_bound_loop_mask)); + CUDA_CHECK(hipFree(cu_loop_bound_loop_ids)); + + // Compress loop boundaries + int *cu_new_loop_boundaries, *cu_new_num_loop_boundaries; + CUDA_CHECK(hipMalloc(&cu_new_loop_boundaries, E * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_new_num_loop_boundaries, sizeof(int))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + nullptr, temp_storage_bytes, + this->loop_boundaries.ptr, cu_loop_boundary_mask, cu_new_loop_boundaries, cu_new_num_loop_boundaries, + E + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + this->cub_temp_storage.ptr, temp_storage_bytes, + this->loop_boundaries.ptr, cu_loop_boundary_mask, cu_new_loop_boundaries, cu_new_num_loop_boundaries, + E + )); + int new_num_loop_boundaries; + CUDA_CHECK(hipMemcpy(&new_num_loop_boundaries, cu_new_num_loop_boundaries, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_new_num_loop_boundaries)); + CUDA_CHECK(hipFree(cu_loop_boundary_mask)); + + // Reconstruct new bound loops + int* cu_new_loop_boundaries_offset; + CUDA_CHECK(hipMalloc(&cu_new_loop_boundaries_offset, (new_num_loop_boundaries+1) * sizeof(int))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, + cu_new_loop_boundaries_cnt, cu_new_loop_boundaries_offset, + new_num_bound_loops + 1 + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_new_loop_boundaries_cnt, cu_new_loop_boundaries_offset, + new_num_bound_loops + 1 + )); + int* cu_new_loop_bound_loop_ids; + CUDA_CHECK(hipMalloc(&cu_new_loop_bound_loop_ids, new_num_loop_boundaries * sizeof(int))); + CUDA_CHECK(hipMemset(cu_new_loop_bound_loop_ids, 0, new_num_loop_boundaries * sizeof(int))); + if (new_num_bound_loops > 1) { + hipLaunchKernelGGL(( set_flag_kernel), dim3((new_num_bound_loops-1+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_new_loop_boundaries_offset+1, new_num_bound_loops-1, + cu_new_loop_bound_loop_ids + ); + CUDA_CHECK(hipGetLastError()); + } + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::InclusiveSum( + nullptr, temp_storage_bytes, + cu_new_loop_bound_loop_ids, + new_num_loop_boundaries + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::InclusiveSum( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_new_loop_bound_loop_ids, + new_num_loop_boundaries + )); + + // Calculate new vertex positions as average of loop vertices + Vec3f* cu_new_loop_bound_centers; + CUDA_CHECK(hipMalloc(&cu_new_loop_bound_centers, new_num_loop_boundaries * sizeof(Vec3f))); + hipLaunchKernelGGL(( compute_loop_boundary_midpoints), dim3((new_num_loop_boundaries+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->vertices.ptr, + this->edges.ptr, + cu_new_loop_boundaries, + new_num_loop_boundaries, + cu_new_loop_bound_centers + ); + CUDA_CHECK(hipGetLastError()); + Vec3f* cu_new_vertices; + CUDA_CHECK(hipMalloc(&cu_new_vertices, new_num_bound_loops * 
sizeof(Vec3f))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( + nullptr, temp_storage_bytes, + cu_new_loop_bound_centers, cu_new_vertices, + new_num_bound_loops, + cu_new_loop_boundaries_offset, + cu_new_loop_boundaries_offset + 1 + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_new_loop_bound_centers, cu_new_vertices, + new_num_bound_loops, + cu_new_loop_boundaries_offset, + cu_new_loop_boundaries_offset + 1 + )); + CUDA_CHECK(hipFree(cu_new_loop_bound_centers)); + CUDA_CHECK(hipFree(cu_new_loop_boundaries_offset)); + hipLaunchKernelGGL(( inplace_div_kernel), dim3((new_num_bound_loops+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_new_vertices, + cu_new_loop_boundaries_cnt, + new_num_bound_loops + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_new_loop_boundaries_cnt)); + + // Update mesh + this->vertices.extend(new_num_bound_loops); + this->faces.extend(new_num_loop_boundaries); + hipLaunchKernelGGL(( copy_vec3f_to_float3_kernel), dim3((new_num_bound_loops+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_new_vertices, + new_num_bound_loops, + this->vertices.ptr + V + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_new_vertices)); + hipLaunchKernelGGL(( connect_new_vertices_kernel), dim3((new_num_loop_boundaries+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->edges.ptr, + cu_new_loop_boundaries, + cu_new_loop_bound_loop_ids, + new_num_loop_boundaries, + V, + this->faces.ptr + F + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_new_loop_boundaries)); + CUDA_CHECK(hipFree(cu_new_loop_bound_loop_ids)); + + // Delete all cached info since mesh has changed + this->clear_cache(); +} + + +static __global__ void construct_vertex_adj_pairs_kernel( + const int2* manifold_face_adj, + const int3* faces, + int2* vertex_adj_pairs, + const size_t M +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= M) return; + + const int2 adj_faces = manifold_face_adj[tid]; + const int3 face1 = faces[adj_faces.x]; + const int3 face2 = faces[adj_faces.y]; + + const int v1[3] = {face1.x, face1.y, face1.z}; + + int shared_local_indices1[2] = {0, 0}; + int shared_local_indices2[2] = {0, 0}; + int found_count = 0; + + for (int i = 0; i < 3; ++i) { + if (v1[i] == face2.x) { + shared_local_indices1[found_count] = i; + shared_local_indices2[found_count] = 0; + found_count++; + } else if (v1[i] == face2.y) { + shared_local_indices1[found_count] = i; + shared_local_indices2[found_count] = 1; + found_count++; + } else if (v1[i] == face2.z) { + shared_local_indices1[found_count] = i; + shared_local_indices2[found_count] = 2; + found_count++; + } + if (found_count == 2) { + break; + } + } + + // Only process if we found exactly 2 shared vertices (valid manifold edge) + if (found_count == 2) { + vertex_adj_pairs[2 * tid + 0] = make_int2( + 3 * adj_faces.x + shared_local_indices1[0], + 3 * adj_faces.y + shared_local_indices2[0] + ); + vertex_adj_pairs[2 * tid + 1] = make_int2( + 3 * adj_faces.x + shared_local_indices1[1], + 3 * adj_faces.y + shared_local_indices2[1] + ); + } else { + // Invalid edge, set to identity mapping + vertex_adj_pairs[2 * tid + 0] = make_int2(3 * adj_faces.x, 3 * adj_faces.x); + vertex_adj_pairs[2 * tid + 1] = make_int2(3 * adj_faces.y, 3 * adj_faces.y); + } +} + + +static __global__ void index_vertice_kernel( + const int* vertex_ids, + const int3* faces, + const float3* vertices, 
+ const size_t V, + float3* new_vertices +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= V) return; + const int vid = vertex_ids[tid]; + const int3 face = faces[vid / 3]; + const int f[3] = {face.x, face.y, face.z}; + new_vertices[tid] = vertices[f[vid % 3]]; +} + + +void CuMesh::repair_non_manifold_edges(){ + // Always recompute manifold_face_adj to ensure it's up to date + // especially after operations like simplify() that modify the mesh + this->get_manifold_face_adjacency(); + + size_t F = this->faces.size; + size_t M = this->manifold_face_adj.size; + + // Construct vertex adjacency pairs with manifold edges + int2* cu_vertex_adj_pairs; + CUDA_CHECK(hipMalloc(&cu_vertex_adj_pairs, 2*M*sizeof(int2))); + hipLaunchKernelGGL(( construct_vertex_adj_pairs_kernel), dim3((M+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->manifold_face_adj.ptr, + this->faces.ptr, + cu_vertex_adj_pairs, + M + ); + CUDA_CHECK(hipGetLastError()); + + // Iterative Hook and Compress + int* cu_vertex_ids; + CUDA_CHECK(hipMalloc(&cu_vertex_ids, 3 * F * sizeof(int))); + hipLaunchKernelGGL(( arange_kernel), dim3((3*F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, cu_vertex_ids, 3 * F); + CUDA_CHECK(hipGetLastError()); + int* cu_end_flag; int h_end_flag; + CUDA_CHECK(hipMalloc(&cu_end_flag, sizeof(int))); + do { + h_end_flag = 1; + CUDA_CHECK(hipMemcpy(cu_end_flag, &h_end_flag, sizeof(int), hipMemcpyHostToDevice)); + + // Hook + hipLaunchKernelGGL(( hook_edges_kernel), dim3((2*M+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_vertex_adj_pairs, + 2 * M, + cu_vertex_ids, + cu_end_flag + ); + CUDA_CHECK(hipGetLastError()); + + // Compress + hipLaunchKernelGGL(( compress_components_kernel), dim3((3*F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_vertex_ids, + 3 * F + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipMemcpy(&h_end_flag, cu_end_flag, sizeof(int), hipMemcpyDeviceToHost)); + } while (h_end_flag == 0); + CUDA_CHECK(hipFree(cu_end_flag)); + CUDA_CHECK(hipFree(cu_vertex_adj_pairs)); + + // Construct new faces + int* cu_new_vertices_ids; + CUDA_CHECK(hipMalloc(&cu_new_vertices_ids, 3 * F * sizeof(int))); + int new_V = compress_ids(cu_vertex_ids, 3 * F, this->cub_temp_storage, cu_new_vertices_ids); + float3* cu_new_vertices; + CUDA_CHECK(hipMalloc(&cu_new_vertices, new_V * sizeof(float3))); + hipLaunchKernelGGL(( index_vertice_kernel), dim3((new_V+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_new_vertices_ids, + this->faces.ptr, + this->vertices.ptr, + new_V, + cu_new_vertices + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_new_vertices_ids)); + this->vertices.resize(new_V); + CUDA_CHECK(hipMemcpy(this->vertices.ptr, cu_new_vertices, new_V * sizeof(float3), hipMemcpyDeviceToDevice)); + CUDA_CHECK(hipFree(cu_new_vertices)); + this->faces.resize(F); + hipLaunchKernelGGL(( copy_T_to_T3_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, cu_vertex_ids, F, this->faces.ptr); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_vertex_ids)); + + // Delete all cached info since mesh has changed + this->clear_cache(); +} + + +/** + * Mark faces to remove for non-manifold edges + * For each non-manifold edge (shared by >2 faces), only keep the first 2 faces + * + * @param edge2face: edge to face adjacency + * @param edge2face_offset: edge to face adjacency offset + * @param edge2face_cnt: number of faces per edge + * @param E: number of edges + * @param face_keep_mask: output mask (1 = keep, 0 = remove) + */ +static 
__global__ void mark_non_manifold_faces_kernel( + const int* edge2face, + const int* edge2face_offset, + const int* edge2face_cnt, + const size_t E, + uint8_t* face_keep_mask +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= E) return; + + // Only process non-manifold edges (cnt > 2) + int cnt = edge2face_cnt[tid]; + if (cnt <= 2) return; + + // Mark faces beyond the first 2 for removal + int start = edge2face_offset[tid]; + for (int i = 2; i < cnt; i++) { + int face_idx = edge2face[start + i]; + face_keep_mask[face_idx] = 0; + } +} + + +void CuMesh::remove_non_manifold_faces() { + // Get edge-face adjacency information + if (this->edge2face.is_empty() || this->edge2face_offset.is_empty()) { + this->get_edge_face_adjacency(); + } + + size_t F = this->faces.size; + size_t E = this->edges.size; + + if (F == 0 || E == 0) return; + + // Initialize face mask (1 = keep all faces initially) + uint8_t* cu_face_keep_mask; + CUDA_CHECK(hipMalloc(&cu_face_keep_mask, F * sizeof(uint8_t))); + CUDA_CHECK(hipMemset(cu_face_keep_mask, 1, F * sizeof(uint8_t))); + + // Mark faces on non-manifold edges for removal + hipLaunchKernelGGL(( mark_non_manifold_faces_kernel), dim3((E+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->edge2face.ptr, + this->edge2face_offset.ptr, + this->edge2face_cnt.ptr, + E, + cu_face_keep_mask + ); + CUDA_CHECK(hipGetLastError()); + + // Remove marked faces + this->_remove_faces(cu_face_keep_mask); + CUDA_CHECK(hipFree(cu_face_keep_mask)); + + // Clear cache since mesh has changed + this->clear_cache(); +} + + +struct GreaterThanOrEqualToOp { + __device__ __forceinline__ bool operator()(const float& a, const float& b) const { + return a >= b; + } +}; + + +void CuMesh::remove_small_connected_components(float min_area) { + if (this->conn_comp_ids.is_empty()) { + this->get_connected_components(); + } + if (this->face_areas.is_empty()) { + this->compute_face_areas(); + } + size_t F = this->faces.size; + if (F == 0) return; + + // 1. Sort face areas based on their connected component ID. + // This groups all faces of the same component together. + size_t temp_storage_bytes = 0; + int *cu_sorted_conn_comp_ids; + float *cu_sorted_face_areas; + CUDA_CHECK(hipMalloc(&cu_sorted_conn_comp_ids, F * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_sorted_face_areas, F * sizeof(float))); + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, + this->conn_comp_ids.ptr, cu_sorted_conn_comp_ids, + this->face_areas.ptr, cu_sorted_face_areas, + F + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + this->cub_temp_storage.ptr, temp_storage_bytes, + this->conn_comp_ids.ptr, cu_sorted_conn_comp_ids, + this->face_areas.ptr, cu_sorted_face_areas, + F + )); + + // 2. Find unique components and get the number of faces in each. + int* cu_conn_comp_num_faces; + int* cu_num_conn_comps; + int* cu_unique_conn_comp_ids; // Not needed, but we need to pass a valid pointer. 
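+    // The Encode calls below follow the two-phase temp-storage idiom used for
+    // every hipcub/cub primitive in this codebase: a first call with a null
+    // temp-storage pointer only reports the required scratch size, and a second,
+    // otherwise identical call does the real work. Minimal sketch of the pattern
+    // (placeholder names, not symbols from this file):
+    //     size_t bytes = 0;
+    //     hipcub::DeviceScan::ExclusiveSum(nullptr, bytes, d_in, d_out, n);      // size query only
+    //     scratch.resize(bytes);
+    //     hipcub::DeviceScan::ExclusiveSum(scratch.ptr, bytes, d_in, d_out, n);  // actual scan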
+ CUDA_CHECK(hipMalloc(&cu_conn_comp_num_faces, (this->num_conn_comps + 1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num_conn_comps, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_unique_conn_comp_ids, (this->num_conn_comps + 1) * sizeof(int))); + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( + nullptr, temp_storage_bytes, + cu_sorted_conn_comp_ids, cu_unique_conn_comp_ids, + cu_conn_comp_num_faces, cu_num_conn_comps, + F + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_sorted_conn_comp_ids, cu_unique_conn_comp_ids, + cu_conn_comp_num_faces, cu_num_conn_comps, + F + )); + int num_conn_comps; + CUDA_CHECK(hipMemcpy(&num_conn_comps, cu_num_conn_comps, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_num_conn_comps)); + CUDA_CHECK(hipFree(cu_sorted_conn_comp_ids)); + CUDA_CHECK(hipFree(cu_unique_conn_comp_ids)); + + // 3. Compute the total area for each connected component via segmented reduction. + int* cu_conn_comp_offsets; + CUDA_CHECK(hipMalloc(&cu_conn_comp_offsets, (num_conn_comps + 1) * sizeof(int))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, + cu_conn_comp_num_faces, cu_conn_comp_offsets, + num_conn_comps + 1 + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_conn_comp_num_faces, cu_conn_comp_offsets, + num_conn_comps + 1 + )); + CUDA_CHECK(hipFree(cu_conn_comp_num_faces)); + + float *cu_conn_comp_areas; + CUDA_CHECK(hipMalloc(&cu_conn_comp_areas, num_conn_comps * sizeof(float))); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( + nullptr, temp_storage_bytes, + cu_sorted_face_areas, cu_conn_comp_areas, + num_conn_comps, + cu_conn_comp_offsets, + cu_conn_comp_offsets + 1 + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSegmentedReduce::Sum( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_sorted_face_areas, cu_conn_comp_areas, + num_conn_comps, + cu_conn_comp_offsets, + cu_conn_comp_offsets + 1 + )); + CUDA_CHECK(hipFree(cu_sorted_face_areas)); + CUDA_CHECK(hipFree(cu_conn_comp_offsets)); + + // 4. Create a "keep" mask for components with area >= min_area. + uint8_t* cu_comp_keep_mask; + CUDA_CHECK(hipMalloc(&cu_comp_keep_mask, num_conn_comps * sizeof(uint8_t))); + hipLaunchKernelGGL(( compare_kernel), dim3((num_conn_comps+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_conn_comp_areas, + min_area, + num_conn_comps, + GreaterThanOrEqualToOp(), + cu_comp_keep_mask + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_conn_comp_areas)); + + // 5. Propagate the component "keep" mask to every face. + uint8_t* cu_face_keep_mask; + CUDA_CHECK(hipMalloc(&cu_face_keep_mask, F * sizeof(uint8_t))); + // Use an index_kernel (gather operation) + hipLaunchKernelGGL(( index_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_comp_keep_mask, // Source array + this->conn_comp_ids.ptr, // Indices to gather from + F, + cu_face_keep_mask // Destination array + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_comp_keep_mask)); + + // 6. Select the faces to keep and update the mesh. 
+ this->_remove_faces(cu_face_keep_mask); + CUDA_CHECK(hipFree(cu_face_keep_mask)); +} + + +static __global__ void hook_edges_with_orientation_kernel( + const int2* adj, + const uint8_t* flipped, + const int M, + int* conn_comp_ids, + int* end_flag +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= M) return; + + // get adjacent faces + int f0 = adj[tid].x; + int f1 = adj[tid].y; + uint8_t is_flipped = flipped[tid]; + + // union + // find roots + int root0 = conn_comp_ids[f0] >> 1; + int flip0 = conn_comp_ids[f0] & 1; + while (root0 != (conn_comp_ids[root0] >> 1)) { + flip0 ^= conn_comp_ids[root0] & 1; + root0 = conn_comp_ids[root0] >> 1; + } + int root1 = conn_comp_ids[f1] >> 1; + int flip1 = conn_comp_ids[f1] & 1; + while (root1 != (conn_comp_ids[root1] >> 1)) { + flip1 ^= conn_comp_ids[root1] & 1; + root1 = conn_comp_ids[root1] >> 1; + } + + if (root0 == root1) return; + + int high = max(root0, root1); + int low = min(root0, root1); + atomicMin(&conn_comp_ids[high], (low << 1) | (is_flipped ^ flip0 ^ flip1)); + *end_flag = 0; +} + + +static __global__ void compress_components_with_orientation_kernel( + int* conn_comp_ids, + const int F +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= F) return; + + int p = conn_comp_ids[tid] >> 1; + int f = conn_comp_ids[tid] & 1; + while (p != (conn_comp_ids[p] >> 1)) { + f ^= conn_comp_ids[p] & 1; + p = conn_comp_ids[p] >> 1; + } + conn_comp_ids[tid] = (p << 1) | f; +} + + +static __global__ void get_flip_flags_kernel( + const int2* manifold_face_adj, + const int3* faces, + const int M, + uint8_t* flipped +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= M) return; + + const int2 adj_faces = manifold_face_adj[tid]; + const int3 face1 = faces[adj_faces.x]; + const int3 face2 = faces[adj_faces.y]; + + const int v1[3] = {face1.x, face1.y, face1.z}; + + int shared_local_indices1[2]; + int shared_local_indices2[2]; + int found_count = 0; + + for (int i = 0; i < 3; ++i) { + if (v1[i] == face2.x) { + shared_local_indices1[found_count] = i; + shared_local_indices2[found_count] = 0; + found_count++; + } else if (v1[i] == face2.y) { + shared_local_indices1[found_count] = i; + shared_local_indices2[found_count] = 1; + found_count++; + } else if (v1[i] == face2.z) { + shared_local_indices1[found_count] = i; + shared_local_indices2[found_count] = 2; + found_count++; + } + if (found_count == 2) { + break; + } + } + + int direction1 = (shared_local_indices1[1] - shared_local_indices1[0] + 3) % 3; + int direction2 = (shared_local_indices2[1] - shared_local_indices2[0] + 3) % 3; + flipped[tid] = (direction1 == direction2) ? 1 : 0; +} + + +static __global__ void inplace_flip_faces_with_flags_kernel( + int3* faces, + const int* conn_comp_with_flip, + const int F +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= F) return; + + int is_flipped = conn_comp_with_flip[tid] & 1; + if (is_flipped) { + int3 face = faces[tid]; + faces[tid] = make_int3(face.x, face.z, face.y); + } +} + + +void CuMesh::unify_face_orientations() { + if (this->manifold_face_adj.is_empty()) { + this->get_manifold_face_adjacency(); + } + + // 1. Compute the flipped flag for each edge. 
+    uint8_t* cu_flipped;
+    CUDA_CHECK(hipMalloc(&cu_flipped, this->manifold_face_adj.size * sizeof(uint8_t)));
+    hipLaunchKernelGGL(( get_flip_flags_kernel), dim3((this->manifold_face_adj.size+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, 
+        this->manifold_face_adj.ptr,
+        this->faces.ptr,
+        this->manifold_face_adj.size,
+        cu_flipped
+    );
+    CUDA_CHECK(hipGetLastError());
+
+    // 2. Hook edges with flipped flag.
+    int* conn_comp_with_flip;
+    CUDA_CHECK(hipMalloc(&conn_comp_with_flip, this->faces.size * sizeof(int)));
+    // Seed ids with 2*i: face i starts as its own root (upper bits) with flip bit 0 (LSB).
+    hipLaunchKernelGGL(( arange_kernel), dim3((this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, conn_comp_with_flip, this->faces.size, 2);
+    CUDA_CHECK(hipGetLastError());
+    int* cu_end_flag; int h_end_flag;
+    CUDA_CHECK(hipMalloc(&cu_end_flag, sizeof(int)));
+    do {
+        h_end_flag = 1;
+        CUDA_CHECK(hipMemcpy(cu_end_flag, &h_end_flag, sizeof(int), hipMemcpyHostToDevice));
+
+        // Hook
+        hipLaunchKernelGGL(( hook_edges_with_orientation_kernel), dim3((this->manifold_face_adj.size+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, 
+            this->manifold_face_adj.ptr,
+            cu_flipped,
+            this->manifold_face_adj.size,
+            conn_comp_with_flip,
+            cu_end_flag
+        );
+        CUDA_CHECK(hipGetLastError());
+
+        // Compress
+        hipLaunchKernelGGL(( compress_components_with_orientation_kernel), dim3((this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, 
+            conn_comp_with_flip,
+            this->faces.size
+        );
+        CUDA_CHECK(hipGetLastError());
+        CUDA_CHECK(hipMemcpy(&h_end_flag, cu_end_flag, sizeof(int), hipMemcpyDeviceToHost));
+    } while (h_end_flag == 0);
+    CUDA_CHECK(hipFree(cu_end_flag));
+
+    // 3. Flip the orientation of the faces.
+    hipLaunchKernelGGL(( inplace_flip_faces_with_flags_kernel), dim3((this->faces.size+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, 
+        this->faces.ptr,
+        conn_comp_with_flip,
+        this->faces.size
+    );
+    CUDA_CHECK(hipGetLastError());
+    CUDA_CHECK(hipFree(cu_flipped));
+    CUDA_CHECK(hipFree(conn_comp_with_flip));
+}
+
+
+} // namespace cumesh
\ No newline at end of file
diff --git a/src/connectivity.cu b/src/connectivity.cu
index 6e2f5fe..f634882 100644
--- a/src/connectivity.cu
+++ b/src/connectivity.cu
@@ -1,7 +1,11 @@
 #include "cumesh.h"
 #include "shared.h"
+#ifdef __HIP_PLATFORM_AMD__
+#include <hipcub/hipcub.hpp>
+#else
 #include <cub/cub.cuh>
+#endif
 
 namespace cumesh {
 
@@ -64,18 +68,18 @@ void CuMesh::get_vertex_face_adjacency() {
     this->vert2face_cnt.resize(V + 1);
     this->vert2face_cnt.zero();
     get_neighbor_face_cnt_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(this->faces.ptr, F, this->vert2face_cnt.ptr);
-    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(hipGetLastError());
 
     // allocate memory for neighboring face ids
     this->vert2face_offset.resize(V + 1);
     size_t temp_storage_bytes = 0;
-    CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
+    CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
         this->vert2face_cnt.ptr, this->vert2face_offset.ptr,
         V + 1
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
-    CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
+    CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         this->vert2face_cnt.ptr, this->vert2face_offset.ptr,
         V + 1
@@ -89,7 +93,7 @@ void CuMesh::get_vertex_face_adjacency() {
         this->vert2face_offset.ptr,
         this->vert2face_cnt.ptr
     );
-    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(hipGetLastError());
 }
 
 
@@ -122,19 +126,19 @@ void CuMesh::get_edges() {
     size_t F = this->faces.size;
     this->edges.resize(F * 3);
     expand_edges_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(this->faces.ptr, F, this->edges.ptr);
-    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(hipGetLastError());
 
     // sort edges
     this->temp_storage.resize(F * 3 * sizeof(uint64_t));
     size_t temp_storage_bytes = 0;
-    CUDA_CHECK(cub::DeviceRadixSort::SortKeys(
+    CUDA_CHECK(hipcub::DeviceRadixSort::SortKeys(
         nullptr, temp_storage_bytes,
         this->edges.ptr,
         reinterpret_cast<uint64_t*>(this->temp_storage.ptr),
         F * 3
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
-    CUDA_CHECK(cub::DeviceRadixSort::SortKeys(
+    CUDA_CHECK(hipcub::DeviceRadixSort::SortKeys(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         this->edges.ptr,
         reinterpret_cast<uint64_t*>(this->temp_storage.ptr),
@@ -143,22 +147,22 @@ void CuMesh::get_edges() {
 
     // unique edges
     int* num_edges;
-    CUDA_CHECK(cudaMalloc(&num_edges, sizeof(int)));
+    CUDA_CHECK(hipMalloc(&num_edges, sizeof(int)));
     this->edge2face_cnt.resize(F * 3);
-    CUDA_CHECK(cub::DeviceRunLengthEncode::Encode(
+    CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode(
         nullptr, temp_storage_bytes,
         reinterpret_cast<uint64_t*>(this->temp_storage.ptr), this->edges.ptr, this->edge2face_cnt.ptr, num_edges,
         F * 3
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
-    CUDA_CHECK(cub::DeviceRunLengthEncode::Encode(
+    CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         reinterpret_cast<uint64_t*>(this->temp_storage.ptr), this->edges.ptr, this->edge2face_cnt.ptr, num_edges,
         F * 3
     ));
-    CUDA_CHECK(cudaMemcpy(&this->edges.size, num_edges, sizeof(int), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(hipMemcpy(&this->edges.size, num_edges, sizeof(int), hipMemcpyDeviceToHost));
     this->edge2face_cnt.size = this->edges.size;
-    CUDA_CHECK(cudaFree(num_edges));
+    CUDA_CHECK(hipFree(num_edges));
 }
 
 
@@ -229,13 +233,13 @@ void CuMesh::get_edge_face_adjacency() {
     // allocate memory for edge2face_offset
     this->edge2face_offset.resize(E + 1);
     size_t temp_storage_bytes = 0;
-    CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
+    CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
         this->edge2face_cnt.ptr, this->edge2face_offset.ptr,
         E + 1
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
-    CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
+    CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum(
         this->cub_temp_storage.ptr, temp_storage_bytes,
         this->edge2face_cnt.ptr, this->edge2face_offset.ptr,
         E + 1
@@ -243,7 +247,7 @@ void CuMesh::get_edge_face_adjacency() {
 
     // allocate memory for edge2face
     int total_edge_face_cnt;
-    CUDA_CHECK(cudaMemcpy(&total_edge_face_cnt, &this->edge2face_offset.ptr[E], sizeof(int), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(hipMemcpy(&total_edge_face_cnt, &this->edge2face_offset.ptr[E], sizeof(int), hipMemcpyDeviceToHost));
     this->edge2face.resize(total_edge_face_cnt);
 
     // allocate memory for face2edge
@@ -261,7 +265,7 @@ void CuMesh::get_edge_face_adjacency() {
         this->edge2face.ptr,
         this->face2edge.ptr
     );
-    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(hipGetLastError());
 }
 
 
@@ -334,18 +338,18 @@ void CuMesh::get_vertex_edge_adjacency() {
     get_vertex_edge_cnt_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
         this->edges.ptr, E, this->vert2edge_cnt.ptr
     );
-    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(hipGetLastError());
 
     // allocate memory for vert2edge_offset
     this->vert2edge_offset.resize(V + 1);
     size_t temp_storage_bytes = 0;
-    CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
+    CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
         this->vert2edge_cnt.ptr, this->vert2edge_offset.ptr,
         V + 1
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
-    CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
+
CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, this->vert2edge_cnt.ptr, this->vert2edge_offset.ptr, V + 1 @@ -360,7 +364,7 @@ void CuMesh::get_vertex_edge_adjacency() { this->vert2edge_offset.ptr, this->vert2edge_cnt.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } @@ -416,26 +420,26 @@ void CuMesh::get_boundary_info() { // Select boundary edges size_t temp_storage_bytes = 0; int *cu_num_boundary, *cu_edge_idx; - CUDA_CHECK(cudaMalloc(&cu_num_boundary, sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_edge_idx, E * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num_boundary, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_edge_idx, E * sizeof(int))); this->boundaries.resize(E); arange_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_edge_idx, E); - CUDA_CHECK(cub::DeviceSelect::If( + CUDA_CHECK(hipcub::DeviceSelect::If( nullptr, temp_storage_bytes, cu_edge_idx, this->boundaries.ptr, cu_num_boundary, E, is_boundary_edge{this->edge2face_cnt.ptr} )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSelect::If( + CUDA_CHECK(hipcub::DeviceSelect::If( this->cub_temp_storage.ptr, temp_storage_bytes, cu_edge_idx, this->boundaries.ptr, cu_num_boundary, E, is_boundary_edge{this->edge2face_cnt.ptr} )); - CUDA_CHECK(cudaMemcpy(&this->boundaries.size, cu_num_boundary, sizeof(int), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaFree(cu_num_boundary)); - CUDA_CHECK(cudaFree(cu_edge_idx)); + CUDA_CHECK(hipMemcpy(&this->boundaries.size, cu_num_boundary, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_num_boundary)); + CUDA_CHECK(hipFree(cu_edge_idx)); // Set vertex boundary indicator this->vert_is_boundary.resize(this->vertices.size); @@ -445,7 +449,7 @@ void CuMesh::get_boundary_info() { this->edges.ptr, this->boundaries.ptr, this->edge2face_cnt.ptr, this->boundaries.size, this->vert_is_boundary.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } } @@ -531,18 +535,18 @@ void CuMesh::get_vertex_boundary_adjacency() { get_vertex_boundary_cnt_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->edges.ptr, this->boundaries.ptr, B, this->vert2bound_cnt.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // allocate memory for vert2bound_offset this->vert2bound_offset.resize(V + 1); size_t temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, this->vert2bound_cnt.ptr, this->vert2bound_offset.ptr, V + 1 )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, this->vert2bound_cnt.ptr, this->vert2bound_offset.ptr, V + 1 @@ -557,7 +561,7 @@ void CuMesh::get_vertex_boundary_adjacency() { this->vert2bound_offset.ptr, this->vert2bound_cnt.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } @@ -613,7 +617,7 @@ void CuMesh::get_vertex_is_manifold() { V, this->vert_is_manifold.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } @@ -668,28 +672,28 @@ void CuMesh::get_manifold_face_adjacency() { // Select manifold edges size_t temp_storage_bytes = 0; int *cu_num_manifold_edges, *cu_edge_idx, *cu_manifold_edge_idx; - CUDA_CHECK(cudaMalloc(&cu_num_manifold_edges, sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_edge_idx, E * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_manifold_edge_idx, E * sizeof(int))); + 
CUDA_CHECK(hipMalloc(&cu_num_manifold_edges, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_edge_idx, E * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_manifold_edge_idx, E * sizeof(int))); arange_kernel<<<(E+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_edge_idx, E); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cub::DeviceSelect::If( + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipcub::DeviceSelect::If( nullptr, temp_storage_bytes, cu_edge_idx, cu_manifold_edge_idx, cu_num_manifold_edges, E, is_manifold_edge{this->edge2face_cnt.ptr} )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSelect::If( + CUDA_CHECK(hipcub::DeviceSelect::If( this->cub_temp_storage.ptr, temp_storage_bytes, cu_edge_idx, cu_manifold_edge_idx, cu_num_manifold_edges, E, is_manifold_edge{this->edge2face_cnt.ptr} )); int manifold_edge_count; - CUDA_CHECK(cudaMemcpy(&manifold_edge_count, cu_num_manifold_edges, sizeof(int), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaFree(cu_num_manifold_edges)); - CUDA_CHECK(cudaFree(cu_edge_idx)); + CUDA_CHECK(hipMemcpy(&manifold_edge_count, cu_num_manifold_edges, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_num_manifold_edges)); + CUDA_CHECK(hipFree(cu_edge_idx)); // set manifold_face_adj this->manifold_face_adj.resize(manifold_edge_count); @@ -700,8 +704,8 @@ void CuMesh::get_manifold_face_adjacency() { manifold_edge_count, this->manifold_face_adj.ptr ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_manifold_edge_idx)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_manifold_edge_idx)); } @@ -748,32 +752,32 @@ void CuMesh::get_manifold_boundary_adjacency() { // Select manifold boundary vertices size_t temp_storage_bytes = 0; int *cu_num_manifold_boundary_verts, *cu_vert_idx, *cu_manifold_vert_idx; - CUDA_CHECK(cudaMalloc(&cu_num_manifold_boundary_verts, sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_vert_idx, V * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_manifold_vert_idx, V * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num_manifold_boundary_verts, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_vert_idx, V * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_manifold_vert_idx, V * sizeof(int))); arange_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_vert_idx, V); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cub::DeviceSelect::If( + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipcub::DeviceSelect::If( nullptr, temp_storage_bytes, cu_vert_idx, cu_manifold_vert_idx, cu_num_manifold_boundary_verts, V, is_manifold_boundary_vertex{this->vert_is_manifold.ptr, this->vert_is_boundary.ptr} )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSelect::If( + CUDA_CHECK(hipcub::DeviceSelect::If( this->cub_temp_storage.ptr, temp_storage_bytes, cu_vert_idx, cu_manifold_vert_idx, cu_num_manifold_boundary_verts, V, is_manifold_boundary_vertex{this->vert_is_manifold.ptr, this->vert_is_boundary.ptr} )); int manifold_boundary_vert_count; - CUDA_CHECK(cudaMemcpy(&manifold_boundary_vert_count, cu_num_manifold_boundary_verts, sizeof(int), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaFree(cu_num_manifold_boundary_verts)); - CUDA_CHECK(cudaFree(cu_vert_idx)); + CUDA_CHECK(hipMemcpy(&manifold_boundary_vert_count, cu_num_manifold_boundary_verts, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_num_manifold_boundary_verts)); + CUDA_CHECK(hipFree(cu_vert_idx)); // Early return if no manifold boundary vertices if (manifold_boundary_vert_count == 0) { - CUDA_CHECK(cudaFree(cu_manifold_vert_idx)); + 
CUDA_CHECK(hipFree(cu_manifold_vert_idx)); return; } @@ -786,7 +790,7 @@ void CuMesh::get_manifold_boundary_adjacency() { manifold_boundary_vert_count, this->manifold_bound_adj.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } @@ -801,12 +805,12 @@ void CuMesh::get_connected_components() { // Iterative Hook and Compress this->conn_comp_ids.resize(F); arange_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(this->conn_comp_ids.ptr, F); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); int* cu_end_flag; int h_end_flag; - CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_end_flag, sizeof(int))); do { h_end_flag = 1; - CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(hipMemcpy(cu_end_flag, &h_end_flag, sizeof(int), hipMemcpyHostToDevice)); // Hook hook_edges_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( @@ -815,17 +819,17 @@ void CuMesh::get_connected_components() { this->conn_comp_ids.ptr, cu_end_flag ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Compress compress_components_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->conn_comp_ids.ptr, F ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipMemcpy(&h_end_flag, cu_end_flag, sizeof(int), hipMemcpyDeviceToHost)); } while (h_end_flag == 0); - CUDA_CHECK(cudaFree(cu_end_flag)); + CUDA_CHECK(hipFree(cu_end_flag)); // Compresses boundary components this->num_conn_comps = compress_ids(this->conn_comp_ids.ptr, F, this->cub_temp_storage); @@ -848,12 +852,12 @@ void CuMesh::get_boundary_connected_components() { // Iterative Hook and Compress this->bound_conn_comp_ids.resize(B); arange_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(this->bound_conn_comp_ids.ptr, B); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); int* cu_end_flag; int h_end_flag; - CUDA_CHECK(cudaMalloc(&cu_end_flag, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_end_flag, sizeof(int))); do { h_end_flag = 1; - CUDA_CHECK(cudaMemcpy(cu_end_flag, &h_end_flag, sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(hipMemcpy(cu_end_flag, &h_end_flag, sizeof(int), hipMemcpyHostToDevice)); // Hook hook_edges_kernel<<<(M+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( @@ -862,17 +866,17 @@ void CuMesh::get_boundary_connected_components() { this->bound_conn_comp_ids.ptr, cu_end_flag ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Compress compress_components_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->bound_conn_comp_ids.ptr, B ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaMemcpy(&h_end_flag, cu_end_flag, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipMemcpy(&h_end_flag, cu_end_flag, sizeof(int), hipMemcpyDeviceToHost)); } while (h_end_flag == 0); - CUDA_CHECK(cudaFree(cu_end_flag)); + CUDA_CHECK(hipFree(cu_end_flag)); // Compresses boundary components this->num_bound_conn_comps = compress_ids(this->bound_conn_comp_ids.ptr, B, this->cub_temp_storage); @@ -940,13 +944,13 @@ void CuMesh::get_boundary_loops() { // Check if boundary components are loops int* cu_is_bound_conn_comp_loop; - CUDA_CHECK(cudaMalloc(&cu_is_bound_conn_comp_loop, this->num_bound_conn_comps * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_is_bound_conn_comp_loop, this->num_bound_conn_comps * sizeof(int))); 
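+    // Optimistically flag every boundary component as a loop (1); the kernel
+    // below is expected to clear the flag for components that are not simple loops.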
fill_kernel<<<(this->num_bound_conn_comps+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_is_bound_conn_comp_loop, this->num_bound_conn_comps, 1 ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); is_bound_conn_comp_loop_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( this->edges.ptr, this->boundaries.ptr, @@ -956,43 +960,43 @@ void CuMesh::get_boundary_loops() { B, cu_is_bound_conn_comp_loop ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); int* cu_num_bound_loops; - CUDA_CHECK(cudaMalloc(&cu_num_bound_loops, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num_bound_loops, sizeof(int))); size_t temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceReduce::Sum( + CUDA_CHECK(hipcub::DeviceReduce::Sum( nullptr, temp_storage_bytes, cu_is_bound_conn_comp_loop, cu_num_bound_loops, this->num_bound_conn_comps )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceReduce::Sum( + CUDA_CHECK(hipcub::DeviceReduce::Sum( this->cub_temp_storage.ptr, temp_storage_bytes, cu_is_bound_conn_comp_loop, cu_num_bound_loops, this->num_bound_conn_comps )); - CUDA_CHECK(cudaMemcpy(&this->num_bound_loops, cu_num_bound_loops, sizeof(int), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaFree(cu_num_bound_loops)); + CUDA_CHECK(hipMemcpy(&this->num_bound_loops, cu_num_bound_loops, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_num_bound_loops)); if (this->num_bound_loops == 0) { - CUDA_CHECK(cudaFree(cu_is_bound_conn_comp_loop)); + CUDA_CHECK(hipFree(cu_is_bound_conn_comp_loop)); return; } // Sort boundaries by connected component ids int *cu_bound_sorted, *cu_bound_conn_comp_ids_sorted; - CUDA_CHECK(cudaMalloc(&cu_bound_sorted, B * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_bound_conn_comp_ids_sorted, B * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_bound_sorted, B * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_bound_conn_comp_ids_sorted, B * sizeof(int))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( nullptr, temp_storage_bytes, this->bound_conn_comp_ids.ptr, cu_bound_conn_comp_ids_sorted, this->boundaries.ptr, cu_bound_sorted, B )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( this->cub_temp_storage.ptr, temp_storage_bytes, this->bound_conn_comp_ids.ptr, cu_bound_conn_comp_ids_sorted, this->boundaries.ptr, cu_bound_sorted, @@ -1001,84 +1005,84 @@ void CuMesh::get_boundary_loops() { // Select loops int* cu_bound_is_on_loop; - CUDA_CHECK(cudaMalloc(&cu_bound_is_on_loop, B * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_bound_is_on_loop, B * sizeof(int))); index_kernel<<<(B+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_is_bound_conn_comp_loop, cu_bound_conn_comp_ids_sorted, B, cu_bound_is_on_loop ); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaFree(cu_is_bound_conn_comp_loop)); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_is_bound_conn_comp_loop)); this->loop_boundaries.resize(B); int *cu_loop_bound_conn_comp_ids_sorted, *cu_num_bound_on_loop; - CUDA_CHECK(cudaMalloc(&cu_loop_bound_conn_comp_ids_sorted, B * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_num_bound_on_loop, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_loop_bound_conn_comp_ids_sorted, B * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num_bound_on_loop, sizeof(int))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceSelect::Flagged( + CUDA_CHECK(hipcub::DeviceSelect::Flagged( nullptr, temp_storage_bytes, cu_bound_sorted, 
cu_bound_is_on_loop, this->loop_boundaries.ptr, cu_num_bound_on_loop, B )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSelect::Flagged( + CUDA_CHECK(hipcub::DeviceSelect::Flagged( this->cub_temp_storage.ptr, temp_storage_bytes, cu_bound_sorted, cu_bound_is_on_loop, this->loop_boundaries.ptr, cu_num_bound_on_loop, B )); int num_bound_on_loop; - CUDA_CHECK(cudaMemcpy(&num_bound_on_loop, cu_num_bound_on_loop, sizeof(int), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaFree(cu_bound_sorted)); + CUDA_CHECK(hipMemcpy(&num_bound_on_loop, cu_num_bound_on_loop, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_bound_sorted)); this->loop_boundaries.resize(num_bound_on_loop); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceSelect::Flagged( + CUDA_CHECK(hipcub::DeviceSelect::Flagged( nullptr, temp_storage_bytes, cu_bound_conn_comp_ids_sorted, cu_bound_is_on_loop, cu_loop_bound_conn_comp_ids_sorted, cu_num_bound_on_loop, B )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSelect::Flagged( + CUDA_CHECK(hipcub::DeviceSelect::Flagged( this->cub_temp_storage.ptr, temp_storage_bytes, cu_bound_conn_comp_ids_sorted, cu_bound_is_on_loop, cu_loop_bound_conn_comp_ids_sorted, cu_num_bound_on_loop, B )); - CUDA_CHECK(cudaFree(cu_bound_conn_comp_ids_sorted)); - CUDA_CHECK(cudaFree(cu_bound_is_on_loop)); - CUDA_CHECK(cudaFree(cu_num_bound_on_loop)); + CUDA_CHECK(hipFree(cu_bound_conn_comp_ids_sorted)); + CUDA_CHECK(hipFree(cu_bound_is_on_loop)); + CUDA_CHECK(hipFree(cu_num_bound_on_loop)); // RLE this->loop_boundaries_offset.resize(this->num_bound_loops + 1); this->loop_boundaries_offset.zero(); int* cu_rle_unique_out, *cu_rle_num_runs; - CUDA_CHECK(cudaMalloc(&cu_rle_unique_out, this->num_bound_loops * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_rle_num_runs, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_rle_unique_out, this->num_bound_loops * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_rle_num_runs, sizeof(int))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( nullptr, temp_storage_bytes, cu_loop_bound_conn_comp_ids_sorted, cu_rle_unique_out, this->loop_boundaries_offset.ptr, cu_rle_num_runs, num_bound_on_loop )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceRunLengthEncode::Encode( + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( this->cub_temp_storage.ptr, temp_storage_bytes, cu_loop_bound_conn_comp_ids_sorted, cu_rle_unique_out, this->loop_boundaries_offset.ptr, cu_rle_num_runs, num_bound_on_loop )); - CUDA_CHECK(cudaFree(cu_loop_bound_conn_comp_ids_sorted)); - CUDA_CHECK(cudaFree(cu_rle_unique_out)); - CUDA_CHECK(cudaFree(cu_rle_num_runs)); + CUDA_CHECK(hipFree(cu_loop_bound_conn_comp_ids_sorted)); + CUDA_CHECK(hipFree(cu_rle_unique_out)); + CUDA_CHECK(hipFree(cu_rle_num_runs)); // Scan loop boundaries offset temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, this->loop_boundaries_offset.ptr, this->num_bound_loops + 1 )); this->cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, this->loop_boundaries_offset.ptr, this->num_bound_loops + 1 diff --git a/src/connectivity.hip b/src/connectivity.hip new file mode 100644 index 0000000..d5d878e --- /dev/null +++ b/src/connectivity.hip @@ -0,0 +1,1095 @@ +// !!! 
This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
+#include "cumesh_hip.h"
+#include "shared_hip.h"
+
+#ifdef __HIP_PLATFORM_AMD__
+#include <hipcub/hipcub.hpp>
+#else
+#include <cub/cub.cuh>
+#endif
+
+
+namespace cumesh {
+
+/**
+ * Get count of neighboring faces for each vertex
+ *
+ * @param faces: the faces of the mesh, shape (F)
+ * @param F: the number of faces
+ * @param neighbor_face_cnt: the buffer for neighbor face count, shape (V+1)
+ */
+static __global__ void get_neighbor_face_cnt_kernel(
+    const int3* faces,
+    const int F,
+    int* neighbor_face_cnt
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= F) return;
+
+    int3 f = faces[tid];
+
+    atomicAdd(&neighbor_face_cnt[f.x], 1);
+    atomicAdd(&neighbor_face_cnt[f.y], 1);
+    atomicAdd(&neighbor_face_cnt[f.z], 1);
+}
+
+
+/**
+ * Fill the neighboring face ids for each vertex
+ *
+ * @param faces: the faces of the mesh, shape (F)
+ * @param F: the number of faces
+ * @param neighbor_face_ids: the buffer for neighbor face ids, shape (total_neighbor_face_cnt)
+ * @param neighbor_face_ids_offset: the buffer for neighbor face ids offset, shape (V+1)
+ * @param neighbor_face_cnt: the buffer for neighbor face count, shape (V+1)
+ */
+static __global__ void fill_neighbor_face_ids_kernel(
+    const int3* faces,
+    const int F,
+    int* neighbor_face_ids,
+    int* neighbor_face_ids_offset,
+    int* neighbor_face_cnt
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= F) return;
+
+    int3 f = faces[tid];
+
+    neighbor_face_ids[neighbor_face_ids_offset[f.x] + atomicAdd(&neighbor_face_cnt[f.x], 1)] = tid;
+    neighbor_face_ids[neighbor_face_ids_offset[f.y] + atomicAdd(&neighbor_face_cnt[f.y], 1)] = tid;
+    neighbor_face_ids[neighbor_face_ids_offset[f.z] + atomicAdd(&neighbor_face_cnt[f.z], 1)] = tid;
+}
+
+
+void CuMesh::get_vertex_face_adjacency() {
+    size_t F = this->faces.size;
+    size_t V = this->vertices.size;
+
+    // get neighboring face count for each vertex
+    this->vert2face_cnt.resize(V + 1);
+    this->vert2face_cnt.zero();
+    hipLaunchKernelGGL(( get_neighbor_face_cnt_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, this->faces.ptr, F, this->vert2face_cnt.ptr);
+    CUDA_CHECK(hipGetLastError());
+
+    // allocate memory for neighboring face ids
+    this->vert2face_offset.resize(V + 1);
+    size_t temp_storage_bytes = 0;
+    CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum(
+        nullptr, temp_storage_bytes,
+        this->vert2face_cnt.ptr, this->vert2face_offset.ptr,
+        V + 1
+    ));
+    this->cub_temp_storage.resize(temp_storage_bytes);
+    CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum(
+        this->cub_temp_storage.ptr, temp_storage_bytes,
+        this->vert2face_cnt.ptr, this->vert2face_offset.ptr,
+        V + 1
+    ));
+    this->vert2face.resize(F*3);
+
+    // fill neighboring face ids for each vertex
+    this->vert2face_cnt.zero();
+    hipLaunchKernelGGL(( fill_neighbor_face_ids_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, this->faces.ptr, F,
+        this->vert2face.ptr,
+        this->vert2face_offset.ptr,
+        this->vert2face_cnt.ptr
+    );
+    CUDA_CHECK(hipGetLastError());
+}
+
+
+/**
+ * Expand edges for each triangle face
+ *
+ * @param faces: the faces of the mesh, shape (F)
+ * @param F: the number of faces
+ * @param edges: the buffer for edges, shape (F*3)
+ */
+static __global__ void expand_edges_kernel(
+    const int3* faces,
+    const int F,
+    uint64_t *edges
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= F) return;
+
+    int base = tid * 3;
+    int3 f = faces[tid];
+
+    // expand edges
+    edges[base + 0] = ((uint64_t)min(f.x, f.y) << 32) | max(f.x, f.y);
+    edges[base + 1] = ((uint64_t)min(f.y, f.z) << 32) | max(f.y, f.z);
+    edges[base + 2] = ((uint64_t)min(f.z, f.x) << 32) | max(f.z, f.x);
+}
+
+
+void CuMesh::get_edges() {
+    size_t F = this->faces.size;
+    this->edges.resize(F * 3);
+    hipLaunchKernelGGL(( expand_edges_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, this->faces.ptr, F, this->edges.ptr);
+    CUDA_CHECK(hipGetLastError());
+
+    // sort edges
+    this->temp_storage.resize(F * 3 * sizeof(uint64_t));
+    size_t temp_storage_bytes = 0;
+    CUDA_CHECK(hipcub::DeviceRadixSort::SortKeys(
+        nullptr, temp_storage_bytes,
+        this->edges.ptr,
+        reinterpret_cast<uint64_t*>(this->temp_storage.ptr),
+        F * 3
+    ));
+    this->cub_temp_storage.resize(temp_storage_bytes);
+    CUDA_CHECK(hipcub::DeviceRadixSort::SortKeys(
+        this->cub_temp_storage.ptr, temp_storage_bytes,
+        this->edges.ptr,
+        reinterpret_cast<uint64_t*>(this->temp_storage.ptr),
+        F * 3
+    ));
+
+    // unique edges
+    int* num_edges;
+    CUDA_CHECK(hipMalloc(&num_edges, sizeof(int)));
+    this->edge2face_cnt.resize(F * 3);
+    CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode(
+        nullptr, temp_storage_bytes,
+        reinterpret_cast<uint64_t*>(this->temp_storage.ptr), this->edges.ptr, this->edge2face_cnt.ptr, num_edges,
+        F * 3
+    ));
+    this->cub_temp_storage.resize(temp_storage_bytes);
+    CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode(
+        this->cub_temp_storage.ptr, temp_storage_bytes,
+        reinterpret_cast<uint64_t*>(this->temp_storage.ptr), this->edges.ptr, this->edge2face_cnt.ptr, num_edges,
+        F * 3
+    ));
+    CUDA_CHECK(hipMemcpy(&this->edges.size, num_edges, sizeof(int), hipMemcpyDeviceToHost));
+    this->edge2face_cnt.size = this->edges.size;
+    CUDA_CHECK(hipFree(num_edges));
+}
+
+
+/**
+ * Get edge-face adjacency
+ *
+ * @param faces: the faces of the mesh, shape (F)
+ * @param edges: the buffer for edges, shape (E)
+ * @param edge2face_cnt: the buffer for edge duplication number, shape (E)
+ * @param vert2face: the buffer for neighboring face ids, shape (total_neighbor_face_cnt)
+ * @param vert2face_offset: the buffer for neighboring face ids offset, shape (V+1)
+ * @param edge2face_offset: the buffer for edge to face adjacency offset, shape (E+1)
+ * @param E: the number of edges
+ * @param edge2face: the buffer for edge to face adjacency, shape (total_edge_face_cnt)
+ * @param face2edge: the buffer for face to edge adjacency, shape (F*3)
+ */
+static __global__ void get_edge_face_adjacency_kernel(
+    const int3* faces,
+    const uint64_t* edges,
+    const int* edge2face_cnt,
+    const int* vert2face,
+    const int* vert2face_offset,
+    const int* edge2face_offset,
+    const int E,
+    int* edge2face,
+    int3* face2edge
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= E) return;
+
+    // get edge
+    uint64_t e = edges[tid];
+    int e0 = int(e >> 32);
+    int e1 = int(e & 0xFFFFFFFF);
+
+    // assign connectivity
+    int ptr = edge2face_offset[tid];
+    for (int f = vert2face_offset[e0]; f < vert2face_offset[e0+1]; f++) {
+        int fid = vert2face[f];
+        int3 f_vids = faces[fid];
+        if (f_vids.x == e1 || f_vids.y == e1 || f_vids.z == e1) {
+            // this face contains the edge
+            edge2face[ptr] = fid;
+            ptr++;
+            // fill face2edge
+            if (f_vids.x == e0 && f_vids.y == e1 || f_vids.x == e1 && f_vids.y == e0) {
+                face2edge[fid].x = tid;
+            } else if (f_vids.y == e0 && f_vids.z == e1 || f_vids.y == e1 && f_vids.z == e0) {
+                face2edge[fid].y = tid;
+            } else if (f_vids.z == e0 && f_vids.x == e1 || f_vids.z == e1 && f_vids.x == e0) {
+                face2edge[fid].z = tid;
+            }
+        }
+    }
+}
+
+
+void
CuMesh::get_edge_face_adjacency() { + if (this->edges.is_empty() || this->edge2face_cnt.is_empty()) { + this->get_edges(); + } + if (this->vert2face.is_empty() || this->vert2face_offset.is_empty()) { + this->get_vertex_face_adjacency(); + } + size_t F = this->faces.size; + size_t E = this->edges.size; + + // allocate memory for edge2face_offset + this->edge2face_offset.resize(E + 1); + size_t temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, + this->edge2face_cnt.ptr, this->edge2face_offset.ptr, + E + 1 + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + this->cub_temp_storage.ptr, temp_storage_bytes, + this->edge2face_cnt.ptr, this->edge2face_offset.ptr, + E + 1 + )); + + // allocate memory for edge2face + int total_edge_face_cnt; + CUDA_CHECK(hipMemcpy(&total_edge_face_cnt, &this->edge2face_offset.ptr[E], sizeof(int), hipMemcpyDeviceToHost)); + this->edge2face.resize(total_edge_face_cnt); + + // allocate memory for face2edge + this->face2edge.resize(F); + + // get edge-face adjacency + hipLaunchKernelGGL(( get_edge_face_adjacency_kernel), dim3((E+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->faces.ptr, + this->edges.ptr, + this->edge2face_cnt.ptr, + this->vert2face.ptr, + this->vert2face_offset.ptr, + this->edge2face_offset.ptr, + E, + this->edge2face.ptr, + this->face2edge.ptr + ); + CUDA_CHECK(hipGetLastError()); +} + + +/** + * Get vertex adjacent edge number + * + * @param edges: the buffer for edges, shape (E) + * @param E: the number of edges + * @param vert2edge_cnt: the buffer for vertex adjacent edge number, shape (V) + */ +static __global__ void get_vertex_edge_cnt_kernel( + const uint64_t* edges, + const int E, + int* vert2edge_cnt +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= E) return; + + // get edge + uint64_t e = edges[tid]; + int e0 = int(e >> 32); + int e1 = int(e & 0xFFFFFFFF); + + // count vertex adjacent edge number + atomicAdd(&vert2edge_cnt[e0], 1); + atomicAdd(&vert2edge_cnt[e1], 1); +} + + +/** + * Get vertex-edge adjacency + * + * @param edges: the buffer for edges, shape (E) + * @param E: the number of edges + * @param vert2edge: the buffer for vertex to edge adjacency, shape (total_vertex_edge_cnt) + * @param vert2edge_offset: the buffer for vertex to edge adjacency offset, shape (V+1) + * @param vert2edge_cnt: the buffer for vertex adjacent edge number, shape (V) + */ +static __global__ void get_vertex_edge_adjacency_kernel( + const uint64_t* edges, + const int E, + int* vert2edge, + int* vert2edge_offset, + int* vert2edge_cnt +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= E) return; + + // get edge + uint64_t e = edges[tid]; + int e0 = int(e >> 32); + int e1 = int(e & 0xFFFFFFFF); + + // assign connectivity + vert2edge[vert2edge_offset[e0] + atomicAdd(&vert2edge_cnt[e0], 1)] = tid; + vert2edge[vert2edge_offset[e1] + atomicAdd(&vert2edge_cnt[e1], 1)] = tid; +} + + +void CuMesh::get_vertex_edge_adjacency() { + if (this->edges.is_empty()) { + this->get_edges(); + } + size_t E = this->edges.size; + size_t V = this->vertices.size; + + // get vertex adjacent edge number + this->vert2edge_cnt.resize(V + 1); + this->vert2edge_cnt.zero(); + hipLaunchKernelGGL(( get_vertex_edge_cnt_kernel), dim3((E+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->edges.ptr, E, this->vert2edge_cnt.ptr + ); + CUDA_CHECK(hipGetLastError()); + + // allocate memory for vert2edge_offset + 
this->vert2edge_offset.resize(V + 1); + size_t temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, + this->vert2edge_cnt.ptr, this->vert2edge_offset.ptr, + V + 1 + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + this->cub_temp_storage.ptr, temp_storage_bytes, + this->vert2edge_cnt.ptr, this->vert2edge_offset.ptr, + V + 1 + )); + + // get vertex-edge adjacency + this->vert2edge.resize(2 * E); + this->vert2edge_cnt.zero(); + hipLaunchKernelGGL(( get_vertex_edge_adjacency_kernel), dim3((E+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->edges.ptr, E, + this->vert2edge.ptr, + this->vert2edge_offset.ptr, + this->vert2edge_cnt.ptr + ); + CUDA_CHECK(hipGetLastError()); +} + + +/** + * Set vertex boundary indicator + * + * @param edges: the buffer for edges, shape (E) + * @param boundaries: the buffer for boundary edges, shape (B) + * @param edge2face_cnt: the buffer for edge duplication number, shape (E) + * @param B: the number of boundary edges + * @param vert_is_boundary: the buffer for boundary vertex indicator, shape (V) + */ +static __global__ void set_boundary_vertex_kernel( + const uint64_t* edges, + const int* boundaries, + const int* edge2face_cnt, + const int B, + uint8_t* vert_is_boundary +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= B) return; + + int eid = boundaries[tid]; + + if (edge2face_cnt[eid] == 1) { + // get edge + uint64_t e = edges[eid]; + int e0 = int(e >> 32); + int e1 = int(e & 0xFFFFFFFF); + + // set boundary vertex + vert_is_boundary[e0] = 1; + vert_is_boundary[e1] = 1; + } +} + + +struct is_boundary_edge { + const int* edge2face_cnt; + __host__ __device__ + bool operator()(const int& idx) const { + return edge2face_cnt[idx] == 1; + } +}; + + +void CuMesh::get_boundary_info() { + if (this->edges.is_empty() || this->edge2face_cnt.is_empty()) { + this->get_edges(); + } + size_t E = this->edges.size; + + // Select boundary edges + size_t temp_storage_bytes = 0; + int *cu_num_boundary, *cu_edge_idx; + CUDA_CHECK(hipMalloc(&cu_num_boundary, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_edge_idx, E * sizeof(int))); + this->boundaries.resize(E); + hipLaunchKernelGGL(( arange_kernel), dim3((E+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, cu_edge_idx, E); + CUDA_CHECK(hipcub::DeviceSelect::If( + nullptr, temp_storage_bytes, + cu_edge_idx, this->boundaries.ptr, cu_num_boundary, + E, + is_boundary_edge{this->edge2face_cnt.ptr} + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSelect::If( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_edge_idx, this->boundaries.ptr, cu_num_boundary, + E, + is_boundary_edge{this->edge2face_cnt.ptr} + )); + CUDA_CHECK(hipMemcpy(&this->boundaries.size, cu_num_boundary, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_num_boundary)); + CUDA_CHECK(hipFree(cu_edge_idx)); + + // Set vertex boundary indicator + this->vert_is_boundary.resize(this->vertices.size); + this->vert_is_boundary.zero(); + if (this->boundaries.size > 0) { + hipLaunchKernelGGL(( set_boundary_vertex_kernel), dim3((this->boundaries.size+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->edges.ptr, this->boundaries.ptr, this->edge2face_cnt.ptr, + this->boundaries.size, this->vert_is_boundary.ptr + ); + CUDA_CHECK(hipGetLastError()); + } +} + + +static __global__ void get_vertex_boundary_cnt_kernel( + const uint64_t* edges, + const int* boundaries, + const int B, + int* 
vert2bound_cnt +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= B) return; + + int eid = boundaries[tid]; + + // get edge + uint64_t e = edges[eid]; + int e0 = int(e >> 32); + int e1 = int(e & 0xFFFFFFFF); + + // count vertex adjacent boundary number + atomicAdd(&vert2bound_cnt[e0], 1); + atomicAdd(&vert2bound_cnt[e1], 1); +} + + +/** + * Get vertex-boundary adjacency + * + * @param edges: the buffer for edges, shape (E) + * @param boundaries: the buffer for boundary edges, shape (B) + * @param B: the number of boundary edges + * @param vert2bound: the buffer for vertex to boundary adjacency, shape (total_vertex_boundary_cnt) + * @param vert2bound_offset: the buffer for vertex to boundary adjacency offset, shape (V+1) + * @param vert2bound_cnt: the buffer for vertex adjacent boundary number, shape (V) + */ +static __global__ void get_vertex_boundary_adjacency_kernel( + const uint64_t* edges, + const int* boundaries, + const int B, + int* vert2bound, + int* vert2bound_offset, + int* vert2bound_cnt +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= B) return; + + int eid = boundaries[tid]; + + // get edge + uint64_t e = edges[eid]; + int e0 = int(e >> 32); + int e1 = int(e & 0xFFFFFFFF); + + // assign connectivity + vert2bound[vert2bound_offset[e0] + atomicAdd(&vert2bound_cnt[e0], 1)] = tid; + vert2bound[vert2bound_offset[e1] + atomicAdd(&vert2bound_cnt[e1], 1)] = tid; +} + + +void CuMesh::get_vertex_boundary_adjacency() { + if (this->edges.is_empty()) { + this->get_edges(); + } + if (this->boundaries.is_empty()) { + this->get_boundary_info(); + } + size_t V = this->vertices.size; + size_t B = this->boundaries.size; + + // Early return if no boundaries + if (B == 0) { + this->vert2bound_cnt.resize(V + 1); + this->vert2bound_cnt.zero(); + this->vert2bound_offset.resize(V + 1); + this->vert2bound_offset.zero(); + return; + } + + // get vertex adjacent boundary number + this->vert2bound_cnt.resize(V + 1); + this->vert2bound_cnt.zero(); + hipLaunchKernelGGL(( get_vertex_boundary_cnt_kernel), dim3((B+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->edges.ptr, this->boundaries.ptr, B, this->vert2bound_cnt.ptr + ); + CUDA_CHECK(hipGetLastError()); + + // allocate memory for vert2bound_offset + this->vert2bound_offset.resize(V + 1); + size_t temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, + this->vert2bound_cnt.ptr, this->vert2bound_offset.ptr, + V + 1 + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + this->cub_temp_storage.ptr, temp_storage_bytes, + this->vert2bound_cnt.ptr, this->vert2bound_offset.ptr, + V + 1 + )); + + // get vertex-boundary adjacency + this->vert2bound.resize(2 * B); + this->vert2bound_cnt.zero(); + hipLaunchKernelGGL(( get_vertex_boundary_adjacency_kernel), dim3((B+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->edges.ptr, this->boundaries.ptr, B, + this->vert2bound.ptr, + this->vert2bound_offset.ptr, + this->vert2bound_cnt.ptr + ); + CUDA_CHECK(hipGetLastError()); +} + + +static __global__ void get_vertex_is_manifold_kernel( + const int* vert2edge, + const int* vert2edge_offset, + const int* edge2face_cnt, + const int V, + uint8_t* vert_is_manifold +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= V) return; + + // traverse all edges of the vertex + int num_boundaries = 0; + bool is_manifold = true; + for (int i = vert2edge_offset[tid]; i < vert2edge_offset[tid+1]; i++) { 
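+        // A vertex is treated as manifold only if it touches at most two
+        // boundary edges and no edge shared by more than two faces.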
+ int eid = vert2edge[i]; + // boundary edge + if (edge2face_cnt[eid] == 1) { + num_boundaries++; + if (num_boundaries > 2) { + is_manifold = false; + break; + } + } + // non-manifold edge + else if (edge2face_cnt[eid] > 2) { + is_manifold = false; + break; + } + } + + vert_is_manifold[tid] = is_manifold ? 1 : 0; +} + + +void CuMesh::get_vertex_is_manifold() { + if (this->vert2edge.is_empty() || this->vert2edge_offset.is_empty()) { + this->get_vertex_edge_adjacency(); + } + if (this->edge2face_cnt.is_empty()) { + this->get_edges(); + } + size_t V = this->vertices.size; + + // get vertex is manifold + this->vert_is_manifold.resize(V); + hipLaunchKernelGGL(( get_vertex_is_manifold_kernel), dim3((V+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->vert2edge.ptr, + this->vert2edge_offset.ptr, + this->edge2face_cnt.ptr, + V, + this->vert_is_manifold.ptr + ); + CUDA_CHECK(hipGetLastError()); +} + + +/** + * Set manifold face adjacency + * + * @param manifold_edge_idx: the buffer for manifold edge index, shape (M) + * @param edge2face: the buffer for edge to face adjacency, shape (total_edge_face_cnt) + * @param edge2face_offset: the buffer for edge to face adjacency offset, shape (E+1) + * @param M: the number of manifold edges + * @param manifold_face_adj: the buffer for manifold face adjacency, shape (M) + */ +static __global__ void set_manifold_face_adj_kernel( + const int* manifold_edge_idx, + const int* edge2face, + const int* edge2face_offset, + const int M, + int2* manifold_face_adj +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= M) return; + + // get edge + int edge_idx = manifold_edge_idx[tid]; + + // get adjacent faces + int start = edge2face_offset[edge_idx]; + int end = edge2face_offset[edge_idx+1]; + if (end - start != 2) return; // if not a manifold edge + int f0 = edge2face[start]; + int f1 = edge2face[start + 1]; + + manifold_face_adj[tid] = {f0, f1}; +} + + +struct is_manifold_edge { + const int* edge2face_cnt; + __host__ __device__ + bool operator()(const int& idx) const { + return edge2face_cnt[idx] == 2; + } +}; + + +void CuMesh::get_manifold_face_adjacency() { + if (this->edge2face.is_empty() || this->edge2face_offset.is_empty()) { + this->get_edge_face_adjacency(); + } + size_t E = this->edges.size; + + // Select manifold edges + size_t temp_storage_bytes = 0; + int *cu_num_manifold_edges, *cu_edge_idx, *cu_manifold_edge_idx; + CUDA_CHECK(hipMalloc(&cu_num_manifold_edges, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_edge_idx, E * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_manifold_edge_idx, E * sizeof(int))); + hipLaunchKernelGGL(( arange_kernel), dim3((E+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, cu_edge_idx, E); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipcub::DeviceSelect::If( + nullptr, temp_storage_bytes, + cu_edge_idx, cu_manifold_edge_idx, cu_num_manifold_edges, + E, + is_manifold_edge{this->edge2face_cnt.ptr} + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSelect::If( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_edge_idx, cu_manifold_edge_idx, cu_num_manifold_edges, + E, + is_manifold_edge{this->edge2face_cnt.ptr} + )); + int manifold_edge_count; + CUDA_CHECK(hipMemcpy(&manifold_edge_count, cu_num_manifold_edges, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_num_manifold_edges)); + CUDA_CHECK(hipFree(cu_edge_idx)); + + // set manifold_face_adj + this->manifold_face_adj.resize(manifold_edge_count); + hipLaunchKernelGGL(( set_manifold_face_adj_kernel), 
dim3((manifold_edge_count+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_manifold_edge_idx, + this->edge2face.ptr, + this->edge2face_offset.ptr, + manifold_edge_count, + this->manifold_face_adj.ptr + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_manifold_edge_idx)); +} + + +static __global__ void set_manifold_bound_adj_kernel( + const int* manifold_boundary_verts_idx, + const int* vert2bound, + const int* vert2bound_offset, + const size_t MBV, + int2* manifold_bound_adj +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= MBV) return; + + // get vertex + int vert_idx = manifold_boundary_verts_idx[tid]; + + // get adjacent boundaries + int b0 = vert2bound[vert2bound_offset[vert_idx]]; + int b1 = vert2bound[vert2bound_offset[vert_idx] + 1]; + + manifold_bound_adj[tid] = {b0, b1}; +} + + +struct is_manifold_boundary_vertex { + const uint8_t* vert_is_manifold; + const uint8_t* vert_is_boundary; + __host__ __device__ + bool operator()(const int& idx) const { + return vert_is_manifold[idx] && vert_is_boundary[idx]; + } +}; + + +void CuMesh::get_manifold_boundary_adjacency() { + if (this->vert2bound.is_empty() || this->vert2bound_offset.is_empty()) { + this->get_vertex_boundary_adjacency(); + } + if (this->vert_is_manifold.is_empty()) { + this->get_vertex_is_manifold(); + } + size_t V = this->vertices.size; + + // Select manifold boundary vertices + size_t temp_storage_bytes = 0; + int *cu_num_manifold_boundary_verts, *cu_vert_idx, *cu_manifold_vert_idx; + CUDA_CHECK(hipMalloc(&cu_num_manifold_boundary_verts, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_vert_idx, V * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_manifold_vert_idx, V * sizeof(int))); + hipLaunchKernelGGL(( arange_kernel), dim3((V+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, cu_vert_idx, V); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipcub::DeviceSelect::If( + nullptr, temp_storage_bytes, + cu_vert_idx, cu_manifold_vert_idx, cu_num_manifold_boundary_verts, + V, + is_manifold_boundary_vertex{this->vert_is_manifold.ptr, this->vert_is_boundary.ptr} + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSelect::If( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_vert_idx, cu_manifold_vert_idx, cu_num_manifold_boundary_verts, + V, + is_manifold_boundary_vertex{this->vert_is_manifold.ptr, this->vert_is_boundary.ptr} + )); + int manifold_boundary_vert_count; + CUDA_CHECK(hipMemcpy(&manifold_boundary_vert_count, cu_num_manifold_boundary_verts, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_num_manifold_boundary_verts)); + CUDA_CHECK(hipFree(cu_vert_idx)); + + // Early return if no manifold boundary vertices + if (manifold_boundary_vert_count == 0) { + CUDA_CHECK(hipFree(cu_manifold_vert_idx)); + return; + } + + // set manifold_bound_adj + this->manifold_bound_adj.resize(manifold_boundary_vert_count); + hipLaunchKernelGGL(( set_manifold_bound_adj_kernel), dim3((manifold_boundary_vert_count+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_manifold_vert_idx, + this->vert2bound.ptr, + this->vert2bound_offset.ptr, + manifold_boundary_vert_count, + this->manifold_bound_adj.ptr + ); + CUDA_CHECK(hipGetLastError()); +} + + +void CuMesh::get_connected_components() { + if (this->manifold_face_adj.is_empty()) { + this->get_manifold_face_adjacency(); + } + + size_t M = this->manifold_face_adj.size; + size_t F = this->faces.size; + + // Iterative Hook and Compress + this->conn_comp_ids.resize(F); + hipLaunchKernelGGL(( arange_kernel), 
dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, this->conn_comp_ids.ptr, F);
+    CUDA_CHECK(hipGetLastError());
+    int* cu_end_flag; int h_end_flag;
+    CUDA_CHECK(hipMalloc(&cu_end_flag, sizeof(int)));
+    do {
+        h_end_flag = 1;
+        CUDA_CHECK(hipMemcpy(cu_end_flag, &h_end_flag, sizeof(int), hipMemcpyHostToDevice));
+
+        // Hook
+        hipLaunchKernelGGL(( hook_edges_kernel), dim3((M+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0,
+            this->manifold_face_adj.ptr,
+            M,
+            this->conn_comp_ids.ptr,
+            cu_end_flag
+        );
+        CUDA_CHECK(hipGetLastError());
+
+        // Compress
+        hipLaunchKernelGGL(( compress_components_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0,
+            this->conn_comp_ids.ptr,
+            F
+        );
+        CUDA_CHECK(hipGetLastError());
+        CUDA_CHECK(hipMemcpy(&h_end_flag, cu_end_flag, sizeof(int), hipMemcpyDeviceToHost));
+    } while (h_end_flag == 0);
+    CUDA_CHECK(hipFree(cu_end_flag));
+
+    // Compress connected component ids
+    this->num_conn_comps = compress_ids(this->conn_comp_ids.ptr, F, this->cub_temp_storage);
+}
+
+
+void CuMesh::get_boundary_connected_components() {
+    if (this->manifold_bound_adj.is_empty()) {
+        this->get_manifold_boundary_adjacency();
+    }
+    size_t M = this->manifold_bound_adj.size;
+    size_t B = this->boundaries.size;
+
+    // Early return if no boundaries
+    if (B == 0) {
+        this->num_bound_conn_comps = 0;
+        return;
+    }
+
+    // Iterative Hook and Compress
+    this->bound_conn_comp_ids.resize(B);
+    hipLaunchKernelGGL(( arange_kernel), dim3((B+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, this->bound_conn_comp_ids.ptr, B);
+    CUDA_CHECK(hipGetLastError());
+    int* cu_end_flag; int h_end_flag;
+    CUDA_CHECK(hipMalloc(&cu_end_flag, sizeof(int)));
+    do {
+        h_end_flag = 1;
+        CUDA_CHECK(hipMemcpy(cu_end_flag, &h_end_flag, sizeof(int), hipMemcpyHostToDevice));
+
+        // Hook
+        hipLaunchKernelGGL(( hook_edges_kernel), dim3((M+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0,
+            this->manifold_bound_adj.ptr,
+            M,
+            this->bound_conn_comp_ids.ptr,
+            cu_end_flag
+        );
+        CUDA_CHECK(hipGetLastError());
+
+        // Compress
+        hipLaunchKernelGGL(( compress_components_kernel), dim3((B+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0,
+            this->bound_conn_comp_ids.ptr,
+            B
+        );
+        CUDA_CHECK(hipGetLastError());
+        CUDA_CHECK(hipMemcpy(&h_end_flag, cu_end_flag, sizeof(int), hipMemcpyDeviceToHost));
+    } while (h_end_flag == 0);
+    CUDA_CHECK(hipFree(cu_end_flag));
+
+    // Compress boundary connected component ids
+    this->num_bound_conn_comps = compress_ids(this->bound_conn_comp_ids.ptr, B, this->cub_temp_storage);
+}
+
+
+static __global__ void is_bound_conn_comp_loop_kernel(
+    const uint64_t* edges,
+    const int* boundaries,
+    const int* bound_conn_comp_ids,
+    const int* vert2bound,
+    const int* vert2bound_offset,
+    const int B,
+    int* is_bound_conn_comp_loop
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= B) return;
+
+    // get edge
+    int eid = boundaries[tid];
+    uint64_t e = edges[eid];
+    int e0 = int(e >> 32);
+    int e1 = int(e & 0xFFFFFFFF);
+
+    int self_comp_id = bound_conn_comp_ids[tid];
+
+    // check if both vertices are connected to another boundary with the same connected component id
+    int cnt = 0;
+    for (int i = vert2bound_offset[e0]; i < vert2bound_offset[e0+1]; i++) {
+        int b = vert2bound[i];
+        if (b == tid) continue; // skip self
+        int comp_id = bound_conn_comp_ids[b];
+        if (comp_id == self_comp_id) cnt++;
+    }
+    if (cnt == 0) {
+        is_bound_conn_comp_loop[self_comp_id] = 0; // no loop
+        return;
+    }
+    cnt = 0;
+    for (int i = vert2bound_offset[e1]; i < vert2bound_offset[e1+1]; i++) {
+
int b = vert2bound[i]; + if (b == tid) continue; // skip self + int comp_id = bound_conn_comp_ids[b]; + if (comp_id == self_comp_id) cnt++; + } + if (cnt == 0) { + is_bound_conn_comp_loop[self_comp_id] = 0; // no loop + return; + } +} + + +void CuMesh::get_boundary_loops() { + if (this->bound_conn_comp_ids.is_empty()) { + this->get_boundary_connected_components(); + } + + size_t B = this->boundaries.size; + + // Early return if no boundaries or boundary components + if (B == 0 || this->num_bound_conn_comps == 0) { + this->num_bound_loops = 0; + return; + } + + // Check if boundary components are loops + int* cu_is_bound_conn_comp_loop; + CUDA_CHECK(hipMalloc(&cu_is_bound_conn_comp_loop, this->num_bound_conn_comps * sizeof(int))); + hipLaunchKernelGGL(( fill_kernel), dim3((this->num_bound_conn_comps+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_is_bound_conn_comp_loop, + this->num_bound_conn_comps, + 1 + ); + CUDA_CHECK(hipGetLastError()); + hipLaunchKernelGGL(( is_bound_conn_comp_loop_kernel), dim3((B+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->edges.ptr, + this->boundaries.ptr, + this->bound_conn_comp_ids.ptr, + this->vert2bound.ptr, + this->vert2bound_offset.ptr, + B, + cu_is_bound_conn_comp_loop + ); + CUDA_CHECK(hipGetLastError()); + int* cu_num_bound_loops; + CUDA_CHECK(hipMalloc(&cu_num_bound_loops, sizeof(int))); + size_t temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceReduce::Sum( + nullptr, temp_storage_bytes, + cu_is_bound_conn_comp_loop, + cu_num_bound_loops, + this->num_bound_conn_comps + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceReduce::Sum( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_is_bound_conn_comp_loop, + cu_num_bound_loops, + this->num_bound_conn_comps + )); + CUDA_CHECK(hipMemcpy(&this->num_bound_loops, cu_num_bound_loops, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_num_bound_loops)); + if (this->num_bound_loops == 0) { + CUDA_CHECK(hipFree(cu_is_bound_conn_comp_loop)); + return; + } + + // Sort boundaries by connected component ids + int *cu_bound_sorted, *cu_bound_conn_comp_ids_sorted; + CUDA_CHECK(hipMalloc(&cu_bound_sorted, B * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_bound_conn_comp_ids_sorted, B * sizeof(int))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, + this->bound_conn_comp_ids.ptr, cu_bound_conn_comp_ids_sorted, + this->boundaries.ptr, cu_bound_sorted, + B + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + this->cub_temp_storage.ptr, temp_storage_bytes, + this->bound_conn_comp_ids.ptr, cu_bound_conn_comp_ids_sorted, + this->boundaries.ptr, cu_bound_sorted, + B + )); + + // Select loops + int* cu_bound_is_on_loop; + CUDA_CHECK(hipMalloc(&cu_bound_is_on_loop, B * sizeof(int))); + hipLaunchKernelGGL(( index_kernel), dim3((B+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_is_bound_conn_comp_loop, + cu_bound_conn_comp_ids_sorted, + B, + cu_bound_is_on_loop + ); + CUDA_CHECK(hipGetLastError()); + CUDA_CHECK(hipFree(cu_is_bound_conn_comp_loop)); + this->loop_boundaries.resize(B); + int *cu_loop_bound_conn_comp_ids_sorted, *cu_num_bound_on_loop; + CUDA_CHECK(hipMalloc(&cu_loop_bound_conn_comp_ids_sorted, B * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num_bound_on_loop, sizeof(int))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + nullptr, temp_storage_bytes, + cu_bound_sorted, cu_bound_is_on_loop, 
this->loop_boundaries.ptr, cu_num_bound_on_loop, + B + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_bound_sorted, cu_bound_is_on_loop, this->loop_boundaries.ptr, cu_num_bound_on_loop, + B + )); + int num_bound_on_loop; + CUDA_CHECK(hipMemcpy(&num_bound_on_loop, cu_num_bound_on_loop, sizeof(int), hipMemcpyDeviceToHost)); + CUDA_CHECK(hipFree(cu_bound_sorted)); + this->loop_boundaries.resize(num_bound_on_loop); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + nullptr, temp_storage_bytes, + cu_bound_conn_comp_ids_sorted, cu_bound_is_on_loop, cu_loop_bound_conn_comp_ids_sorted, cu_num_bound_on_loop, + B + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_bound_conn_comp_ids_sorted, cu_bound_is_on_loop, cu_loop_bound_conn_comp_ids_sorted, cu_num_bound_on_loop, + B + )); + CUDA_CHECK(hipFree(cu_bound_conn_comp_ids_sorted)); + CUDA_CHECK(hipFree(cu_bound_is_on_loop)); + CUDA_CHECK(hipFree(cu_num_bound_on_loop)); + + // RLE + this->loop_boundaries_offset.resize(this->num_bound_loops + 1); + this->loop_boundaries_offset.zero(); + int* cu_rle_unique_out, *cu_rle_num_runs; + CUDA_CHECK(hipMalloc(&cu_rle_unique_out, this->num_bound_loops * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_rle_num_runs, sizeof(int))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( + nullptr, temp_storage_bytes, + cu_loop_bound_conn_comp_ids_sorted, + cu_rle_unique_out, this->loop_boundaries_offset.ptr, cu_rle_num_runs, + num_bound_on_loop + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceRunLengthEncode::Encode( + this->cub_temp_storage.ptr, temp_storage_bytes, + cu_loop_bound_conn_comp_ids_sorted, + cu_rle_unique_out, this->loop_boundaries_offset.ptr, cu_rle_num_runs, + num_bound_on_loop + )); + CUDA_CHECK(hipFree(cu_loop_bound_conn_comp_ids_sorted)); + CUDA_CHECK(hipFree(cu_rle_unique_out)); + CUDA_CHECK(hipFree(cu_rle_num_runs)); + + // Scan loop boundaries offset + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, + this->loop_boundaries_offset.ptr, + this->num_bound_loops + 1 + )); + this->cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + this->cub_temp_storage.ptr, temp_storage_bytes, + this->loop_boundaries_offset.ptr, + this->num_bound_loops + 1 + )); +} + + +} // namespace cumesh diff --git a/src/cumesh.h b/src/cumesh.h index 01a073b..e0da9ca 100644 --- a/src/cumesh.h +++ b/src/cumesh.h @@ -1,7 +1,11 @@ #pragma once +#ifdef __HIP_PLATFORM_AMD__ +#include +#else #include #include +#endif #include #include "utils.h" diff --git a/src/cumesh.hip b/src/cumesh.hip new file mode 100644 index 0000000..d5860db --- /dev/null +++ b/src/cumesh.hip @@ -0,0 +1,143 @@ +// !!! This is a file automatically generated by hipify!!! 
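+//
+// Implements CuMesh construction, destruction, size queries, and cache
+// invalidation. Note the asymmetry below: ~CuMesh() frees every buffer,
+// including `vertices` and `faces`, while clear_cache() keeps those two and
+// frees only derived data, so adjacency and atlas buffers are rebuilt lazily
+// by the next query that needs them.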
+#include "cumesh_hip.h" + + +namespace cumesh { + +CuMesh::CuMesh() {} + +CuMesh::~CuMesh() { + vertices.free(); + faces.free(); + face_areas.free(); + face_normals.free(); + vertex_normals.free(); + edges.free(); + boundaries.free(); + vert_is_boundary.free(); + vert_is_manifold.free(); + vert2edge.free(); + vert2edge_cnt.free(); + vert2edge_offset.free(); + vert2bound.free(); + vert2bound_cnt.free(); + vert2bound_offset.free(); + edge2face.free(); + edge2face_cnt.free(); + edge2face_offset.free(); + face2edge.free(); + vert2face.free(); + vert2face_cnt.free(); + vert2face_offset.free(); + manifold_face_adj.free(); + manifold_bound_adj.free(); + conn_comp_ids.free(); + bound_conn_comp_ids.free(); + loop_boundaries.free(); + loop_boundaries_offset.free(); + vertices_map.free(); + faces_map.free(); + edge_collapse_costs.free(); + propagated_costs.free(); + + atlas_chart_ids.free(); + atlas_chart_vertex_map.free(); + atlas_chart_faces.free(); + atlas_chart_faces_offset.free(); + atlas_chart_vertex_offset.free(); + atlas_chart_uvs.free(); + + atlas_chart_normal_cones.free(); + atlas_chart_adj.free(); + atlas_chart_adj_length.free(); + atlas_chart_perims.free(); + atlas_chart_areas.free(); + atlas_chart2edge.free(); + atlas_chart2edge_cnt.free(); + atlas_chart2edge_offset.free(); + + temp_storage.free(); + cub_temp_storage.free(); +} + +int CuMesh::num_vertices() const { + return vertices.size; +} + +int CuMesh::num_faces() const { + return faces.size; +} + +int CuMesh::num_edges() const { + return edges.size; +} + +int CuMesh::num_boundaries() const { + return boundaries.size; +} + +int CuMesh::num_conneted_components() const { + return num_conn_comps; +} + +int CuMesh::num_boundary_conneted_components() const { + return num_bound_conn_comps; +} + +int CuMesh::num_boundary_loops() const { + return num_bound_loops; +} + +void CuMesh::clear_cache() { + face_areas.free(); + face_normals.free(); + vertex_normals.free(); + edges.free(); + boundaries.free(); + vert_is_boundary.free(); + vert_is_manifold.free(); + vert2edge.free(); + vert2edge_cnt.free(); + vert2edge_offset.free(); + vert2bound.free(); + vert2bound_cnt.free(); + vert2bound_offset.free(); + edge2face.free(); + edge2face_cnt.free(); + edge2face_offset.free(); + face2edge.free(); + vert2face.free(); + vert2face_cnt.free(); + vert2face_offset.free(); + manifold_face_adj.free(); + manifold_bound_adj.free(); + conn_comp_ids.free(); + bound_conn_comp_ids.free(); + loop_boundaries.free(); + loop_boundaries_offset.free(); + vertices_map.free(); + faces_map.free(); + edge_collapse_costs.free(); + propagated_costs.free(); + + atlas_chart_ids.free(); + atlas_chart_vertex_map.free(); + atlas_chart_faces.free(); + atlas_chart_faces_offset.free(); + atlas_chart_vertex_offset.free(); + atlas_chart_uvs.free(); + + atlas_chart_normal_cones.free(); + atlas_chart_adj.free(); + atlas_chart_adj_length.free(); + atlas_chart_perims.free(); + atlas_chart_areas.free(); + atlas_chart2edge.free(); + atlas_chart2edge_cnt.free(); + atlas_chart2edge_offset.free(); + + temp_storage.free(); + cub_temp_storage.free(); +} + +} // namespace cumesh diff --git a/src/cumesh_hip.h b/src/cumesh_hip.h new file mode 100644 index 0000000..243e27d --- /dev/null +++ b/src/cumesh_hip.h @@ -0,0 +1,509 @@ +// !!! This is a file automatically generated by hipify!!! 
+#pragma once
+
+#ifdef __HIP_PLATFORM_AMD__
+#include
+#else
+#include
+#include
+#endif
+#include
+
+#include "utils_hip.h"
+
+
+#define BLOCK_SIZE 256
+
+
+namespace cumesh {
+
+class CuMesh {
+public:
+    Buffer vertices;
+    Buffer faces;
+
+    // Geometric properties
+    Buffer face_areas;
+    Buffer face_normals;
+    Buffer vertex_normals;
+
+    // Connectivity
+    Buffer edges;
+    Buffer boundaries;
+    Buffer vert_is_boundary;
+    Buffer vert_is_manifold;
+    Buffer vert2edge;
+    Buffer vert2edge_cnt;
+    Buffer vert2edge_offset;
+    Buffer vert2bound;
+    Buffer vert2bound_cnt;
+    Buffer vert2bound_offset;
+    Buffer edge2face;
+    Buffer edge2face_cnt;
+    Buffer edge2face_offset;
+    Buffer face2edge;
+    Buffer vert2face;
+    Buffer vert2face_cnt;
+    Buffer vert2face_offset;
+    Buffer manifold_face_adj;
+    Buffer manifold_bound_adj;
+    Buffer conn_comp_ids;
+    Buffer bound_conn_comp_ids;
+    Buffer loop_boundaries;
+    Buffer loop_boundaries_offset;
+    int num_conn_comps;
+    int num_bound_conn_comps;
+    int num_bound_loops;
+
+    // Cleanup
+    Buffer vertices_map;
+    Buffer faces_map;
+
+    // Simplification
+    Buffer edge_collapse_costs;
+    Buffer propagated_costs;
+
+    // Atlasing
+    int atlas_num_charts;
+    Buffer atlas_chart_ids;
+    Buffer atlas_chart_vertex_map;
+    Buffer atlas_chart_faces;
+    Buffer atlas_chart_faces_offset;
+    Buffer atlas_chart_vertex_offset;
+    Buffer atlas_chart_uvs;
+
+    Buffer atlas_chart_normal_cones;
+    Buffer atlas_chart_adj;
+    Buffer atlas_chart_adj_length;
+    Buffer atlas_chart_perims;
+    Buffer atlas_chart_areas;
+    Buffer atlas_chart2edge;
+    Buffer atlas_chart2edge_cnt;
+    Buffer atlas_chart2edge_offset;
+
+    // Temporary storage
+    Buffer temp_storage;
+    Buffer cub_temp_storage;
+
+    CuMesh();
+
+    ~CuMesh();
+
+    int num_vertices() const;
+
+    int num_faces() const;
+
+    int num_edges() const;
+
+    int num_boundaries() const;
+
+    int num_conneted_components() const;
+
+    int num_boundary_conneted_components() const;
+
+    int num_boundary_loops() const;
+
+    void clear_cache();
+
+    /**
+     * Initialize mesh
+     *
+     * @param vertices The vertex positions as a [V, 3] tensor.
+     * @param faces The triangle faces as an [F, 3] tensor.
+     */
+    void init(const torch::Tensor& vertices, const torch::Tensor& faces);
+
+    /**
+     * Get the mesh.
+     *
+     * @return A tuple of the vertex positions and the triangle faces.
+     */
+    std::tuple read();
+
+    /**
+     * Get the face normals.
+     *
+     * @return The face normals as an [F, 3] tensor.
+     */
+    torch::Tensor read_face_normals();
+
+    /**
+     * Get the normals of the vertices.
+     *
+     * @return The vertex normals as a [V, 3] tensor.
+     */
+    torch::Tensor read_vertex_normals();
+
+    /**
+     * Get the edges of the mesh.
+     *
+     * @return The edges as an [E, 2] tensor.
+     */
+    torch::Tensor read_edges();
+
+    /**
+     * Get the boundaries of the mesh.
+     *
+     * @return The boundaries as a [B] tensor.
+     * Each element is the index of a boundary edge.
+     */
+    torch::Tensor read_boundaries();
+
+    /**
+     * Get the manifold face adjacency.
+     *
+     * @return The manifold face adjacency as an [M, 2] tensor.
+     */
+    torch::Tensor read_manifold_face_adjacency();
+
+    /**
+     * Get the manifold boundary adjacency.
+     *
+     * @return The manifold boundary adjacency as an [M, 2] tensor.
+     */
+    torch::Tensor read_manifold_boundary_adjacency();
+
+    /**
+     * Get the connected components of the mesh.
+     *
+     * @return A tuple of:
+     * - The number of connected components.
+     * - The connected component ids as an [F] tensor.
+     */
+    std::tuple read_connected_components();
+
+    /**
+     * Get the connected components of the mesh boundaries.
+     *
+     * @return A tuple of:
+     * - The number of boundary connected components.
+     * - The boundary connected component ids as a [B] tensor.
+     */
+    std::tuple read_boundary_connected_components();
+
+    /**
+     * Get the boundary loops of the mesh.
+     *
+     * @return A tuple of:
+     * - The number of boundary loops.
+     * - The boundary loops as an [L] tensor.
+     * - The boundary loop offsets as an [L+1] tensor.
+     */
+    std::tuple read_boundary_loops();
+
+    /**
+     * Get all cached data.
+     *
+     * @return A dictionary of all cached data.
+     */
+    std::unordered_map read_all_cache();
+
+
+    // Geometric functions
+
+    /**
+     * Compute face areas.
+     * This function refreshes:
+     * - face_areas
+     */
+    void compute_face_areas();
+
+    /**
+     * Compute face normals.
+     * This function refreshes:
+     * - face_normals
+     */
+    void compute_face_normals();
+
+    /**
+     * Compute vertex normals.
+     * This function requires:
+     * - vert2face
+     * - vert2face_offset
+     * This function refreshes:
+     * - vertex_normals
+     */
+    void compute_vertex_normals();
+
+
+    // Connectivity functions
+
+    /**
+     * Get the vertex to face adjacency.
+     * This function refreshes:
+     * - vert2face
+     * - vert2face_cnt
+     * - vert2face_offset
+     */
+    void get_vertex_face_adjacency();
+
+    /**
+     * Get the edges of the mesh.
+     * This function refreshes:
+     * - edges
+     * - edge2face_cnt
+     */
+    void get_edges();
+
+    /**
+     * Get the edge to face adjacency of the mesh.
+     * This function requires:
+     * - edges
+     * - edge2face_cnt
+     * - vert2face
+     * - vert2face_offset
+     * This function refreshes:
+     * - edge2face
+     * - edge2face_offset
+     * - face2edge
+     */
+    void get_edge_face_adjacency();
+
+    /**
+     * Get the vertex to edge adjacency.
+     * This function requires:
+     * - edges
+     * This function refreshes:
+     * - vert2edge
+     * - vert2edge_cnt
+     * - vert2edge_offset
+     */
+    void get_vertex_edge_adjacency();
+
+    /**
+     * Get boundary information.
+     * This function requires:
+     * - edges
+     * - edge2face_cnt
+     * This function refreshes:
+     * - boundaries
+     * - vert_is_boundary
+     */
+    void get_boundary_info();
+
+    /**
+     * Get the vertex to boundary adjacency.
+     * This function requires:
+     * - edges
+     * - boundaries
+     * This function refreshes:
+     * - vert2bound
+     * - vert2bound_cnt
+     * - vert2bound_offset
+     */
+    void get_vertex_boundary_adjacency();
+
+    /**
+     * Get vertex manifold information.
+     * This function requires:
+     * - vert2edge
+     * - vert2edge_offset
+     * - edge2face_cnt
+     * This function refreshes:
+     * - vert_is_manifold
+     */
+    void get_vertex_is_manifold();
+
+    /**
+     * Get the face adjacency for manifold edges.
+     * This function requires:
+     * - edge2face
+     * - edge2face_offset
+     * This function refreshes:
+     * - manifold_face_adj
+     */
+    void get_manifold_face_adjacency();
+
+    /**
+     * Get the boundary adjacency for manifold boundary vertices.
+     * This function requires:
+     * - vert_is_manifold
+     * - vert2bound
+     * - vert2bound_offset
+     * This function refreshes:
+     * - manifold_bound_adj
+     */
+    void get_manifold_boundary_adjacency();
+
+    /**
+     * Get the connected components of the mesh.
+     * This function requires:
+     * - manifold_face_adj
+     * This function refreshes:
+     * - conn_comp_ids
+     */
+    void get_connected_components();
+
+    /**
+     * Get the boundary connected components of the mesh.
+     * This function requires:
+     * - manifold_bound_adj
+     * This function refreshes:
+     * - bound_conn_comp_ids
+     */
+    void get_boundary_connected_components();
+
+    /**
+     * Get the boundary loops of the mesh.
+     * This function requires:
+     * - vert2bound
+     * - vert2bound_offset
+     * - vert_is_boundary
+     * - bound_conn_comp_ids
+     * This function refreshes:
+     * - loop_boundaries
+     * - loop_boundaries_offset
+     */
+    void get_boundary_loops();
+
+
+    // Cleanup functions
+
+    /**
+     * Remove faces.
+     */
+    void remove_faces(torch::Tensor& face_mask);
+    void _remove_faces(uint8_t* face_mask);
+
+    /**
+     * Remove unreferenced vertices.
+     */
+    void remove_unreferenced_vertices();
+
+    /**
+     * Remove duplicate faces.
+     */
+    void remove_duplicate_faces();
+
+    /**
+     * Remove degenerate faces.
+     */
+    void remove_degenerate_faces(float abs_thresh, float rel_thresh);
+
+    /**
+     * Fill holes.
+     * This function requires:
+     * - loop_boundaries
+     * - loop_boundaries_offset
+     *
+     * @param max_hole_perimeter The maximum perimeter of a hole to be filled.
+     */
+    void fill_holes(float max_hole_perimeter);
+
+    /**
+     * Repair non-manifold edges by splitting edges.
+     * This function requires:
+     * - manifold_face_adj
+     * This function refreshes:
+     * - vertices
+     * - faces
+     * This function destroys:
+     * - All connectivity information
+     */
+    void repair_non_manifold_edges();
+
+    /**
+     * Remove faces on non-manifold edges.
+     * For each non-manifold edge (shared by >2 faces), only keep the first 2 faces.
+     * This repairs non-manifold edges by deleting faces instead of splitting vertices.
+     * This function requires:
+     * - edge2face
+     * - edge2face_offset
+     * - edge2face_cnt
+     * This function refreshes:
+     * - vertices
+     * - faces
+     * This function destroys:
+     * - All connectivity information
+     */
+    void remove_non_manifold_faces();
+
+    /**
+     * Remove small connected components.
+     * This function requires:
+     * - conn_comp_ids
+     * This function refreshes:
+     * - vertices
+     * - faces
+     * This function destroys:
+     * - All connectivity information
+     *
+     * @param min_area The minimum area of the connected components to be kept.
+     */
+    void remove_small_connected_components(float min_area);
+
+    /**
+     * Unify face orientations.
+     * This function requires:
+     * - manifold_face_adj
+     * This function refreshes:
+     * - faces
+     */
+    void unify_face_orientations();
+
+
+    // Simplification functions
+
+    /**
+     * Run the edge collapse algorithm.
+     * This function refreshes:
+     * - vertices
+     * - faces
+     * This function destroys:
+     * - All connectivity information
+     *
+     * @param lambda_edge_length The weight for the edge length term.
+     * @param lambda_skinny The weight for the skinny term.
+     * @param threshold The threshold for edge collapse cost.
+     * @return A tuple of the number of vertices and the number of faces after simplification.
+     */
+    std::tuple simplify_step(float lambda_edge_length, float lambda_skinny, float threshold, bool timing=false);
+
+
+    // Atlasing functions
+
+    /**
+     * Compute charts for atlasing.
+     * This function requires:
+     * - manifold_face_adj
+     * This function refreshes:
+     * - atlas_chart_ids
+     * - atlas_chart_vertex_map
+     * - atlas_chart_faces
+     * - atlas_chart_faces_offset
+     *
+     * @param threshold_cone_half_angle_rad The threshold for the cone half angle in radians.
+     * @param refine_iterations The number of refinement iterations.
+     * @param global_iterations The number of global iterations.
+     * @param smooth_strength The strength of the smoothing.
+     * @param area_penalty_weight Coefficient for chart size penalty. Cost += Area * weight.
+     * Prevents charts from becoming too large if > 0,
+     * or encourages larger charts if < 0 (though usually used to penalize size variance).
+ * @param perimeter_area_ratio_weight Coefficient for shape irregularity (long-strip) penalty. + * Cost += (Perimeter / Area) * weight. + * Higher values penalize long strips and encourage circular/compact shapes. + */ + void compute_charts( + float threshold_cone_half_angle_rad, + int refine_iterations, + int global_iterations, + float smooth_strength, + float area_penalty_weight, + float perimeter_area_ratio_weight + ); + + /** + * Read the atlas charts. + * + * @return A tuple of: + * - The number of charts. + * - The chart ids as an [F] tensor. + * - The chart vertex map as an [V] tensor. + * - The chart faces as an [F, 3] tensor. + * - The chart vertices offset as an [C+1] tensor. + * - The chart faces offset as an [C+1] tensor. + */ + std::tuple read_atlas_charts(); +}; + +} // namespace cumesh diff --git a/src/dtypes.cuh b/src/dtypes.cuh index bff560c..ddc31ea 100644 --- a/src/dtypes.cuh +++ b/src/dtypes.cuh @@ -1,7 +1,11 @@ #pragma once +#ifdef __HIP_PLATFORM_AMD__ +#include +#else #include #include +#endif namespace cumesh { @@ -13,9 +17,9 @@ namespace cumesh { struct __align__(16) Vec3f { float x, y, z; - __device__ __forceinline__ Vec3f(); - __device__ __forceinline__ Vec3f(float x, float y, float z); - __device__ __forceinline__ Vec3f(float3 v); + __host__ __device__ __forceinline__ Vec3f(); + __host__ __device__ __forceinline__ Vec3f(float x, float y, float z); + __host__ __device__ __forceinline__ Vec3f(float3 v); __device__ __forceinline__ Vec3f operator+(const Vec3f& o) const; __device__ __forceinline__ Vec3f& operator+=(const Vec3f& o); __device__ __forceinline__ Vec3f operator-(const Vec3f& o) const; @@ -55,19 +59,19 @@ struct __align__(16) QEM }; -__device__ __forceinline__ Vec3f::Vec3f() { +__host__ __device__ __forceinline__ Vec3f::Vec3f() { x = 0.0f; y = 0.0f; z = 0.0f; } -__device__ __forceinline__ Vec3f::Vec3f(float x, float y, float z) { +__host__ __device__ __forceinline__ Vec3f::Vec3f(float x, float y, float z) { this->x = x; this->y = y; this->z = z; } -__device__ __forceinline__ Vec3f::Vec3f(float3 v) { +__host__ __device__ __forceinline__ Vec3f::Vec3f(float3 v) { x = v.x; y = v.y; z = v.z; diff --git a/src/dtypes_hip.cuh b/src/dtypes_hip.cuh new file mode 100644 index 0000000..c46201d --- /dev/null +++ b/src/dtypes_hip.cuh @@ -0,0 +1,329 @@ +// !!! This is a file automatically generated by hipify!!! +#pragma once + +#ifdef __HIP_PLATFORM_AMD__ +#include +#else +#include +#include +#endif + + +namespace cumesh { + + +/** + * A 3D vector class with overloaded operators and methods. 
+ */ +struct __align__(16) Vec3f { + float x, y, z; + + __host__ __device__ __forceinline__ Vec3f(); + __host__ __device__ __forceinline__ Vec3f(float x, float y, float z); + __host__ __device__ __forceinline__ Vec3f(float3 v); + __device__ __forceinline__ Vec3f operator+(const Vec3f& o) const; + __device__ __forceinline__ Vec3f& operator+=(const Vec3f& o); + __device__ __forceinline__ Vec3f operator-(const Vec3f& o) const; + __device__ __forceinline__ Vec3f& operator-=(const Vec3f& o); + __device__ __forceinline__ Vec3f operator*(float s) const; + __device__ __forceinline__ Vec3f& operator*=(float s); + __device__ __forceinline__ Vec3f operator/(float s) const; + __device__ __forceinline__ Vec3f& operator/=(float s); + __device__ __forceinline__ float dot(const Vec3f& o) const; + __device__ __forceinline__ float norm() const; + __device__ __forceinline__ float norm2() const; + __device__ __forceinline__ Vec3f normalized() const; + __device__ __forceinline__ void normalize(); + __device__ __forceinline__ Vec3f cross(const Vec3f& o) const; + __device__ __forceinline__ Vec3f slerp(const Vec3f& o, float t) const; +}; + + +/** + * QEM (Quadric Error Metric) class for mesh simplification. + */ +struct __align__(16) QEM +{ + // store upper triangle of symmetric 4x4 matrix: + // e = [ 00, 01, 02, 03, 11, 12, 13, 22, 23, 33 ] + float e[10]; + + __device__ __forceinline__ QEM(); + __device__ __forceinline__ QEM operator+(const QEM& o) const; + __device__ __forceinline__ QEM& operator+=(const QEM& o); + __device__ __forceinline__ QEM operator-(const QEM& o) const; + __device__ __forceinline__ QEM& operator-=(const QEM& o); + __device__ __forceinline__ void zero(); + __device__ __forceinline__ void add_plane(float4 p); + __device__ __forceinline__ float evaluate(const Vec3f& p) const; + __device__ __forceinline__ bool solve_optimal(float3 &out, float &err) const; +}; + + +__host__ __device__ __forceinline__ Vec3f::Vec3f() { + x = 0.0f; + y = 0.0f; + z = 0.0f; +} + +__host__ __device__ __forceinline__ Vec3f::Vec3f(float x, float y, float z) { + this->x = x; + this->y = y; + this->z = z; +} + +__host__ __device__ __forceinline__ Vec3f::Vec3f(float3 v) { + x = v.x; + y = v.y; + z = v.z; +} + + +__device__ __forceinline__ Vec3f Vec3f::operator+(const Vec3f& o) const { + return Vec3f(x + o.x, y + o.y, z + o.z); +} + + +__device__ __forceinline__ Vec3f& Vec3f::operator+=(const Vec3f& o) { + x += o.x; + y += o.y; + z += o.z; + return *this; +} + + +__device__ __forceinline__ Vec3f Vec3f::operator-(const Vec3f& o) const { + return Vec3f(x - o.x, y - o.y, z - o.z); +} + + +__device__ __forceinline__ Vec3f& Vec3f::operator-=(const Vec3f& o) { + x -= o.x; + y -= o.y; + z -= o.z; + return *this; +} + + +__device__ __forceinline__ Vec3f Vec3f::operator*(float s) const { + return Vec3f(x * s, y * s, z * s); +} + + +__device__ __forceinline__ Vec3f& Vec3f::operator*=(float s) { + x *= s; + y *= s; + z *= s; + return *this; +} + + +__device__ __forceinline__ Vec3f Vec3f::operator/(float s) const { + return Vec3f(x / s, y / s, z / s); +} + + +__device__ __forceinline__ Vec3f& Vec3f::operator/=(float s) { + x /= s; + y /= s; + z /= s; + return *this; +} + + +__device__ __forceinline__ float Vec3f::dot(const Vec3f& o) const { + return x * o.x + y * o.y + z * o.z; +} + + +__device__ __forceinline__ float Vec3f::norm() const { + return sqrtf(x * x + y * y + z * z); +} + + +__device__ __forceinline__ float Vec3f::norm2() const { + return x * x + y * y + z * z; +} + + +__device__ __forceinline__ Vec3f Vec3f::normalized() 
const { + float inv_norm = rsqrtf(x * x + y * y + z * z); + return Vec3f(x * inv_norm, y * inv_norm, z * inv_norm); +} + + +__device__ __forceinline__ void Vec3f::normalize() { + float inv_norm = rsqrtf(x * x + y * y + z * z); + x *= inv_norm; + y *= inv_norm; + z *= inv_norm; +} + + +__device__ __forceinline__ Vec3f Vec3f::cross(const Vec3f& o) const { + return Vec3f(y * o.z - z * o.y, z * o.x - x * o.z, x * o.y - y * o.x); +} + + +__device__ __forceinline__ Vec3f Vec3f::slerp(const Vec3f& o, float t) const { + float dot_prod = this->dot(o); + dot_prod = fmaxf(fminf(dot_prod, 1.0f), -1.0f); // Clamp to [-1, 1] + float theta = acosf(dot_prod) * t; + Vec3f relative_vec = (o - (*this) * dot_prod).normalized(); + return (*this) * cosf(theta) + relative_vec * sinf(theta); +} + + +__device__ __forceinline__ QEM::QEM() { + zero(); +} + + +__device__ __forceinline__ QEM QEM::operator+(const QEM& o) const { + QEM res; + #pragma unroll + for (int i = 0; i < 10; ++i) res.e[i] = e[i] + o.e[i]; + return res; +} + + +__device__ __forceinline__ QEM& QEM::operator+=(const QEM& o) { + #pragma unroll + for (int i = 0; i < 10; ++i) e[i] += o.e[i]; + return *this; +} + + +__device__ __forceinline__ QEM QEM::operator-(const QEM& o) const { + QEM res; + #pragma unroll + for (int i = 0; i < 10; ++i) res.e[i] = e[i] - o.e[i]; + return res; +} + + +__device__ __forceinline__ QEM& QEM::operator-=(const QEM& o) { + #pragma unroll + for (int i = 0; i < 10; ++i) e[i] -= o.e[i]; + return *this; +} + +__device__ __forceinline__ void QEM::zero() { + #pragma unroll + for (int i = 0; i < 10; ++i) e[i] = 0.0f; +} + + +// Add plane p = (a,b,c,d) as outer product p * p^T +__device__ __forceinline__ void QEM::add_plane(float4 p) { + // upper triangle indices mapping: + // (0,0)->e[0] + // (0,1)->e[1] + // (0,2)->e[2] + // (0,3)->e[3] + // (1,1)->e[4] + // (1,2)->e[5] + // (1,3)->e[6] + // (2,2)->e[7] + // (2,3)->e[8] + // (3,3)->e[9] + float a = p.x, b = p.y, c = p.z, d = p.w; + e[0] += a * a; + e[1] += a * b; + e[2] += a * c; + e[3] += a * d; + e[4] += b * b; + e[5] += b * c; + e[6] += b * d; + e[7] += c * c; + e[8] += c * d; + e[9] += d * d; +} + + +// Evaluate v^T * Q * v for v = (x,y,z,1) +__device__ __forceinline__ float QEM::evaluate(const Vec3f& p) const { + // compute v = [x,y,z,1] + float x = p.x, y = p.y, z = p.z, w = 1.0f; + // expand symmetric multiplication using stored upper triangular + // result = sum_{i<=j} M_ij * v_i * v_j * (1 if i==j else 2) + float res = 0.0f; + // (0,0) + res += e[0] * x * x; + // (0,1) and (1,0) + res += 2.0f * e[1] * x * y; + // (0,2) + res += 2.0f * e[2] * x * z; + // (0,3) + res += 2.0f * e[3] * x * w; + // (1,1) + res += e[4] * y * y; + // (1,2) + res += 2.0f * e[5] * y * z; + // (1,3) + res += 2.0f * e[6] * y * w; + // (2,2) + res += e[7] * z * z; + // (2,3) + res += 2.0f * e[8] * z * w; + // (3,3) + res += e[9] * w * w; + return res; +} + + +// Try to solve for optimal point minimizing v^T Q v with constraint v = (x,y,z,1) +// Solve the linear system: A * [x y z]^T = -b, where +// A = top-left 3x3 of Q, b = [e03, e13, e23] (note signs) +// Return true if solved (matrix invertible), false otherwise. err returns the error at the solution. 
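+// (Derivation: with v = (p, 1), v^T Q v = p^T A p + 2 b^T p + e[9], so the
+// minimizer satisfies the gradient condition 2 (A p + b) = 0, i.e. A p = -b,
+// which is the system solved below; at that optimum the error reduces to
+// dot(b, p) + e[9].)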
+__device__ __forceinline__ bool QEM::solve_optimal(float3 &out, float &err) const { + // Build A (symmetric) + float A00 = e[0]; + float A01 = e[1]; + float A02 = e[2]; + float A11 = e[4]; + float A12 = e[5]; + float A22 = e[7]; + // b = (e03, e13, e23) where e03=e[3], e13=e[6], e23=e[8] + float b0 = e[3]; + float b1 = e[6]; + float b2 = e[8]; + + // Solve A * x = -b + // Use analytic inverse for 3x3 symmetric matrix (compute determinant) + // Compute determinant + float det = + A00 * (A11 * A22 - A12 * A12) - + A01 * (A01 * A22 - A12 * A02) + + A02 * (A01 * A12 - A11 * A02); + + if (fabsf(det) < 1e-12f) { + // singular - fall back: pick minimal among corners (or average 0) + // Here choose to put out as (0,0,0) + out = make_float3(0.0f, 0.0f, 0.0f); + err = evaluate(out); + return false; + } + + float invDet = 1.0f / det; + + // Compute inverse(A) via adjugate + float inv00 = (A11 * A22 - A12 * A12) * invDet; + float inv01 = -(A01 * A22 - A12 * A02) * invDet; + float inv02 = (A01 * A12 - A11 * A02) * invDet; + float inv11 = (A00 * A22 - A02 * A02) * invDet; + float inv12 = -(A00 * A12 - A01 * A02) * invDet; + float inv22 = (A00 * A11 - A01 * A01) * invDet; + + // x = -inv(A) * b + float x = -(inv00 * b0 + inv01 * b1 + inv02 * b2); + float y = -(inv01 * b0 + inv11 * b1 + inv12 * b2); + float z = -(inv02 * b0 + inv12 * b1 + inv22 * b2); + + out = make_float3(x, y, z); + err = evaluate(out); + return true; +} + + +} // namespace cumesh diff --git a/src/ext_hip.cpp b/src/ext_hip.cpp new file mode 100644 index 0000000..e24577d --- /dev/null +++ b/src/ext_hip.cpp @@ -0,0 +1,69 @@ +// !!! This is a file automatically generated by hipify!!! +#include +#include "hash/api.h" +#include "cumesh_hip.h" +#include "remesh/api.h" + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + // Hash functions + m.def("hashmap_insert_cuda", &cumesh::hashmap_insert_cuda); + m.def("hashmap_lookup_cuda", &cumesh::hashmap_lookup_cuda); + m.def("hashmap_insert_3d_cuda", &cumesh::hashmap_insert_3d_cuda); + m.def("hashmap_lookup_3d_cuda", &cumesh::hashmap_lookup_3d_cuda); + m.def("hashmap_insert_3d_idx_as_val_cuda", &cumesh::hashmap_insert_3d_idx_as_val_cuda); + + /* CUMESH */ + py::class_(m, "CuMesh") + .def(py::init<>()) + .def("num_vertices", &cumesh::CuMesh::num_vertices) + .def("num_faces", &cumesh::CuMesh::num_faces) + .def("num_edges", &cumesh::CuMesh::num_edges) + .def("num_boundaries", &cumesh::CuMesh::num_boundaries) + .def("num_conneted_components", &cumesh::CuMesh::num_conneted_components) + .def("num_boundary_conneted_components", &cumesh::CuMesh::num_boundary_conneted_components) + .def("num_boundary_loops", &cumesh::CuMesh::num_boundary_loops) + .def("clear_cache", &cumesh::CuMesh::clear_cache) + .def("init", &cumesh::CuMesh::init) + .def("read", &cumesh::CuMesh::read) + .def("read_face_normals", &cumesh::CuMesh::read_face_normals) + .def("read_vertex_normals", &cumesh::CuMesh::read_vertex_normals) + .def("read_edges", &cumesh::CuMesh::read_edges) + .def("read_boundaries", &cumesh::CuMesh::read_boundaries) + .def("read_manifold_face_adjacency", &cumesh::CuMesh::read_manifold_face_adjacency) + .def("read_manifold_boundary_adjacency", &cumesh::CuMesh::read_manifold_boundary_adjacency) + .def("read_connected_components", &cumesh::CuMesh::read_connected_components) + .def("read_boundary_connected_components", &cumesh::CuMesh::read_boundary_connected_components) + .def("read_boundary_loops", &cumesh::CuMesh::read_boundary_loops) + .def("read_all_cache", &cumesh::CuMesh::read_all_cache) + 
.def("compute_face_normals", &cumesh::CuMesh::compute_face_normals) + .def("compute_vertex_normals", &cumesh::CuMesh::compute_vertex_normals) + .def("get_vertex_face_adjacency", &cumesh::CuMesh::get_vertex_face_adjacency) + .def("get_edges", &cumesh::CuMesh::get_edges) + .def("get_edge_face_adjacency", &cumesh::CuMesh::get_edge_face_adjacency) + .def("get_vertex_edge_adjacency", &cumesh::CuMesh::get_vertex_edge_adjacency) + .def("get_boundary_info", &cumesh::CuMesh::get_boundary_info) + .def("get_vertex_boundary_adjacency", &cumesh::CuMesh::get_vertex_boundary_adjacency) + .def("get_vertex_is_manifold", &cumesh::CuMesh::get_vertex_is_manifold) + .def("get_manifold_face_adjacency", &cumesh::CuMesh::get_manifold_face_adjacency) + .def("get_manifold_boundary_adjacency", &cumesh::CuMesh::get_manifold_boundary_adjacency) + .def("get_connected_components", &cumesh::CuMesh::get_connected_components) + .def("get_boundary_connected_components", &cumesh::CuMesh::get_boundary_connected_components) + .def("get_boundary_loops", &cumesh::CuMesh::get_boundary_loops) + .def("remove_faces", &cumesh::CuMesh::remove_faces) + .def("remove_unreferenced_vertices", &cumesh::CuMesh::remove_unreferenced_vertices) + .def("remove_duplicate_faces", &cumesh::CuMesh::remove_duplicate_faces) + .def("remove_degenerate_faces", &cumesh::CuMesh::remove_degenerate_faces) + .def("fill_holes", &cumesh::CuMesh::fill_holes) + .def("repair_non_manifold_edges", &cumesh::CuMesh::repair_non_manifold_edges) + .def("remove_non_manifold_faces", &cumesh::CuMesh::remove_non_manifold_faces) + .def("remove_small_connected_components", &cumesh::CuMesh::remove_small_connected_components) + .def("unify_face_orientations", &cumesh::CuMesh::unify_face_orientations) + .def("simplify_step", &cumesh::CuMesh::simplify_step) + .def("compute_charts", &cumesh::CuMesh::compute_charts) + .def("read_atlas_charts", &cumesh::CuMesh::read_atlas_charts); + + // Remeshing functions + m.def("get_sparse_voxel_grid_active_vertices", &cumesh::get_sparse_voxel_grid_active_vertices); + m.def("simple_dual_contour", &cumesh::simple_dual_contour); +} \ No newline at end of file diff --git a/src/geometry.cu b/src/geometry.cu index 0e493ba..8866ee4 100644 --- a/src/geometry.cu +++ b/src/geometry.cu @@ -1,7 +1,11 @@ #include "cumesh.h" #include "dtypes.cuh" #include "shared.h" +#ifdef __HIP_PLATFORM_AMD__ +#include +#else #include +#endif namespace cumesh { @@ -32,7 +36,7 @@ void CuMesh::compute_face_areas() { F, this->face_areas.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } @@ -65,7 +69,7 @@ void CuMesh::compute_face_normals() { F, this->face_normals.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } @@ -123,7 +127,7 @@ void CuMesh::compute_vertex_normals() { V, this->vertex_normals.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } diff --git a/src/geometry.hip b/src/geometry.hip new file mode 100644 index 0000000..5cbb9ce --- /dev/null +++ b/src/geometry.hip @@ -0,0 +1,136 @@ +// !!! This is a file automatically generated by hipify!!! 
+#include "hip/hip_runtime.h" +#include "cumesh_hip.h" +#include "dtypes_hip.cuh" +#include "shared_hip.h" +#ifdef __HIP_PLATFORM_AMD__ +#include +#else +#include +#endif + + +namespace cumesh { + + +static __global__ void compute_face_areas_kernel( + const float3* vertices, + const int3* faces, + const size_t F, + float* face_areas +) { + const int fid = blockIdx.x * blockDim.x + threadIdx.x; + if (fid >= F) return; + int3 face = faces[fid]; + Vec3f v0 = Vec3f(vertices[face.x]); + Vec3f v1 = Vec3f(vertices[face.y]); + Vec3f v2 = Vec3f(vertices[face.z]); + face_areas[fid] = 0.5 * (v1 - v0).cross(v2 - v0).norm(); +} + + +void CuMesh::compute_face_areas() { + size_t F = this->faces.size; + this->face_areas.resize(F); + hipLaunchKernelGGL(( compute_face_areas_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->vertices.ptr, + this->faces.ptr, + F, + this->face_areas.ptr + ); + CUDA_CHECK(hipGetLastError()); +} + + +static __global__ void compute_face_normals_kernel( + const float3* vertices, + const int3* faces, + const size_t F, + float3* face_normals +) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= F) return; + + int3 face = faces[tid]; + Vec3f v0 = Vec3f(vertices[face.x]); + Vec3f v1 = Vec3f(vertices[face.y]); + Vec3f v2 = Vec3f(vertices[face.z]); + + Vec3f normal = (v1 - v0).cross(v2 - v0); + normal.normalize(); + face_normals[tid] = make_float3(normal.x, normal.y, normal.z); +} + + +void CuMesh::compute_face_normals() { + size_t F = this->faces.size; + this->face_normals.resize(F); + hipLaunchKernelGGL(( compute_face_normals_kernel), dim3((F + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->vertices.ptr, + this->faces.ptr, + F, + this->face_normals.ptr + ); + CUDA_CHECK(hipGetLastError()); +} + + +static __global__ void compute_vertex_normals_kernel( + const float3* vertices, + const int3* faces, + const int* vert2face, + const int* vert2face_offset, + const size_t V, + float3* vertex_normals +) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= V) return; + + int start = vert2face_offset[tid]; + int end = vert2face_offset[tid + 1]; + + Vec3f normal(0.0f, 0.0f, 0.0f); + Vec3f first_face_normal; + for (int i = start; i < end; i++) { + int fid = vert2face[i]; + int3 face = faces[fid]; + Vec3f v0 = Vec3f(vertices[face.x]); + Vec3f v1 = Vec3f(vertices[face.y]); + Vec3f v2 = Vec3f(vertices[face.z]); + + Vec3f face_normal = (v1 - v0).cross(v2 - v0); + normal += face_normal; + if (i == start) { + first_face_normal = face_normal; + } + } + + normal.normalize(); + // if NAN, fallback to first face normal + if (isnan(normal.x)) { + normal = first_face_normal; + } + vertex_normals[tid] = make_float3(normal.x, normal.y, normal.z); +} + + +void CuMesh::compute_vertex_normals() { + if (this->vert2face.is_empty() || this->vert2face_offset.is_empty()) { + this->get_vertex_face_adjacency(); + } + + size_t V = this->vertices.size; + this->vertex_normals.resize(V); + hipLaunchKernelGGL(( compute_vertex_normals_kernel), dim3((V + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + this->vertices.ptr, + this->faces.ptr, + this->vert2face.ptr, + this->vert2face_offset.ptr, + V, + this->vertex_normals.ptr + ); + CUDA_CHECK(hipGetLastError()); +} + + +} // namespace cumesh \ No newline at end of file diff --git a/src/hash/hash.cu b/src/hash/hash.cu index a9b1c23..9fd20c9 100644 --- a/src/hash/hash.cu +++ b/src/hash/hash.cu @@ -1,6 +1,10 @@ #include +#ifdef __HIP_PLATFORM_AMD__ +#include +#else #include #include +#endif #include 
"api.h" #include "hash.cuh" diff --git a/src/hash/hash.hip b/src/hash/hash.hip new file mode 100644 index 0000000..0e02def --- /dev/null +++ b/src/hash/hash.hip @@ -0,0 +1,451 @@ +// !!! This is a file automatically generated by hipify!!! +#include +#ifdef __HIP_PLATFORM_AMD__ +#include +#else +#include +#include +#endif + +#include "api.h" +#include "hash.cuh" + + +template +static __global__ void hashmap_insert_cuda_kernel( + const size_t N, + const size_t M, + K* __restrict__ hashmap_keys, + V* __restrict__ hashmap_values, + const K* __restrict__ keys, + const V* __restrict__ values +) { + size_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_id < M) + { + K key = keys[thread_id]; + V value = values[thread_id]; + linear_probing_insert(hashmap_keys, hashmap_values, key, value, N); + } +} + + +template +static void dispatch_hashmap_insert_cuda( + torch::Tensor& hashmap_keys, + torch::Tensor& hashmap_values, + const torch::Tensor& keys, + const torch::Tensor& values +) { + hipLaunchKernelGGL(( hashmap_insert_cuda_kernel), + dim3((keys.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE), + dim3(BLOCK_SIZE) + , 0, 0, + hashmap_keys.size(0), + keys.size(0), + hashmap_keys.data_ptr(), + hashmap_values.data_ptr(), + keys.data_ptr(), + values.data_ptr() + ); +} + + +/** + * Insert keys into the hashmap + * + * @param hashmap_keys [N] uint32/uint64 tensor containing the hashmap keys + * @param hashmap_values [N] uint32/uint64 tensor containing the hashmap values + * @param keys [M] uint32/uint64 tensor containing the keys to be inserted + * @param values [M] uint32/uint64 tensor containing the values to be inserted + */ +void cumesh::hashmap_insert_cuda( + torch::Tensor& hashmap_keys, + torch::Tensor& hashmap_values, + const torch::Tensor& keys, + const torch::Tensor& values +) { + // Dispatch to 32-bit or 64-bit kernel + if (hashmap_keys.dtype() == torch::kUInt32 && hashmap_values.dtype() == torch::kUInt32) { + TORCH_CHECK(keys.dtype() == torch::kUInt32, "Keys must be uint32"); + TORCH_CHECK(values.dtype() == torch::kUInt32, "Values must be uint32"); + dispatch_hashmap_insert_cuda(hashmap_keys, hashmap_values, keys, values); + } + else if (hashmap_keys.dtype() == torch::kUInt32 && hashmap_values.dtype() == torch::kUInt64) { + TORCH_CHECK(keys.dtype() == torch::kUInt32, "Keys must be uint32"); + TORCH_CHECK(values.dtype() == torch::kUInt64, "Values must be uint64"); + dispatch_hashmap_insert_cuda(hashmap_keys, hashmap_values, keys, values); + } + else if (hashmap_keys.dtype() == torch::kUInt64 && hashmap_values.dtype() == torch::kUInt32) { + TORCH_CHECK(keys.dtype() == torch::kUInt64, "Keys must be uint64"); + TORCH_CHECK(values.dtype() == torch::kUInt32, "Values must be uint32"); + dispatch_hashmap_insert_cuda(hashmap_keys, hashmap_values, keys, values); + } + else if (hashmap_keys.dtype() == torch::kUInt64 && hashmap_values.dtype() == torch::kUInt64) { + TORCH_CHECK(keys.dtype() == torch::kUInt64, "Keys must be uint64"); + TORCH_CHECK(values.dtype() == torch::kUInt64, "Values must be uint64"); + dispatch_hashmap_insert_cuda(hashmap_keys, hashmap_values, keys, values); + } + else { + TORCH_CHECK(false, "Unsupported data type"); + } +} + + +template +static __global__ void hashmap_lookup_cuda_kernel( + const size_t N, + const size_t M, + const K * __restrict__ hashmap_keys, + const V * __restrict__ hashmap_values, + const K * __restrict__ keys, + V * __restrict__ values +) { + size_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_id < M) { + K key = keys[thread_id]; + 
values[thread_id] = linear_probing_lookup(hashmap_keys, hashmap_values, key, N); + } +} + + +template +static void dispatch_hashmap_lookup_cuda( + const torch::Tensor& hashmap_keys, + const torch::Tensor& hashmap_values, + const torch::Tensor& keys, + torch::Tensor& values +) { + hipLaunchKernelGGL(( hashmap_lookup_cuda_kernel), + dim3((keys.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE), + dim3(BLOCK_SIZE) + , 0, 0, + hashmap_keys.size(0), + keys.size(0), + hashmap_keys.data_ptr(), + hashmap_values.data_ptr(), + keys.data_ptr(), + values.data_ptr() + ); +} + + +/** + * Lookup keys in the hashmap + * + * @param hashmap_keys [N] uint32/uint64 tensor containing the hashmap keys + * @param hashmap_values [N] uint32/uint64 tensor containing the hashmap values + * @param keys [M] uint32/uint64 tensor containing the keys to be looked up + * @return [M] uint32/uint64 tensor containing the values of the keys + */ +torch::Tensor cumesh::hashmap_lookup_cuda( + const torch::Tensor& hashmap_keys, + const torch::Tensor& hashmap_values, + const torch::Tensor& keys +) { + // Allocate output tensor + auto output = torch::empty({keys.size(0)}, torch::dtype(hashmap_values.dtype()).device(hashmap_values.device())); + + // Dispatch to 32-bit or 64-bit kernel + if (hashmap_keys.dtype() == torch::kUInt32 && hashmap_values.dtype() == torch::kUInt32) { + TORCH_CHECK(keys.dtype() == torch::kUInt32, "Keys must be uint32"); + TORCH_CHECK(output.dtype() == torch::kUInt32, "Output must be uint32"); + dispatch_hashmap_lookup_cuda(hashmap_keys, hashmap_values, keys, output); + } + else if (hashmap_keys.dtype() == torch::kUInt32 && hashmap_values.dtype() == torch::kUInt64) { + TORCH_CHECK(keys.dtype() == torch::kUInt32, "Keys must be uint32"); + TORCH_CHECK(output.dtype() == torch::kUInt64, "Output must be uint64"); + dispatch_hashmap_lookup_cuda(hashmap_keys, hashmap_values, keys, output); + } + else if (hashmap_keys.dtype() == torch::kUInt64 && hashmap_values.dtype() == torch::kUInt32) { + TORCH_CHECK(keys.dtype() == torch::kUInt64, "Keys must be uint64"); + TORCH_CHECK(output.dtype() == torch::kUInt32, "Output must be uint32"); + dispatch_hashmap_lookup_cuda(hashmap_keys, hashmap_values, keys, output); + } + else if (hashmap_keys.dtype() == torch::kUInt64 && hashmap_values.dtype() == torch::kUInt64) { + TORCH_CHECK(keys.dtype() == torch::kUInt64, "Keys must be uint64"); + TORCH_CHECK(output.dtype() == torch::kUInt64, "Output must be uint64"); + dispatch_hashmap_lookup_cuda(hashmap_keys, hashmap_values, keys, output); + } + else { + TORCH_CHECK(false, "Unsupported data type"); + } + + return output; +} + + +template +static __global__ void hashmap_insert_3d_cuda_kernel( + const size_t N, + const size_t M, + const int W, + const int H, + const int D, + K* __restrict__ hashmap_keys, + V* __restrict__ hashmap_values, + const int32_t* __restrict__ coords, + const V* __restrict__ values +) { + size_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_id < M) { + int4 coord = reinterpret_cast(coords)[thread_id]; + int b = coord.x; + int x = coord.y; + int y = coord.z; + int z = coord.w; + size_t flat_idx = (size_t)b * W * H * D + (size_t)x * H * D + (size_t)y * D + z; + K key = static_cast(flat_idx); + V value = values[thread_id]; + linear_probing_insert(hashmap_keys, hashmap_values, key, value, N); + } +} + + +template +static void dispatch_hashmap_insert_3d_cuda( + torch::Tensor& hashmap_keys, + torch::Tensor& hashmap_values, + const torch::Tensor& coords, + const torch::Tensor& values, + int W, int H, int D +) { + 
hipLaunchKernelGGL(( hashmap_insert_3d_cuda_kernel), + dim3((coords.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE), + dim3(BLOCK_SIZE) + , 0, 0, + hashmap_keys.size(0), + coords.size(0), + W, H, D, + hashmap_keys.data_ptr(), + hashmap_values.data_ptr(), + coords.data_ptr(), + values.data_ptr() + ); +} + + +/** + * Insert 3D coordinates into the hashmap + * + * @param hashmap_keys [N] uint32/uint64 tensor containing the hashmap keys + * @param hashmap_values [N] uint32/uint64 tensor containing the hashmap values + * @param coords [M, 4] int32 tensor containing the keys to be inserted + * @param values [M] uint32/uint64 tensor containing the values to be inserted + * @param W the number of width dimensions + * @param H the number of height dimensions + * @param D the number of depth dimensions + */ +void cumesh::hashmap_insert_3d_cuda( + torch::Tensor& hashmap_keys, + torch::Tensor& hashmap_values, + const torch::Tensor& coords, + const torch::Tensor& values, + int W, + int H, + int D +) { + TORCH_CHECK(coords.dtype() == torch::kInt32, "Coords must be int32"); + + // Dispatch to 32-bit or 64-bit kernel + if (hashmap_keys.dtype() == torch::kUInt32 && hashmap_values.dtype() == torch::kUInt32) { + TORCH_CHECK(values.dtype() == torch::kUInt32, "Values must be uint32"); + dispatch_hashmap_insert_3d_cuda(hashmap_keys, hashmap_values, coords, values, W, H, D); + } + else if (hashmap_keys.dtype() == torch::kUInt32 && hashmap_values.dtype() == torch::kUInt64) { + TORCH_CHECK(values.dtype() == torch::kUInt64, "Values must be uint64"); + dispatch_hashmap_insert_3d_cuda(hashmap_keys, hashmap_values, coords, values, W, H, D); + } + else if (hashmap_keys.dtype() == torch::kUInt64 && hashmap_values.dtype() == torch::kUInt32) { + TORCH_CHECK(values.dtype() == torch::kUInt32, "Values must be uint32"); + dispatch_hashmap_insert_3d_cuda(hashmap_keys, hashmap_values, coords, values, W, H, D); + } + else if (hashmap_keys.dtype() == torch::kUInt64 && hashmap_values.dtype() == torch::kUInt64) { + TORCH_CHECK(values.dtype() == torch::kUInt64, "Values must be uint64"); + dispatch_hashmap_insert_3d_cuda(hashmap_keys, hashmap_values, coords, values, W, H, D); + } + else { + TORCH_CHECK(false, "Unsupported data type"); + } +} + + +template +static __global__ void hashmap_lookup_3d_cuda_kernel( + const size_t N, + const size_t M, + const int W, + const int H, + const int D, + const K* __restrict__ hashmap_keys, + const V* __restrict__ hashmap_values, + const int32_t* __restrict__ coords, + V* __restrict__ values +) { + const size_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_id < M) { + int4 coord = reinterpret_cast(coords)[thread_id]; + int b = coord.x; + int x = coord.y; + int y = coord.z; + int z = coord.w; + if (x < 0 || x >= W || y < 0 || y >= H || z < 0 || z >= D) { + values[thread_id] = std::numeric_limits::max(); + return; + } + size_t flat_idx = (size_t)b * W * H * D + (size_t)x * H * D + (size_t)y * D + z; + K key = static_cast(flat_idx); + values[thread_id] = linear_probing_lookup(hashmap_keys, hashmap_values, key, N); + } +} + + +template +static void dispatch_hashmap_lookup_3d_cuda( + const torch::Tensor& hashmap_keys, + const torch::Tensor& hashmap_values, + const torch::Tensor& coords, + torch::Tensor& values, + int W, int H, int D +) { + hipLaunchKernelGGL(( hashmap_lookup_3d_cuda_kernel), + dim3((coords.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE), + dim3(BLOCK_SIZE) + , 0, 0, + hashmap_keys.size(0), + coords.size(0), + W, H, D, + hashmap_keys.data_ptr(), + hashmap_values.data_ptr(), + 
coords.data_ptr(), + values.data_ptr() + ); +} + + +/** + * Lookup 3D coordinates in the hashmap + * + * @param hashmap_keys [N] uint32/uint64 tensor containing the hashmap keys + * @param hashmap_values [N] uint32/uint64 tensor containing the hashmap values + * @param coords [M, 4] int32 tensor containing the keys to be looked up + * @param W the number of width dimensions + * @param H the number of height dimensions + * @param D the number of depth dimensions + * + * @return [M] uint32/uint64 tensor containing the values of the keys + */ +torch::Tensor cumesh::hashmap_lookup_3d_cuda( + const torch::Tensor& hashmap_keys, + const torch::Tensor& hashmap_values, + const torch::Tensor& coords, + int W, + int H, + int D +) { + // Allocate output tensor + auto output = torch::empty({coords.size(0)}, torch::dtype(hashmap_values.dtype()).device(hashmap_values.device())); + + // Dispatch to 32-bit or 64-bit kernel + if (hashmap_keys.dtype() == torch::kUInt32 && hashmap_values.dtype() == torch::kUInt32) { + dispatch_hashmap_lookup_3d_cuda(hashmap_keys, hashmap_values, coords, output, W, H, D); + } + else if (hashmap_keys.dtype() == torch::kUInt32 && hashmap_values.dtype() == torch::kUInt64) { + dispatch_hashmap_lookup_3d_cuda(hashmap_keys, hashmap_values, coords, output, W, H, D); + } + else if (hashmap_keys.dtype() == torch::kUInt64 && hashmap_values.dtype() == torch::kUInt32) { + dispatch_hashmap_lookup_3d_cuda(hashmap_keys, hashmap_values, coords, output, W, H, D); + } + else if (hashmap_keys.dtype() == torch::kUInt64 && hashmap_values.dtype() == torch::kUInt64) { + dispatch_hashmap_lookup_3d_cuda(hashmap_keys, hashmap_values, coords, output, W, H, D); + } + else { + TORCH_CHECK(false, "Unsupported data type"); + } + + return output; +} + + +template +static __global__ void hashmap_insert_3d_idx_as_val_cuda_kernel( + const size_t N, + const size_t M, + const int W, + const int H, + const int D, + K* __restrict__ hashmap, + V* __restrict__ values, + const int32_t* __restrict__ coords +) { + const size_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_id < M) { + int4 coord = reinterpret_cast(coords)[thread_id]; + int b = coord.x; + int x = coord.y; + int y = coord.z; + int z = coord.w; + size_t flat_idx = (size_t)b * W * H * D + (size_t)x * H * D + (size_t)y * D + z; + K key = static_cast(flat_idx); + V value = static_cast(thread_id); + linear_probing_insert(hashmap, values, key, value, N); + } +} + + +template +static void dispatch_hashmap_insert_3d_idx_as_val_cuda( + torch::Tensor& hashmap_keys, + torch::Tensor& hashmap_values, + const torch::Tensor& coords, + int W, int H, int D +) { + hipLaunchKernelGGL(( hashmap_insert_3d_idx_as_val_cuda_kernel), + dim3((coords.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE), + dim3(BLOCK_SIZE) + , 0, 0, + hashmap_keys.size(0), + coords.size(0), + W, H, D, + hashmap_keys.data_ptr(), + hashmap_values.data_ptr(), + coords.data_ptr() + ); +} + + +/** + * Insert 3D coordinates into the hashmap using index as value + * + * @param hashmap_keys [N] uint32/uint64 tensor containing the hashmap keys + * @param hashmap_values [N] uint32/uint64 tensor containing the hashmap values + * @param coords [M, 4] int32 tensor containing the keys to be inserted + * @param W the number of width dimensions + * @param H the number of height dimensions + * @param D the number of depth dimensions + */ +void cumesh::hashmap_insert_3d_idx_as_val_cuda( + torch::Tensor& hashmap_keys, + torch::Tensor& hashmap_values, + const torch::Tensor& coords, + int W, + int H, + int D +) { + 
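+    // No values tensor here: the kernel stores each coordinate's row index
+    // in `coords` as its value, so a later lookup of a flattened
+    // (b, x, y, z) key recovers the row of that voxel in the input array.
+    // The dtype check below mirrors the one in hashmap_insert_3d_cuda.
+    TORCH_CHECK(coords.dtype() == torch::kInt32, "Coords must be int32");
+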
// Dispatch to 32-bit or 64-bit kernel + if (hashmap_keys.dtype() == torch::kUInt32 && hashmap_values.dtype() == torch::kUInt32) { + dispatch_hashmap_insert_3d_idx_as_val_cuda(hashmap_keys, hashmap_values, coords, W, H, D); + } + else if (hashmap_keys.dtype() == torch::kUInt32 && hashmap_values.dtype() == torch::kUInt64) { + dispatch_hashmap_insert_3d_idx_as_val_cuda(hashmap_keys, hashmap_values, coords, W, H, D); + } + else if (hashmap_keys.dtype() == torch::kUInt64 && hashmap_values.dtype() == torch::kUInt32) { + dispatch_hashmap_insert_3d_idx_as_val_cuda(hashmap_keys, hashmap_values, coords, W, H, D); + } + else if (hashmap_keys.dtype() == torch::kUInt64 && hashmap_values.dtype() == torch::kUInt64) { + dispatch_hashmap_insert_3d_idx_as_val_cuda(hashmap_keys, hashmap_values, coords, W, H, D); + } + else { + TORCH_CHECK(false, "Unsupported data type"); + } +} \ No newline at end of file diff --git a/src/io.cu b/src/io.cu index 579cf16..aa12883 100644 --- a/src/io.cu +++ b/src/io.cu @@ -85,11 +85,11 @@ static torch::Tensor buffer_to_tensor(const Buffer buffer) { static constexpr int dst_bytes = Mapping::sizeof_scalar * Mapping::channels; if (sizeof(T) == dst_bytes) { - CUDA_CHECK(cudaMemcpy( + CUDA_CHECK(hipMemcpy( tensor.data_ptr(), buffer.ptr, count * sizeof(T), - cudaMemcpyDeviceToDevice + hipMemcpyDeviceToDevice )); } else { CUDA_CHECK(cudaMemcpy2D( @@ -99,7 +99,7 @@ static torch::Tensor buffer_to_tensor(const Buffer buffer) { sizeof(T), dst_bytes, count, - cudaMemcpyDeviceToDevice + hipMemcpyDeviceToDevice )); } @@ -119,7 +119,7 @@ void CuMesh::init(const torch::Tensor& vertices, const torch::Tensor& faces) { sizeof(float) * 3, sizeof(float) * 3, num_vertices, - cudaMemcpyDeviceToDevice + hipMemcpyDeviceToDevice )); CUDA_CHECK(cudaMemcpy2D( this->faces.ptr, @@ -128,7 +128,7 @@ void CuMesh::init(const torch::Tensor& vertices, const torch::Tensor& faces) { sizeof(int) * 3, sizeof(int) * 3, num_faces, - cudaMemcpyDeviceToDevice + hipMemcpyDeviceToDevice )); } diff --git a/src/io.hip b/src/io.hip new file mode 100644 index 0000000..0043191 --- /dev/null +++ b/src/io.hip @@ -0,0 +1,251 @@ +// !!! This is a file automatically generated by hipify!!! 
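+//
+// Reading note: buffer_to_tensor below is driven by the TorchTypeMapping
+// trait table, which maps each device element type (int2, float3, ...) to a
+// torch scalar type, a scalar size, and a channel count. When sizeof(T)
+// equals the packed row size the copy is a single hipMemcpy; otherwise
+// hipMemcpy2D strides over the buffer so that only the packed payload of
+// each element lands in the tensor.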
+#include "cumesh_hip.h" + + +namespace cumesh { + + +template +struct TorchTypeMapping; + +template <> struct TorchTypeMapping { + static constexpr auto scalar_type = torch::kInt32; + static constexpr int sizeof_scalar = 4; + static constexpr int channels = 1; +}; + +template <> struct TorchTypeMapping { + static constexpr auto scalar_type = torch::kInt8; + static constexpr int sizeof_scalar = 1; + static constexpr int channels = 1; +}; + +template <> struct TorchTypeMapping { + static constexpr auto scalar_type = torch::kUInt8; + static constexpr int sizeof_scalar = 1; + static constexpr int channels = 1; +}; + +template <> struct TorchTypeMapping { + static constexpr auto scalar_type = torch::kFloat32; + static constexpr int sizeof_scalar = 4; + static constexpr int channels = 1; +}; + +template <> struct TorchTypeMapping { + static constexpr auto scalar_type = torch::kInt32; + static constexpr int sizeof_scalar = 4; + static constexpr int channels = 2; +}; + +template <> struct TorchTypeMapping { + static constexpr auto scalar_type = torch::kInt32; + static constexpr int sizeof_scalar = 4; + static constexpr int channels = 2; +}; + +template <> struct TorchTypeMapping { + static constexpr auto scalar_type = torch::kInt32; + static constexpr int sizeof_scalar = 4; + static constexpr int channels = 3; +}; + +template <> struct TorchTypeMapping { + static constexpr auto scalar_type = torch::kFloat32; + static constexpr int sizeof_scalar = 4; + static constexpr int channels = 2; +}; + +template <> struct TorchTypeMapping { + static constexpr auto scalar_type = torch::kFloat32; + static constexpr int sizeof_scalar = 4; + static constexpr int channels = 3; +}; + +template <> struct TorchTypeMapping { + static constexpr auto scalar_type = torch::kFloat32; + static constexpr int sizeof_scalar = 4; + static constexpr int channels = 4; +}; + + +template +static torch::Tensor buffer_to_tensor(const Buffer buffer) { + using Mapping = TorchTypeMapping; + + int64_t count = static_cast(buffer.size); + std::vector shape; + if (Mapping::channels == 1) { + shape = {count}; // 1D Tensor + } else { + shape = {count, Mapping::channels}; // 2D Tensor [N, C] + } + + auto options = torch::dtype(Mapping::scalar_type).device(torch::kCUDA); + auto tensor = torch::empty(shape, options); + + static constexpr int dst_bytes = Mapping::sizeof_scalar * Mapping::channels; + if (sizeof(T) == dst_bytes) { + CUDA_CHECK(hipMemcpy( + tensor.data_ptr(), + buffer.ptr, + count * sizeof(T), + hipMemcpyDeviceToDevice + )); + } else { + CUDA_CHECK(hipMemcpy2D( + tensor.data_ptr(), + dst_bytes, + buffer.ptr, + sizeof(T), + dst_bytes, + count, + hipMemcpyDeviceToDevice + )); + } + + return tensor; +} + + +void CuMesh::init(const torch::Tensor& vertices, const torch::Tensor& faces) { + size_t num_vertices = vertices.size(0); + size_t num_faces = faces.size(0); + this->vertices.resize(num_vertices); + this->faces.resize(num_faces); + CUDA_CHECK(hipMemcpy2D( + this->vertices.ptr, + sizeof(float3), + vertices.data_ptr(), + sizeof(float) * 3, + sizeof(float) * 3, + num_vertices, + hipMemcpyDeviceToDevice + )); + CUDA_CHECK(hipMemcpy2D( + this->faces.ptr, + sizeof(int3), + faces.data_ptr(), + sizeof(int) * 3, + sizeof(int) * 3, + num_faces, + hipMemcpyDeviceToDevice + )); +} + + +std::tuple CuMesh::read() { + auto vertices = buffer_to_tensor(this->vertices); + auto faces = buffer_to_tensor(this->faces); + return std::make_tuple(vertices, faces); +} + + +torch::Tensor CuMesh::read_face_normals() { + auto face_normals = 
buffer_to_tensor(this->face_normals); + return face_normals; +} + + +torch::Tensor CuMesh::read_vertex_normals() { + auto vertex_normals = buffer_to_tensor(this->vertex_normals); + return vertex_normals; +} + + +torch::Tensor CuMesh::read_edges() { + auto edges = buffer_to_tensor(this->edges); + return edges; +} + + +torch::Tensor CuMesh::read_boundaries() { + auto boundaries = buffer_to_tensor(this->boundaries); + return boundaries; +} + + +torch::Tensor CuMesh::read_manifold_face_adjacency() { + auto manifold_face_adj = buffer_to_tensor(this->manifold_face_adj); + return manifold_face_adj; +} + + +torch::Tensor CuMesh::read_manifold_boundary_adjacency() { + auto manifold_bound_adj = buffer_to_tensor(this->manifold_bound_adj); + return manifold_bound_adj; +} + + +std::tuple CuMesh::read_connected_components() { + auto conn_comp_ids_tensor = buffer_to_tensor(this->conn_comp_ids); + return std::make_tuple(this->num_conn_comps, conn_comp_ids_tensor); +} + + +std::tuple CuMesh::read_boundary_connected_components() { + auto bound_conn_comp_ids_tensor = buffer_to_tensor(this->bound_conn_comp_ids); + return std::make_tuple(this->num_bound_conn_comps, bound_conn_comp_ids_tensor); +} + + +std::tuple CuMesh::read_boundary_loops() { + auto loop_boundaries_tensor = buffer_to_tensor(this->loop_boundaries); + auto loop_boundaries_offset_tensor = buffer_to_tensor(this->loop_boundaries_offset); + return std::make_tuple(this->num_bound_loops, loop_boundaries_tensor, loop_boundaries_offset_tensor); +} + + +std::unordered_map CuMesh::read_all_cache() { + std::unordered_map cache; + cache["face_areas"] = buffer_to_tensor(this->face_areas); + cache["face_normals"] = buffer_to_tensor(this->face_normals); + cache["vertex_normals"] = buffer_to_tensor(this->vertex_normals); + cache["edges"] = buffer_to_tensor(this->edges); + cache["boundaries"] = buffer_to_tensor(this->boundaries); + cache["vert_is_boundary"] = buffer_to_tensor(this->vert_is_boundary); + cache["vert_is_manifold"] = buffer_to_tensor(this->vert_is_manifold); + cache["vert2edge"] = buffer_to_tensor(this->vert2edge); + cache["vert2edge_cnt"] = buffer_to_tensor(this->vert2edge_cnt); + cache["vert2edge_offset"] = buffer_to_tensor(this->vert2edge_offset); + cache["vert2bound"] = buffer_to_tensor(this->vert2bound); + cache["vert2bound_cnt"] = buffer_to_tensor(this->vert2bound_cnt); + cache["vert2bound_offset"] = buffer_to_tensor(this->vert2bound_offset); + cache["edge2face"] = buffer_to_tensor(this->edge2face); + cache["edge2face_cnt"] = buffer_to_tensor(this->edge2face_cnt); + cache["edge2face_offset"] = buffer_to_tensor(this->edge2face_offset); + cache["face2edge"] = buffer_to_tensor(this->face2edge); + cache["vert2face"] = buffer_to_tensor(this->vert2face); + cache["vert2face_cnt"] = buffer_to_tensor(this->vert2face_cnt); + cache["vert2face_offset"] = buffer_to_tensor(this->vert2face_offset); + cache["manifold_face_adj"] = buffer_to_tensor(this->manifold_face_adj); + cache["manifold_bound_adj"] = buffer_to_tensor(this->manifold_bound_adj); + cache["conn_comp_ids"] = buffer_to_tensor(this->conn_comp_ids); + cache["bound_conn_comp_ids"] = buffer_to_tensor(this->bound_conn_comp_ids); + cache["loop_boundaries"] = buffer_to_tensor(this->loop_boundaries); + cache["loop_boundaries_offset"] = buffer_to_tensor(this->loop_boundaries_offset); + cache["vertices_map"] = buffer_to_tensor(this->vertices_map); + cache["faces_map"] = buffer_to_tensor(this->faces_map); + cache["edge_collapse_costs"] = buffer_to_tensor(this->edge_collapse_costs); + 
cache["propagated_costs"] = buffer_to_tensor(this->propagated_costs); + cache["atlas_chart_ids"] = buffer_to_tensor(this->atlas_chart_ids); + cache["atlas_chart_vertex_map"] = buffer_to_tensor(this->atlas_chart_vertex_map); + cache["atlas_chart_faces"] = buffer_to_tensor(this->atlas_chart_faces); + cache["atlas_chart_faces_offset"] = buffer_to_tensor(this->atlas_chart_faces_offset); + cache["atlas_chart_vertex_offset"] = buffer_to_tensor(this->atlas_chart_vertex_offset); + cache["atlas_chart_uvs"] = buffer_to_tensor(this->atlas_chart_uvs); + cache["atlas_chart_normal_cones"] = buffer_to_tensor(this->atlas_chart_normal_cones); + cache["atlas_chart_adj"] = buffer_to_tensor(this->atlas_chart_adj); + cache["atlas_chart_adj_length"] = buffer_to_tensor(this->atlas_chart_adj_length); + cache["atlas_chart_perims"] = buffer_to_tensor(this->atlas_chart_perims); + cache["atlas_chart_areas"] = buffer_to_tensor(this->atlas_chart_areas); + cache["atlas_chart2edge"] = buffer_to_tensor(this->atlas_chart2edge); + cache["atlas_chart2edge_cnt"] = buffer_to_tensor(this->atlas_chart2edge_cnt); + cache["atlas_chart2edge_offset"] = buffer_to_tensor(this->atlas_chart2edge_offset); + cache["temp_storage"] = buffer_to_tensor(this->temp_storage); + cache["cub_temp_storage"] = buffer_to_tensor(this->cub_temp_storage); + return cache; +} + + +} // namespace cumesh diff --git a/src/remesh/simple_dual_contour.cu b/src/remesh/simple_dual_contour.cu index e064ee3..81c6782 100644 --- a/src/remesh/simple_dual_contour.cu +++ b/src/remesh/simple_dual_contour.cu @@ -1,6 +1,10 @@ #include +#ifdef __HIP_PLATFORM_AMD__ +#include +#else #include #include +#endif #include #include "api.h" @@ -218,6 +222,6 @@ std::tuple cumesh::simple_dual_contour( TORCH_CHECK(false, "Unsupported hashmap data type"); } - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); return std::make_tuple(vertices, intersected); } diff --git a/src/remesh/simple_dual_contour.hip b/src/remesh/simple_dual_contour.hip new file mode 100644 index 0000000..6bbe049 --- /dev/null +++ b/src/remesh/simple_dual_contour.hip @@ -0,0 +1,228 @@ +// !!! This is a file automatically generated by hipify!!! 
+#include <torch/extension.h>
+#ifdef __HIP_PLATFORM_AMD__
+#include <hip/hip_runtime.h>
+#else
+#include <cuda.h>
+#include <cuda_runtime.h>
+#endif
+#include <tuple>
+
+#include "api.h"
+#include "../utils_hip.h"
+#include "../hash/hash.cuh"
+
+
+template <typename T>
+__device__ __forceinline__ float get_vertex_val(
+    const T* __restrict__ hashmap_keys,
+    const uint32_t* __restrict__ hashmap_vals,
+    const float* __restrict__ udf,
+    const size_t N_vert,
+    int x, int y, int z,
+    int W, int H, int D
+) {
+    size_t flat_idx = (size_t)x * H * D + (size_t)y * D + z;
+    T key = static_cast<T>(flat_idx);
+    uint32_t idx = linear_probing_lookup(hashmap_keys, hashmap_vals, key, N_vert);
+    return udf[idx];
+}
+
+
+template <typename T>
+static __global__ void simple_dual_contour_kernel(
+    const size_t N_vert,
+    const size_t M,
+    const int W,
+    const int H,
+    const int D,
+    const T* __restrict__ hashmap_keys,
+    const uint32_t* __restrict__ hashmap_vals,
+    const int32_t* __restrict__ coords,
+    const float* __restrict__ udf,
+    float* __restrict__ out_vertices,
+    int32_t* __restrict__ out_intersected
+) {
+    size_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    if (thread_id >= M) return;
+
+    int vx = coords[thread_id * 3 + 0];
+    int vy = coords[thread_id * 3 + 1];
+    int vz = coords[thread_id * 3 + 2];
+
+    float3 intersection_sum = make_float3(0.0f, 0.0f, 0.0f);
+    int intersection_count = 0;
+
+    // Traverse the 12 edges of the voxel
+    // Axis X
+    #pragma unroll
+    for (int u = 0; u <= 1; ++u) {
+        #pragma unroll
+        for (int v = 0; v <= 1; ++v) {
+            float val1 = get_vertex_val(hashmap_keys, hashmap_vals, udf, N_vert, vx, vy + u, vz + v, W, H, D);
+            float val2 = get_vertex_val(hashmap_keys, hashmap_vals, udf, N_vert, vx + 1, vy + u, vz + v, W, H, D);
+
+            // Calculate the intersection point
+            if ((val1 < 0 && val2 >= 0) || (val1 >= 0 && val2 < 0)) {
+                float t = -val1 / (val2 - val1);
+                // P = P1 + t * (P2 - P1)
+                intersection_sum.x += (float)vx + t;
+                intersection_sum.y += (float)(vy + u);
+                intersection_sum.z += (float)(vz + v);
+                intersection_count++;
+            }
+
+            if (u == 1 && v == 1) {
+                if (val1 < 0 && val2 >= 0) {
+                    out_intersected[thread_id * 3 + 0] = 1;
+                }
+                else if (val1 >= 0 && val2 < 0) {
+                    out_intersected[thread_id * 3 + 0] = -1;
+                }
+                else {
+                    out_intersected[thread_id * 3 + 0] = 0;
+                }
+            }
+        }
+    }
+
+    // Axis Y
+    #pragma unroll
+    for (int u = 0; u <= 1; ++u) {
+        #pragma unroll
+        for (int v = 0; v <= 1; ++v) {
+            float val1 = get_vertex_val(hashmap_keys, hashmap_vals, udf, N_vert, vx + u, vy, vz + v, W, H, D);
+            float val2 = get_vertex_val(hashmap_keys, hashmap_vals, udf, N_vert, vx + u, vy + 1, vz + v, W, H, D);
+
+            if ((val1 < 0 && val2 >= 0) || (val1 >= 0 && val2 < 0)) {
+                float t = -val1 / (val2 - val1);
+                intersection_sum.x += (float)(vx + u);
+                intersection_sum.y += (float)vy + t;
+                intersection_sum.z += (float)(vz + v);
+                intersection_count++;
+            }
+
+            if (u == 1 && v == 1) {
+                if (val1 < 0 && val2 >= 0) {
+                    out_intersected[thread_id * 3 + 1] = 1;
+                }
+                else if (val1 >= 0 && val2 < 0) {
+                    out_intersected[thread_id * 3 + 1] = -1;
+                }
+                else {
+                    out_intersected[thread_id * 3 + 1] = 0;
+                }
+            }
+        }
+    }
+
+    // Axis Z
+    #pragma unroll
+    for (int u = 0; u <= 1; ++u) {
+        #pragma unroll
+        for (int v = 0; v <= 1; ++v) {
+            float val1 = get_vertex_val(hashmap_keys, hashmap_vals, udf, N_vert, vx + u, vy + v, vz, W, H, D);
+            float val2 = get_vertex_val(hashmap_keys, hashmap_vals, udf, N_vert, vx + u, vy + v, vz + 1, W, H, D);
+
+            if ((val1 < 0 && val2 >= 0) || (val1 >= 0 && val2 < 0)) {
+                float t = -val1 / (val2 - val1);
+                intersection_sum.x += (float)(vx + u);
+                intersection_sum.y += (float)(vy + v);
+
intersection_sum.z += (float)vz + t; + intersection_count++; + } + + if (u == 1 && v == 1) { + if (val1 < 0 && val2 >= 0) { + out_intersected[thread_id * 3 + 2] = 1; + } + else if (val1 >= 0 && val2 < 0) { + out_intersected[thread_id * 3 + 2] = -1; + } + else { + out_intersected[thread_id * 3 + 2] = 0; + } + } + } + } + + // Calculate the mean intersection point + if (intersection_count > 0) { + out_vertices[thread_id * 3 + 0] = intersection_sum.x / intersection_count; + out_vertices[thread_id * 3 + 1] = intersection_sum.y / intersection_count; + out_vertices[thread_id * 3 + 2] = intersection_sum.z / intersection_count; + } else { + // Fallback: Voxel Center + out_vertices[thread_id * 3 + 0] = (float)vx + 0.5f; + out_vertices[thread_id * 3 + 1] = (float)vy + 0.5f; + out_vertices[thread_id * 3 + 2] = (float)vz + 0.5f; + } +} + + +/** + * Isosurfacing a volume defined on vertices of a sparse voxel grid using a simple dual contouring algorithm. + * Dual vertices are computed by mean of edge intersections. + * + * @param hashmap_keys [Nvert] uint32/uint64 hashmap of the vertices keys + * @param hashmap_vals [Nvert] uint32 tensor containing the hashmap values as vertex indices + * @param coords [Mvox, 3] int32 tensor containing the coordinates of the active voxels + * @param udf [Mvert] float tensor containing the UDF/SDF values at the vertices + * @param W the number of width dimensions + * @param H the number of height dimensions + * @param D the number of depth dimensions + * + * @return [L, 3] float tensor containing the active vertices (Dual Vertices) + [L, 3] int32 tensor containing the intersected edges (1: intersected, 0: not intersected) + */ +std::tuple cumesh::simple_dual_contour( + const torch::Tensor& hashmap_keys, + const torch::Tensor& hashmap_vals, + const torch::Tensor& coords, + const torch::Tensor& udf, + int W, + int H, + int D +) { + const size_t M = coords.size(0); + const size_t N_vert = hashmap_keys.size(0); + + auto vertices = torch::empty({(long)M, 3}, torch::dtype(torch::kFloat32).device(coords.device())); + auto intersected = torch::empty({(long)M, 3}, torch::dtype(torch::kInt32).device(coords.device())); + + dim3 threads(BLOCK_SIZE); + dim3 blocks((M + BLOCK_SIZE - 1) / BLOCK_SIZE); + + if (hashmap_keys.dtype() == torch::kUInt32) { + hipLaunchKernelGGL(( simple_dual_contour_kernel), dim3(blocks), dim3(threads), 0, 0, + N_vert, + M, + W, H, D, + hashmap_keys.data_ptr(), + hashmap_vals.data_ptr(), + coords.data_ptr(), + udf.data_ptr(), + vertices.data_ptr(), + intersected.data_ptr() + ); + } + else if (hashmap_keys.dtype() == torch::kUInt64) { + hipLaunchKernelGGL(( simple_dual_contour_kernel), dim3(blocks), dim3(threads), 0, 0, + N_vert, + M, + W, H, D, + hashmap_keys.data_ptr(), + hashmap_vals.data_ptr(), + coords.data_ptr(), + udf.data_ptr(), + vertices.data_ptr(), + intersected.data_ptr() + ); + } + else { + TORCH_CHECK(false, "Unsupported hashmap data type"); + } + + CUDA_CHECK(hipGetLastError()); + return std::make_tuple(vertices, intersected); +} diff --git a/src/remesh/svox2vert.cu b/src/remesh/svox2vert.cu index 6f1d517..63f1ee3 100644 --- a/src/remesh/svox2vert.cu +++ b/src/remesh/svox2vert.cu @@ -1,7 +1,15 @@ #include +#ifdef __HIP_PLATFORM_AMD__ +#include +#else #include #include +#endif +#ifdef __HIP_PLATFORM_AMD__ +#include +#else #include +#endif #include "api.h" #include "../utils.h" @@ -148,7 +156,7 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices( // Get the number of active vertices for each voxel size_t N = hashmap_keys.size(0); int* 
num_vertices; - CUDA_CHECK(cudaMalloc(&num_vertices, (M + 1) * sizeof(int))); + CUDA_CHECK(hipMalloc(&num_vertices, (M + 1) * sizeof(int))); if (hashmap_keys.dtype() == torch::kUInt32) { get_vertex_num<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( N, @@ -176,17 +184,17 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices( } else { TORCH_CHECK(false, "Unsupported data type"); } - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Compute the offset size_t temp_storage_bytes = 0; - cub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices, M + 1); + hipcub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices, M + 1); void* d_temp_storage = nullptr; - CUDA_CHECK(cudaMalloc(&d_temp_storage, temp_storage_bytes)); - cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices, M + 1); - CUDA_CHECK(cudaFree(d_temp_storage)); + CUDA_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); + hipcub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices, M + 1); + CUDA_CHECK(hipFree(d_temp_storage)); int total_vertices; - CUDA_CHECK(cudaMemcpy(&total_vertices, num_vertices + M, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipMemcpy(&total_vertices, num_vertices + M, sizeof(int), hipMemcpyDeviceToHost)); // Set the active vertices for each voxel auto vertices = torch::empty({total_vertices, 3}, torch::dtype(torch::kInt32).device(hashmap_keys.device())); @@ -218,10 +226,10 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices( vertices.data_ptr() ); } - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // Free the temporary memory - CUDA_CHECK(cudaFree(num_vertices)); + CUDA_CHECK(hipFree(num_vertices)); return vertices; } diff --git a/src/remesh/svox2vert.hip b/src/remesh/svox2vert.hip new file mode 100644 index 0000000..dde7050 --- /dev/null +++ b/src/remesh/svox2vert.hip @@ -0,0 +1,236 @@ +// !!! This is a file automatically generated by hipify!!! 
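+//
+// Two-pass allocation pattern: get_vertex_num counts, per active voxel, its
+// own origin corner plus every neighbouring corner whose owning voxel is
+// absent from the hashmap (or out of range); hipcub's ExclusiveSum over the
+// counts then gives each voxel a contiguous write offset, and set_vertex
+// repeats the same walk to emit the coordinates. A corner shared by several
+// active voxels can be emitted more than once here, so callers presumably
+// deduplicate downstream.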
+#include <torch/extension.h>
+#ifdef __HIP_PLATFORM_AMD__
+#include <hip/hip_runtime.h>
+#else
+#include <cuda.h>
+#include <cuda_runtime.h>
+#endif
+#ifdef __HIP_PLATFORM_AMD__
+#include <hipcub/hipcub.hpp>
+#else
+#include <cub/cub.cuh>
+#endif
+
+#include "api.h"
+#include "../utils_hip.h"
+#include "../hash/api.h"
+#include "../hash/hash.cuh"
+
+
+template <typename T>
+static __global__ void get_vertex_num(
+    const size_t N,
+    const size_t M,
+    const int W,
+    const int H,
+    const int D,
+    const T* __restrict__ hashmap_keys,
+    const uint32_t* __restrict__ hashmap_vals,
+    const int32_t* __restrict__ coords,
+    int* __restrict__ num_vertices
+) {
+    size_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    if (thread_id >= M) return;
+
+    int num = 1; // include the current voxel
+
+    int x = coords[3 * thread_id + 0];
+    int y = coords[3 * thread_id + 1];
+    int z = coords[3 * thread_id + 2];
+
+    size_t flat_idx;
+    T key;
+
+    #pragma unroll
+    for (int i = 0; i <= 1; i++) {
+        #pragma unroll
+        for (int j = 0; j <= 1; j++) {
+            #pragma unroll
+            for (int k = 0; k <= 1; k++) {
+                if (i == 0 && j == 0 && k == 0) continue;
+                int xx = x + i;
+                int yy = y + j;
+                int zz = z + k;
+                if (xx >= W || yy >= H || zz >= D) {
+                    num++;
+                    continue;
+                }
+                flat_idx = (size_t)(xx * H + yy) * D + zz;
+                key = static_cast<T>(flat_idx);
+                if (linear_probing_lookup(hashmap_keys, hashmap_vals, key, N) == std::numeric_limits<uint32_t>::max()) {
+                    num++;
+                }
+            }
+        }
+    }
+
+    num_vertices[thread_id] = num;
+}
+
+
+template <typename T>
+static __global__ void set_vertex(
+    const size_t N,
+    const size_t M,
+    const int W,
+    const int H,
+    const int D,
+    const T* __restrict__ hashmap_keys,
+    const uint32_t* __restrict__ hashmap_vals,
+    const int32_t* __restrict__ coords,
+    const int* __restrict__ vertices_offset,
+    int* __restrict__ vertices
+) {
+    size_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    if (thread_id >= M) return;
+
+    int x = coords[3 * thread_id + 0];
+    int y = coords[3 * thread_id + 1];
+    int z = coords[3 * thread_id + 2];
+    int ptr_start = vertices_offset[thread_id];
+    vertices[3 * ptr_start + 0] = x;
+    vertices[3 * ptr_start + 1] = y;
+    vertices[3 * ptr_start + 2] = z;
+    ptr_start++;
+
+    size_t flat_idx;
+    T key;
+
+    #pragma unroll
+    for (int i = 0; i <= 1; i++) {
+        #pragma unroll
+        for (int j = 0; j <= 1; j++) {
+            #pragma unroll
+            for (int k = 0; k <= 1; k++) {
+                if (i == 0 && j == 0 && k == 0) continue;
+                int xx = x + i;
+                int yy = y + j;
+                int zz = z + k;
+                if (xx >= W || yy >= H || zz >= D) {
+                    vertices[3 * ptr_start + 0] = xx;
+                    vertices[3 * ptr_start + 1] = yy;
+                    vertices[3 * ptr_start + 2] = zz;
+                    ptr_start++;
+                    continue;
+                }
+                flat_idx = (size_t)(xx * H + yy) * D + zz;
+                key = static_cast<T>(flat_idx);
+                if (linear_probing_lookup(hashmap_keys, hashmap_vals, key, N) == std::numeric_limits<uint32_t>::max()) {
+                    vertices[3 * ptr_start + 0] = xx;
+                    vertices[3 * ptr_start + 1] = yy;
+                    vertices[3 * ptr_start + 2] = zz;
+                    ptr_start++;
+                }
+            }
+        }
+    }
+}
+
+
+/**
+ * Get the active vertices of a sparse voxel grid
+ *
+ * @param hashmap_keys [N] uint32/uint64 tensor containing the hashmap keys
+ * @param hashmap_vals [N] uint32 tensor containing the hashmap values as voxel indices
+ * @param coords [M, 3] int32 tensor containing the coordinates of the active voxels
+ * @param W the number of width dimensions
+ * @param H the number of height dimensions
+ * @param D the number of depth dimensions
+ *
+ * @return [L, 3] int32 tensor containing the active vertices
+ */
+torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices(
+    torch::Tensor& hashmap_keys,
+    torch::Tensor& hashmap_vals,
+    const torch::Tensor& coords,
+    const int W,
+    const int H,
+
const int D +) { + // Handle empty input - return early to avoid launching kernels with 0 blocks + size_t M = coords.size(0); + if (M == 0) { + return torch::empty({0, 3}, torch::dtype(torch::kInt32).device(hashmap_keys.device())); + } + + // Get the number of active vertices for each voxel + size_t N = hashmap_keys.size(0); + int* num_vertices; + CUDA_CHECK(hipMalloc(&num_vertices, (M + 1) * sizeof(int))); + if (hashmap_keys.dtype() == torch::kUInt32) { + hipLaunchKernelGGL(( get_vertex_num), dim3((M + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + N, + M, + W, + H, + D, + hashmap_keys.data_ptr(), + hashmap_vals.data_ptr(), + coords.data_ptr(), + num_vertices + ); + } else if (hashmap_keys.dtype() == torch::kUInt64) { + hipLaunchKernelGGL(( get_vertex_num), dim3((M + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + N, + M, + W, + H, + D, + hashmap_keys.data_ptr(), + hashmap_vals.data_ptr(), + coords.data_ptr(), + num_vertices + ); + } else { + TORCH_CHECK(false, "Unsupported data type"); + } + CUDA_CHECK(hipGetLastError()); + + // Compute the offset + size_t temp_storage_bytes = 0; + hipcub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices, M + 1); + void* d_temp_storage = nullptr; + CUDA_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); + hipcub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices, M + 1); + CUDA_CHECK(hipFree(d_temp_storage)); + int total_vertices; + CUDA_CHECK(hipMemcpy(&total_vertices, num_vertices + M, sizeof(int), hipMemcpyDeviceToHost)); + + // Set the active vertices for each voxel + auto vertices = torch::empty({total_vertices, 3}, torch::dtype(torch::kInt32).device(hashmap_keys.device())); + if (hashmap_keys.dtype() == torch::kUInt32) { + hipLaunchKernelGGL(( set_vertex), dim3((M + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + N, + M, + W, + H, + D, + hashmap_keys.data_ptr(), + hashmap_vals.data_ptr(), + coords.data_ptr(), + num_vertices, + vertices.data_ptr() + ); + } + else if (hashmap_keys.dtype() == torch::kUInt64) { + hipLaunchKernelGGL(( set_vertex), dim3((M + BLOCK_SIZE - 1) / BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + N, + M, + W, + H, + D, + hashmap_keys.data_ptr(), + hashmap_vals.data_ptr(), + coords.data_ptr(), + num_vertices, + vertices.data_ptr() + ); + } + CUDA_CHECK(hipGetLastError()); + + // Free the temporary memory + CUDA_CHECK(hipFree(num_vertices)); + + return vertices; +} diff --git a/src/shared.h b/src/shared.h index 66ecac7..e6bf04c 100644 --- a/src/shared.h +++ b/src/shared.h @@ -1,8 +1,16 @@ #pragma once +#ifdef __HIP_PLATFORM_AMD__ +#include +#else #include #include +#endif +#ifdef __HIP_PLATFORM_AMD__ +#include +#else #include +#endif #include "utils.h" #include "cumesh.h" @@ -160,66 +168,66 @@ template int compress_ids(T* ids, size_t N, Buffer& cub_temp_storage, T* inverse=nullptr) { int *cu_indices, *cu_indices_argsorted; T *cu_ids_sorted; - CUDA_CHECK(cudaMalloc(&cu_indices, N * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_indices_argsorted, N * sizeof(int))); - CUDA_CHECK(cudaMalloc(&cu_ids_sorted, N * sizeof(T))); + CUDA_CHECK(hipMalloc(&cu_indices, N * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_indices_argsorted, N * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_ids_sorted, N * sizeof(T))); arange_kernel<<<(N+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_indices, N); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); size_t temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( nullptr, 
temp_storage_bytes, ids, cu_ids_sorted, cu_indices, cu_indices_argsorted, N )); cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceRadixSort::SortPairs( + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( cub_temp_storage.ptr, temp_storage_bytes, ids, cu_ids_sorted, cu_indices, cu_indices_argsorted, N )); - CUDA_CHECK(cudaFree(cu_indices)); + CUDA_CHECK(hipFree(cu_indices)); // get diff T* cu_new_ids; - CUDA_CHECK(cudaMalloc(&cu_new_ids, N * sizeof(T))); + CUDA_CHECK(hipMalloc(&cu_new_ids, N * sizeof(T))); get_diff_kernel<<<(N+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( cu_ids_sorted, cu_new_ids, N ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // inverse if (inverse) { int* cu_num; - CUDA_CHECK(cudaMalloc(&cu_num, sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_num, sizeof(int))); temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceSelect::Flagged( + CUDA_CHECK(hipcub::DeviceSelect::Flagged( nullptr, temp_storage_bytes, cu_ids_sorted, cu_new_ids, inverse, cu_num, N )); cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceSelect::Flagged( + CUDA_CHECK(hipcub::DeviceSelect::Flagged( cub_temp_storage.ptr, temp_storage_bytes, cu_ids_sorted, cu_new_ids, inverse, cu_num, N )); - CUDA_CHECK(cudaFree(cu_num)); + CUDA_CHECK(hipFree(cu_num)); } - CUDA_CHECK(cudaFree(cu_ids_sorted)); + CUDA_CHECK(hipFree(cu_ids_sorted)); // scan diff temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, cu_new_ids, N )); cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( cub_temp_storage.ptr, temp_storage_bytes, cu_new_ids, N @@ -232,12 +240,12 @@ int compress_ids(T* ids, size_t N, Buffer& cub_temp_storage, T* inverse=nu N, ids ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); T num_components; - CUDA_CHECK(cudaMemcpy(&num_components, cu_new_ids + N-1, sizeof(T), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipMemcpy(&num_components, cu_new_ids + N-1, sizeof(T), hipMemcpyDeviceToHost)); num_components += 1; - CUDA_CHECK(cudaFree(cu_new_ids)); - CUDA_CHECK(cudaFree(cu_indices_argsorted)); + CUDA_CHECK(hipFree(cu_new_ids)); + CUDA_CHECK(hipFree(cu_indices_argsorted)); return static_cast(num_components); } @@ -248,33 +256,33 @@ int compress_ids(T* ids, size_t N, Buffer& cub_temp_storage, T* inverse=nu template void print_max_val(T* ptr, size_t size) { T* dbg_cu_max_val; - CUDA_CHECK(cudaMalloc(&dbg_cu_max_val, sizeof(T))); + CUDA_CHECK(hipMalloc(&dbg_cu_max_val, sizeof(T))); size_t temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceReduce::Max( + CUDA_CHECK(hipcub::DeviceReduce::Max( nullptr, temp_storage_bytes, ptr, dbg_cu_max_val, size )); char* temp_storage; - CUDA_CHECK(cudaMalloc(&temp_storage, temp_storage_bytes)); - CUDA_CHECK(cub::DeviceReduce::Max( + CUDA_CHECK(hipMalloc(&temp_storage, temp_storage_bytes)); + CUDA_CHECK(hipcub::DeviceReduce::Max( temp_storage, temp_storage_bytes, ptr, dbg_cu_max_val, size )); T h_max_val; - CUDA_CHECK(cudaMemcpy(&h_max_val, dbg_cu_max_val, sizeof(T), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipMemcpy(&h_max_val, dbg_cu_max_val, sizeof(T), hipMemcpyDeviceToHost)); std::cout << "Max value: " << h_max_val << std::endl; - CUDA_CHECK(cudaFree(dbg_cu_max_val)); - CUDA_CHECK(cudaFree(temp_storage)); + CUDA_CHECK(hipFree(dbg_cu_max_val)); + CUDA_CHECK(hipFree(temp_storage)); } template void print_val(T* ptr, size_t size) { T h_ptr[size]; - 
CUDA_CHECK(cudaMemcpy(h_ptr, ptr, size * sizeof(T), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipMemcpy(h_ptr, ptr, size * sizeof(T), hipMemcpyDeviceToHost)); for (size_t i = 0; i < size; i++) { std::cout << h_ptr[i] << " "; } diff --git a/src/shared.hip b/src/shared.hip new file mode 100644 index 0000000..9d74d4c --- /dev/null +++ b/src/shared.hip @@ -0,0 +1,69 @@ +// !!! This is a file automatically generated by hipify!!! +#include "hip/hip_runtime.h" +#include "shared_hip.h" + + +namespace cumesh { + + +/** + * Hook edges + * @param adj: the buffer for adjacency, shape (M) + * @param M: the number of adjacency + * @param conn_comp_ids: the buffer for connected component ids, shape (F) + * @param end_flag: flag to indicate if any union operation happened + */ +__global__ void hook_edges_kernel( + const int2* adj, + const int M, + int* conn_comp_ids, + int* end_flag +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= M) return; + + // get adjacent faces + int f0 = adj[tid].x; + int f1 = adj[tid].y; + + // union + // find roots + int root0 = conn_comp_ids[f0]; + while (root0 != conn_comp_ids[root0]) { + root0 = conn_comp_ids[root0]; + } + int root1 = conn_comp_ids[f1]; + while (root1 != conn_comp_ids[root1]) { + root1 = conn_comp_ids[root1]; + } + + if (root0 == root1) return; + + int high = max(root0, root1); + int low = min(root0, root1); + atomicMin(&conn_comp_ids[high], low); + *end_flag = 0; +} + + +/** + * Compress connected components + * @param conn_comp_ids: the buffer for connected component ids, shape (F) + * @param F: the number of faces + */ +__global__ void compress_components_kernel( + int* conn_comp_ids, + const int F +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= F) return; + + int p = conn_comp_ids[tid]; + while (p != conn_comp_ids[p]) { + p = conn_comp_ids[p]; + } + conn_comp_ids[tid] = p; +} + + +} // namespace cumesh \ No newline at end of file diff --git a/src/shared_hip.h b/src/shared_hip.h new file mode 100644 index 0000000..a82e1b4 --- /dev/null +++ b/src/shared_hip.h @@ -0,0 +1,294 @@ +// !!! This is a file automatically generated by hipify!!! 
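+//
+// Header-only device utilities shared by the .hip translation units: small
+// elementwise kernels (arange / cast / fill / scatter / index / diff / flag
+// / compare), declarations of the union-find kernels defined in shared.hip,
+// and compress_ids, which renumbers an arbitrary id array into a dense
+// 0..K-1 range via sort, adjacent-diff, exclusive scan, and scatter.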
+#pragma once
+
+#ifdef __HIP_PLATFORM_AMD__
+#include <hip/hip_runtime.h>
+#else
+#include <cuda.h>
+#include <cuda_runtime.h>
+#endif
+#ifdef __HIP_PLATFORM_AMD__
+#include <hipcub/hipcub.hpp>
+#else
+#include <cub/cub.cuh>
+#endif
+#include "utils_hip.h"
+#include "cumesh_hip.h"
+
+
+namespace cumesh {
+
+
+template <typename T>
+__global__ void arange_kernel(T* array, int N, int stride=1) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= N) return;
+    array[tid] = static_cast<T>(tid * stride);
+}
+
+
+template <typename T1, typename T2>
+__global__ void cast_kernel(T1* input, T2* output, int N) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= N) return;
+    output[tid] = static_cast<T2>(input[tid]);
+}
+
+
+template <typename T>
+__global__ void fill_kernel(T* array, int N, T value) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= N) return;
+    array[tid] = value;
+}
+
+
+template <typename T>
+__global__ void scatter_kernel(
+    const int* indices,
+    const T* values,
+    const size_t N,
+    T* output
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= N) return;
+    output[indices[tid]] = values[tid];
+}
+
+
+template <typename T>
+__global__ void index_kernel(
+    const T* values,
+    const int* indices,
+    const size_t N,
+    T* output
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= N) return;
+    output[tid] = values[indices[tid]];
+}
+
+
+template <typename T>
+__global__ void diff_kernel(
+    const T* values,
+    const size_t N,
+    T* output
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= N) return;
+    output[tid] = values[tid+1] - values[tid];
+}
+
+
+template <typename T>
+__global__ void set_flag_kernel(
+    const int* indices,
+    const size_t N,
+    T* output
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= N) return;
+    output[indices[tid]] = static_cast<T>(1);
+}
+
+
+template <typename CompT, typename FlagT, typename Comparator>
+__global__ void compare_kernel(
+    const CompT* values,
+    const CompT threshold,
+    const size_t N,
+    Comparator op,
+    FlagT* flag
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= N) return;
+    flag[tid] = op(values[tid], threshold) ?
static_cast(1) : static_cast(0); +} + + +template +__global__ void inplace_div_kernel( + Ta* a, + const Tb* b, + const size_t N +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= N) return; + a[tid] = a[tid] / static_cast(b[tid]); +} + + +/** + * Hook edges + * @param adj: the buffer for adjacency, shape (M) + * @param M: the number of adjacency + * @param conn_comp_ids: the buffer for connected component ids, shape (F) + * @param end_flag: flag to indicate if any union operation happened + */ +__global__ void hook_edges_kernel( + const int2* adj, + const int M, + int* conn_comp_ids, + int* end_flag +); + + +/** + * Compress connected components + * @param conn_comp_ids: the buffer for connected component ids, shape (F) + * @param F: the number of faces + */ +__global__ void compress_components_kernel( + int* conn_comp_ids, + const int F +); + + +template +__global__ void get_diff_kernel( + const T* ids_sorted, + T* ids_diff, + const int N +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= N) return; + if (tid == N-1) { + ids_diff[tid] = 1; + return; + } + if (ids_sorted[tid] != ids_sorted[tid+1]) { + ids_diff[tid] = 1; + } else { + ids_diff[tid] = 0; + } +} + + +template +int compress_ids(T* ids, size_t N, Buffer& cub_temp_storage, T* inverse=nullptr) { + int *cu_indices, *cu_indices_argsorted; + T *cu_ids_sorted; + CUDA_CHECK(hipMalloc(&cu_indices, N * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_indices_argsorted, N * sizeof(int))); + CUDA_CHECK(hipMalloc(&cu_ids_sorted, N * sizeof(T))); + hipLaunchKernelGGL(( arange_kernel), dim3((N+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, cu_indices, N); + CUDA_CHECK(hipGetLastError()); + size_t temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, + ids, cu_ids_sorted, + cu_indices, cu_indices_argsorted, + N + )); + cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceRadixSort::SortPairs( + cub_temp_storage.ptr, temp_storage_bytes, + ids, cu_ids_sorted, + cu_indices, cu_indices_argsorted, + N + )); + CUDA_CHECK(hipFree(cu_indices)); + + // get diff + T* cu_new_ids; + CUDA_CHECK(hipMalloc(&cu_new_ids, N * sizeof(T))); + hipLaunchKernelGGL(( get_diff_kernel), dim3((N+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_ids_sorted, + cu_new_ids, + N + ); + CUDA_CHECK(hipGetLastError()); + + // inverse + if (inverse) { + int* cu_num; + CUDA_CHECK(hipMalloc(&cu_num, sizeof(int))); + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + nullptr, temp_storage_bytes, + cu_ids_sorted, cu_new_ids, inverse, cu_num, + N + )); + cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceSelect::Flagged( + cub_temp_storage.ptr, temp_storage_bytes, + cu_ids_sorted, cu_new_ids, inverse, cu_num, + N + )); + CUDA_CHECK(hipFree(cu_num)); + } + CUDA_CHECK(hipFree(cu_ids_sorted)); + + // scan diff + temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, + cu_new_ids, + N + )); + cub_temp_storage.resize(temp_storage_bytes); + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( + cub_temp_storage.ptr, temp_storage_bytes, + cu_new_ids, + N + )); + + // scatter + hipLaunchKernelGGL(( scatter_kernel), dim3((N+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + cu_indices_argsorted, + cu_new_ids, + N, + ids + ); + CUDA_CHECK(hipGetLastError()); + T num_components; + CUDA_CHECK(hipMemcpy(&num_components, cu_new_ids + N-1, sizeof(T), hipMemcpyDeviceToHost)); + num_components += 1; + 
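+    // cu_new_ids holds the exclusive prefix sum of the "a new id starts
+    // here" flags; get_diff_kernel always flags the last slot, so the value
+    // at N-1 counts every group boundary before it, and the +1 above
+    // accounts for the final group.
+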
CUDA_CHECK(hipFree(cu_new_ids)); + CUDA_CHECK(hipFree(cu_indices_argsorted)); + + return static_cast(num_components); +} + + +// DEBUG + +template +void print_max_val(T* ptr, size_t size) { + T* dbg_cu_max_val; + CUDA_CHECK(hipMalloc(&dbg_cu_max_val, sizeof(T))); + size_t temp_storage_bytes = 0; + CUDA_CHECK(hipcub::DeviceReduce::Max( + nullptr, temp_storage_bytes, + ptr, + dbg_cu_max_val, + size + )); + char* temp_storage; + CUDA_CHECK(hipMalloc(&temp_storage, temp_storage_bytes)); + CUDA_CHECK(hipcub::DeviceReduce::Max( + temp_storage, temp_storage_bytes, + ptr, + dbg_cu_max_val, + size + )); + T h_max_val; + CUDA_CHECK(hipMemcpy(&h_max_val, dbg_cu_max_val, sizeof(T), hipMemcpyDeviceToHost)); + std::cout << "Max value: " << h_max_val << std::endl; + CUDA_CHECK(hipFree(dbg_cu_max_val)); + CUDA_CHECK(hipFree(temp_storage)); +} + +template +void print_val(T* ptr, size_t size) { + T h_ptr[size]; + CUDA_CHECK(hipMemcpy(h_ptr, ptr, size * sizeof(T), hipMemcpyDeviceToHost)); + for (size_t i = 0; i < size; i++) { + std::cout << h_ptr[i] << " "; + } + std::cout << std::endl; +} + + +} // namespace cumesh \ No newline at end of file diff --git a/src/simplify.cu b/src/simplify.cu index 9efde9e..50eb5b6 100644 --- a/src/simplify.cu +++ b/src/simplify.cu @@ -1,6 +1,10 @@ #include "cumesh.h" #include "dtypes.cuh" +#ifdef __HIP_PLATFORM_AMD__ +#include +#else #include +#endif namespace cumesh { @@ -77,7 +81,7 @@ void get_qem( V, F, reinterpret_cast(ctx.temp_storage.ptr) ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } @@ -246,7 +250,7 @@ void get_edge_collapse_cost( lambda_edge_length, lambda_skinny, ctx.edge_collapse_costs.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } @@ -311,7 +315,7 @@ void propagate_cost( V, F, E, ctx.propagated_costs.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); } @@ -466,22 +470,22 @@ void collapse_edges( ctx.vertices_map.ptr, ctx.faces_map.ptr ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); // update vertices buffer // get vertices map size_t temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, ctx.vertices_map.ptr, V+1 )); ctx.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( ctx.cub_temp_storage.ptr, temp_storage_bytes, ctx.vertices_map.ptr, V+1 )); int new_num_vertices; - CUDA_CHECK(cudaMemcpy(&new_num_vertices, ctx.vertices_map.ptr + V, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipMemcpy(&new_num_vertices, ctx.vertices_map.ptr + V, sizeof(int), hipMemcpyDeviceToHost)); // compress vertices ctx.temp_storage.resize(new_num_vertices * sizeof(float3)); compress_vertices_kernel<<<(V+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( @@ -490,22 +494,22 @@ void collapse_edges( V, reinterpret_cast(ctx.temp_storage.ptr) ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); swap_buffers(ctx.temp_storage, ctx.vertices); // update faces buffer // get faces map - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, ctx.faces_map.ptr, F+1 )); ctx.cub_temp_storage.resize(temp_storage_bytes); - CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum( ctx.cub_temp_storage.ptr, temp_storage_bytes, ctx.faces_map.ptr, F+1 )); int new_num_faces; - CUDA_CHECK(cudaMemcpy(&new_num_faces, ctx.faces_map.ptr + F, 
sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(hipMemcpy(&new_num_faces, ctx.faces_map.ptr + F, sizeof(int), hipMemcpyDeviceToHost)); // compress faces ctx.temp_storage.resize(new_num_faces * sizeof(int3)); compress_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( @@ -515,7 +519,7 @@ void collapse_edges( F, reinterpret_cast(ctx.temp_storage.ptr) ); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipGetLastError()); swap_buffers(ctx.temp_storage, ctx.faces); } @@ -526,7 +530,7 @@ std::tuple CuMesh::simplify_step(float lambda_edge_length, float lambd if (timing) start = std::chrono::high_resolution_clock::now(); this->get_vertex_face_adjacency(); if (timing) { - CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(hipDeviceSynchronize()); end = std::chrono::high_resolution_clock::now(); std::cout << "get_vertex_face_adjacency: " << std::chrono::duration_cast(end - start).count() << " us" << std::endl; } @@ -535,7 +539,7 @@ std::tuple CuMesh::simplify_step(float lambda_edge_length, float lambd this->get_edges(); this->get_boundary_info(); if (timing) { - CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(hipDeviceSynchronize()); end = std::chrono::high_resolution_clock::now(); std::cout << "get_edges: " << std::chrono::duration_cast(end - start).count() << " us" << std::endl; } @@ -543,7 +547,7 @@ std::tuple CuMesh::simplify_step(float lambda_edge_length, float lambd if (timing) start = std::chrono::high_resolution_clock::now(); get_qem(*this); if (timing) { - CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(hipDeviceSynchronize()); end = std::chrono::high_resolution_clock::now(); std::cout << "get_qem: " << std::chrono::duration_cast(end - start).count() << " us" << std::endl; } @@ -551,7 +555,7 @@ std::tuple CuMesh::simplify_step(float lambda_edge_length, float lambd if (timing) start = std::chrono::high_resolution_clock::now(); get_edge_collapse_cost(*this, lambda_edge_length, lambda_skinny); if (timing) { - CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(hipDeviceSynchronize()); end = std::chrono::high_resolution_clock::now(); std::cout << "get_edge_collapse_cost: " << std::chrono::duration_cast(end - start).count() << " us" << std::endl; } @@ -559,7 +563,7 @@ std::tuple CuMesh::simplify_step(float lambda_edge_length, float lambd if (timing) start = std::chrono::high_resolution_clock::now(); propagate_cost(*this); if (timing) { - CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(hipDeviceSynchronize()); end = std::chrono::high_resolution_clock::now(); std::cout << "propagate_cost: " << std::chrono::duration_cast(end - start).count() << " us" << std::endl; } @@ -567,7 +571,7 @@ std::tuple CuMesh::simplify_step(float lambda_edge_length, float lambd if (timing) start = std::chrono::high_resolution_clock::now(); collapse_edges(*this, threshold); if (timing) { - CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(hipDeviceSynchronize()); end = std::chrono::high_resolution_clock::now(); std::cout << "collapse_edges: " << std::chrono::duration_cast(end - start).count() << " us" << std::endl; } diff --git a/src/simplify.hip b/src/simplify.hip new file mode 100644 index 0000000..3748bd6 --- /dev/null +++ b/src/simplify.hip @@ -0,0 +1,588 @@ +// !!! This is a file automatically generated by hipify!!! 
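+//
+// Cheapest-edge selection below packs (edge id, cost) into one uint64 --
+// float cost bits in the high word, edge index in the low word -- so a
+// single 64-bit atomicMin per face picks the lowest-cost incident edge.
+// This works because the costs are non-negative: the IEEE-754 bit patterns
+// of non-negative floats compare in the same order as the floats
+// themselves, hence the "_positive" suffix on the pack/unpack helpers.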
+#include "hip/hip_runtime.h" +#include "cumesh_hip.h" +#include "dtypes_hip.cuh" +#ifdef __HIP_PLATFORM_AMD__ +#include +#else +#include +#endif + + +namespace cumesh { + + +__device__ inline uint64_t pack_key_value_positive(int key, float value) { + unsigned int v = __float_as_uint(value); + return (static_cast(v) << 32) | + static_cast(key); +} + + +__device__ inline void unpack_key_value_positive(uint64_t key_value, int& key, float& value) { + key = static_cast(key_value & 0xffffffffu); + value = __uint_as_float(static_cast(key_value >> 32)); +} + + +/** + * Get the QEM for each vertex + * + * @param vertices: the vertices of the mesh, shape (V) + * @param faces: the faces of the mesh, shape (F) + * @param vert2face: the buffer for neighbor face ids, shape (total_neighbor_face_cnt) + * @param vert2face_offset: the buffer for neighbor face ids offset, shape (V+1) + * @param V: the number of vertices + * @param F: the number of faces + * @param qems: the buffer for QEMs, shape (V) + */ +static __global__ void get_qem_kernel( + const float3* vertices, + const int3* faces, + const int* vert2face, + const int* vert2face_offset, + const int V, + const int F, + QEM* qems +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= V) return; + + // compute QEM + QEM v_qem; + for (int f = vert2face_offset[tid]; f < vert2face_offset[tid+1]; f++) { + int3 f_vids = faces[vert2face[f]]; + Vec3f f_v0(vertices[f_vids.x]); + Vec3f e1(vertices[f_vids.y]); + Vec3f e2(vertices[f_vids.z]); + e1 -= f_v0; + e2 -= f_v0; + Vec3f n = e1.cross(e2); + n.normalize(); + float d = -(n.dot(f_v0)); + v_qem.add_plane({ n.x, n.y, n.z, d }); + } + qems[tid] = v_qem; +} + + +/** + * Get the QEM for each vertex + */ +void get_qem( + CuMesh& ctx +) { + size_t V = ctx.vertices.size; + size_t F = ctx.faces.size; + ctx.temp_storage.resize(V * sizeof(QEM)); + hipLaunchKernelGGL(( get_qem_kernel), dim3((V+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0, + ctx.vertices.ptr, + ctx.faces.ptr, + ctx.vert2face.ptr, + ctx.vert2face_offset.ptr, + V, F, + reinterpret_cast(ctx.temp_storage.ptr) + ); + CUDA_CHECK(hipGetLastError()); +} + + +inline __device__ bool process_incident_tri( + int tri_idx, + int collapse_keep_vert, // the vertex we keep (e0 or e1) + int collapse_other_vert, // the other one (the one removed) + const float3* vertices, + const int3* faces, + const Vec3f& v_new, // midpoint + float& skinny_cost, + int& num_tri +) { + const float EPS = 1e-12f; + int3 f_vids = faces[tri_idx]; + + // If this triangle contains the other vertex (the edge), it will be removed, skip it + if (f_vids.x == collapse_other_vert || f_vids.y == collapse_other_vert || f_vids.z == collapse_other_vert) + return true; // skip, not an error + + // get old positions + Vec3f a(vertices[f_vids.x]); + Vec3f b(vertices[f_vids.y]); + Vec3f c(vertices[f_vids.z]); + + // build new positions: replace occurrences of collapse_keep_vert with v_new + Vec3f na = (f_vids.x == collapse_keep_vert) ? v_new : a; + Vec3f nb = (f_vids.y == collapse_keep_vert) ? v_new : b; + Vec3f nc = (f_vids.z == collapse_keep_vert) ? 
v_new : c; + + // compute old edge vectors (for old normal) + Vec3f old_e1 = b - a; + Vec3f old_e2 = c - a; + Vec3f old_normal = old_e1.cross(old_e2); + float old_area = 0.5f * old_normal.norm(); + + // compute new edge vectors consistently: e1 = nb - na, e2 = nc - na + Vec3f new_e1 = nb - na; + Vec3f new_e2 = nc - na; + Vec3f new_normal = new_e1.cross(new_e2); + float new_area = 0.5f * new_normal.norm(); + + // check flipping + if (old_normal.dot(new_normal) < 0.0f) { + return false; // invalid (flipped) + } + + // compute side lengths squared for shape metric + Vec3f new_e0 = nc - nb; + float denom = new_e0.norm2() + new_e1.norm2() + new_e2.norm2(); + if (denom < EPS) denom = EPS; + float shapeMetric = 4.0f * sqrtf(3.0f) * new_area / denom; + float term = 1.0f - fminf(fmaxf(shapeMetric, 0.0f), 1.0f); + skinny_cost += term; + num_tri += 1; + return true; +} + + +/** + * Get the cost for each edge collapse + * + * @param vertices: the vertices of the mesh, shape (V) + * @param faces: the faces of the mesh, shape (F) + * @param vert2face: the buffer for neighbor face ids, shape (total_neighbor_face_cnt) + * @param vert2face_offset: the buffer for neighbor face ids offset, shape (V+1) + * @param edges: the buffer for edges, shape (E) + * @param vert_is_boundary: the buffer for boundary vertex indicator, shape (V) + * @param qems: the buffer for QEMs, shape (V) + * @param V: the number of vertices + * @param F: the number of faces + * @param E: the number of edges + * @param edge_collapse_costs: the buffer for edge collapse costs, shape (E) + */ +static __global__ void get_edge_collapse_cost_kernel( + const float3* vertices, + const int3* faces, + const int* vert2face, + const int* vert2face_offset, + const uint64_t* edges, + const uint8_t * vert_is_boundary, + const QEM* qems, + const int V, + const int F, + const int E, + const float lambda_edge_length, + const float lambda_skinny, + float* edge_collapse_costs +) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= E) return; + + // get edge + uint64_t e = edges[tid]; + int e0 = int(e >> 32); + int e1 = int(e & 0xFFFFFFFF); + + // get edge vertices + Vec3f v0(vertices[e0]); + Vec3f v1(vertices[e1]); + uint8_t v0_is_bound = vert_is_boundary[e0]; + uint8_t v1_is_bound = vert_is_boundary[e1]; + float w0 = 0.5; + if (v0_is_bound && !v1_is_bound) w0 = 1.0; + else if (!v0_is_bound && v1_is_bound) w0 = 0.0; + Vec3f v = v0 * w0 + v1 * (1.0f - w0); + + float cost = 0.0f; + + // QEM cost + QEM edge_qem = qems[e0] + qems[e1]; + float qem_cost = edge_qem.evaluate(v); + cost += qem_cost; + + // edge length cost + float edge_length2 = (v1 - v0).norm2(); + cost += lambda_edge_length * edge_length2; + + // skinny cost + float skinny_cost = 0.0f; + int num_tri = 0; + for (int f = vert2face_offset[e0]; f < vert2face_offset[e0+1]; f++) { + int tri_idx = vert2face[f]; + if (!process_incident_tri(tri_idx, e0, e1, vertices, faces, v, skinny_cost, num_tri)) { + edge_collapse_costs[tid] = INFINITY; + return; + } + } + for (int f = vert2face_offset[e1]; f < vert2face_offset[e1+1]; f++) { + int tri_idx = vert2face[f]; + if (!process_incident_tri(tri_idx, e1, e0, vertices, faces, v, skinny_cost, num_tri)) { + edge_collapse_costs[tid] = INFINITY; + return; + } + } + if (num_tri > 0) { + skinny_cost /= num_tri; + } + cost += lambda_skinny * skinny_cost * edge_length2; + + edge_collapse_costs[tid] = cost; +} + + +/** + * Get the cost for each edge collapse + */ +void get_edge_collapse_cost( + CuMesh& ctx, + float lambda_edge_length, + float 
+
+
+/**
+ * Get the cost for each edge collapse
+ */
+void get_edge_collapse_cost(
+    CuMesh& ctx,
+    float lambda_edge_length,
+    float lambda_skinny
+) {
+    size_t V = ctx.vertices.size;
+    size_t F = ctx.faces.size;
+    size_t E = ctx.edges.size;
+    ctx.edge_collapse_costs.resize(E);
+    hipLaunchKernelGGL(( get_edge_collapse_cost_kernel), dim3((E+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0,
+        ctx.vertices.ptr,
+        ctx.faces.ptr,
+        ctx.vert2face.ptr,
+        ctx.vert2face_offset.ptr,
+        ctx.edges.ptr,
+        ctx.vert_is_boundary.ptr,
+        reinterpret_cast<QEM*>(ctx.temp_storage.ptr),
+        V, F, E,
+        lambda_edge_length, lambda_skinny,
+        ctx.edge_collapse_costs.ptr
+    );
+    CUDA_CHECK(hipGetLastError());
+}
+
+
+/**
+ * Propagate cost to neighboring faces
+ *
+ * @param edges: the buffer for edges, shape (E)
+ * @param vert2face: the buffer for neighboring face ids, shape (total_neighbor_face_cnt)
+ * @param vert2face_offset: the buffer for neighboring face ids offset, shape (V+1)
+ * @param edge_collapse_costs: the buffer for edge collapse costs, shape (E)
+ * @param V: the number of vertices
+ * @param F: the number of faces
+ * @param E: the number of edges
+ * @param propagated_costs: the buffer for edge collapse costs propagated, shape (F)
+ */
+static __global__ void propagate_cost_kernel(
+    const uint64_t* edges,
+    const int* vert2face,
+    const int* vert2face_offset,
+    const float* edge_collapse_costs,
+    const int V,
+    const int F,
+    const int E,
+    uint64_t* propagated_costs
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= E) return;
+
+    // get edge
+    uint64_t e = edges[tid];
+    int e0 = int(e >> 32);
+    int e1 = int(e & 0xFFFFFFFF);
+
+    uint64_t cost = pack_key_value_positive(tid, edge_collapse_costs[tid]);
+
+    // propagate cost to neighboring faces
+    for (int f = vert2face_offset[e0]; f < vert2face_offset[e0+1]; f++) {
+        atomicMin(reinterpret_cast<unsigned long long*>(&propagated_costs[vert2face[f]]), static_cast<unsigned long long>(cost));
+    }
+    for (int f = vert2face_offset[e1]; f < vert2face_offset[e1+1]; f++) {
+        atomicMin(reinterpret_cast<unsigned long long*>(&propagated_costs[vert2face[f]]), static_cast<unsigned long long>(cost));
+    }
+}
+
+
+/**
+ * Propagate cost to neighboring faces
+ */
+void propagate_cost(
+    CuMesh& ctx
+) {
+    size_t V = ctx.vertices.size;
+    size_t F = ctx.faces.size;
+    size_t E = ctx.edges.size;
+    ctx.propagated_costs.resize(F);
+    ctx.propagated_costs.fill(std::numeric_limits<uint64_t>::max());
+    hipLaunchKernelGGL(( propagate_cost_kernel), dim3((E+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0,
+        ctx.edges.ptr,
+        ctx.vert2face.ptr,
+        ctx.vert2face_offset.ptr,
+        ctx.edge_collapse_costs.ptr,
+        V, F, E,
+        ctx.propagated_costs.ptr
+    );
+    CUDA_CHECK(hipGetLastError());
+}
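+
+
+// Why pack-then-atomicMin works (illustrative): for finite non-negative floats
+// the IEEE-754 bit pattern is monotonically increasing, so comparing the packed
+// integers (value << 32 | key) compares costs first and breaks ties by edge id.
+// E.g. __float_as_uint(0.25f) = 0x3e800000 < __float_as_uint(0.5f) = 0x3f000000,
+// hence pack(7, 0.25f) < pack(3, 0.5f). Each face therefore ends up holding the
+// id of its cheapest incident edge, which makes the parallel collapses below
+// conflict-free.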
+
+
+/**
+ * Collapse edges in parallel
+ *
+ * @param vertices: the vertices of the mesh, shape (V)
+ * @param faces: the faces of the mesh, shape (F)
+ * @param edges: the buffer for edges, shape (E)
+ * @param vert2face: the buffer for neighboring face ids, shape (total_neighbor_face_cnt)
+ * @param vert2face_offset: the buffer for neighboring face ids offset, shape (V+1)
+ * @param edge_collapse_costs: the buffer for edge collapse costs, shape (E)
+ * @param propagated_costs: the buffer for edge collapse costs propagated, shape (F)
+ * @param vert_is_boundary: the buffer for boundary vertex indicator, shape (V)
+ * @param V: the number of vertices
+ * @param F: the number of faces
+ * @param E: the number of edges
+ * @param collapse_thresh: the threshold for cost collapse
+ * @param vertices_kept: the flag for vertices kept, shape (V)
+ * @param faces_kept: the flag for faces kept, shape (F)
+ */
+static __global__ void collapse_edges_kernel(
+    float3* vertices,
+    int3* faces,
+    uint64_t* edges,
+    const int* vert2face,
+    const int* vert2face_offset,
+    const float* edge_collapse_costs,
+    const uint64_t* propagated_costs,
+    const uint8_t* vert_is_boundary,
+    const int V,
+    const int F,
+    const int E,
+    const float collapse_thresh,
+    int* vertices_kept,
+    int* faces_kept
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= E) return;
+
+    float cost = edge_collapse_costs[tid];
+    if (cost > collapse_thresh) return;
+
+    // get edge
+    uint64_t e = edges[tid];
+    int e0 = int(e >> 32);
+    int e1 = int(e & 0xFFFFFFFF);
+    uint64_t pack = pack_key_value_positive(tid, cost);
+
+    for (int f = vert2face_offset[e0]; f < vert2face_offset[e0+1]; f++) {
+        if (propagated_costs[vert2face[f]] != pack) return;
+    }
+    for (int f = vert2face_offset[e1]; f < vert2face_offset[e1+1]; f++) {
+        if (propagated_costs[vert2face[f]] != pack) return;
+    }
+
+    // collapse edge
+    Vec3f v0(vertices[e0]);
+    Vec3f v1(vertices[e1]);
+    uint8_t v0_is_bound = vert_is_boundary[e0];
+    uint8_t v1_is_bound = vert_is_boundary[e1];
+    float w0 = 0.5f;
+    if (v0_is_bound && !v1_is_bound) w0 = 1.0f;
+    else if (!v0_is_bound && v1_is_bound) w0 = 0.0f;
+    Vec3f v_new = v0 * w0 + v1 * (1.0f - w0);
+    vertices[e0] = { v_new.x, v_new.y, v_new.z };
+    vertices_kept[e1] = 0;
+    // delete shared faces
+    for (int f = vert2face_offset[e0]; f < vert2face_offset[e0+1]; f++) {
+        int fid = vert2face[f];
+        int3 f_vids = faces[fid];
+        if (f_vids.x == e1 || f_vids.y == e1 || f_vids.z == e1) {
+            faces_kept[fid] = 0;
+        }
+    }
+    // update faces
+    for (int f = vert2face_offset[e1]; f < vert2face_offset[e1+1]; f++) {
+        int fid = vert2face[f];
+        int3 f_vids = faces[fid];
+        if (f_vids.x == e1) {
+            f_vids.x = e0;
+        } else if (f_vids.y == e1) {
+            f_vids.y = e0;
+        } else if (f_vids.z == e1) {
+            f_vids.z = e0;
+        }
+        faces[fid] = f_vids;
+    }
+}
+
+
+static __global__ void compress_vertices_kernel(
+    const int* vertices_map,
+    const float3* old_vertices,
+    const int V,
+    float3* new_vertices
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= V) return;
+    int new_id = vertices_map[tid];
+    int is_kept = vertices_map[tid + 1] == new_id + 1;
+    if (is_kept) {
+        new_vertices[new_id] = old_vertices[tid];
+    }
+}
+
+
+static __global__ void compress_faces_kernel(
+    const int* faces_map,
+    const int* vertices_map,
+    const int3* old_faces,
+    const int F,
+    int3* new_faces
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= F) return;
+    int new_id = faces_map[tid];
+    int is_kept = faces_map[tid + 1] == new_id + 1;
+    if (is_kept) {
+        new_faces[new_id].x = vertices_map[old_faces[tid].x];
+        new_faces[new_id].y = vertices_map[old_faces[tid].y];
+        new_faces[new_id].z = vertices_map[old_faces[tid].z];
+    }
+}
+
+
+/**
+ * Collapse edges in parallel
+ */
+void collapse_edges(
+    CuMesh& ctx,
+    float collapse_thresh
+) {
+    size_t V = ctx.vertices.size;
+    size_t F = ctx.faces.size;
+    size_t E = ctx.edges.size;
+    ctx.vertices_map.resize(V + 1);
+    ctx.faces_map.resize(F + 1);
+    ctx.vertices_map.fill(1);
+    ctx.faces_map.fill(1);
+    hipLaunchKernelGGL(( collapse_edges_kernel), dim3((E+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0,
+        ctx.vertices.ptr,
+        ctx.faces.ptr,
+        ctx.edges.ptr,
+        ctx.vert2face.ptr,
+        ctx.vert2face_offset.ptr,
+        ctx.edge_collapse_costs.ptr,
+        ctx.propagated_costs.ptr,
+        ctx.vert_is_boundary.ptr,
+        V, F, E,
+        collapse_thresh,
+        ctx.vertices_map.ptr,
+        ctx.faces_map.ptr
+    );
+    CUDA_CHECK(hipGetLastError());
+
+    // update vertices buffer
+    // get vertices map
+    size_t temp_storage_bytes = 0;
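+    // Compaction sketch (illustrative): vertices_map holds keep flags such as
+    // [1,0,1,1] plus one padding slot, so [1,0,1,1,1]; after ExclusiveSum it
+    // becomes [0,1,1,2,3], so kept vertex i moves to slot vertices_map[i] and
+    // the last element is the new vertex count.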
+    CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum(
+        nullptr, temp_storage_bytes,
+        ctx.vertices_map.ptr, V+1
+    ));
+    ctx.cub_temp_storage.resize(temp_storage_bytes);
+    CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum(
+        ctx.cub_temp_storage.ptr, temp_storage_bytes,
+        ctx.vertices_map.ptr, V+1
+    ));
+    int new_num_vertices;
+    CUDA_CHECK(hipMemcpy(&new_num_vertices, ctx.vertices_map.ptr + V, sizeof(int), hipMemcpyDeviceToHost));
+    // compress vertices
+    ctx.temp_storage.resize(new_num_vertices * sizeof(float3));
+    hipLaunchKernelGGL(( compress_vertices_kernel), dim3((V+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0,
+        ctx.vertices_map.ptr,
+        ctx.vertices.ptr,
+        V,
+        reinterpret_cast<float3*>(ctx.temp_storage.ptr)
+    );
+    CUDA_CHECK(hipGetLastError());
+    swap_buffers(ctx.temp_storage, ctx.vertices);
+
+    // update faces buffer
+    // get faces map
+    CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum(
+        nullptr, temp_storage_bytes,
+        ctx.faces_map.ptr, F+1
+    ));
+    ctx.cub_temp_storage.resize(temp_storage_bytes);
+    CUDA_CHECK(hipcub::DeviceScan::ExclusiveSum(
+        ctx.cub_temp_storage.ptr, temp_storage_bytes,
+        ctx.faces_map.ptr, F+1
+    ));
+    int new_num_faces;
+    CUDA_CHECK(hipMemcpy(&new_num_faces, ctx.faces_map.ptr + F, sizeof(int), hipMemcpyDeviceToHost));
+    // compress faces
+    ctx.temp_storage.resize(new_num_faces * sizeof(int3));
+    hipLaunchKernelGGL(( compress_faces_kernel), dim3((F+BLOCK_SIZE-1)/BLOCK_SIZE), dim3(BLOCK_SIZE), 0, 0,
+        ctx.faces_map.ptr,
+        ctx.vertices_map.ptr,
+        ctx.faces.ptr,
+        F,
+        reinterpret_cast<int3*>(ctx.temp_storage.ptr)
+    );
+    CUDA_CHECK(hipGetLastError());
+    swap_buffers(ctx.temp_storage, ctx.faces);
+}
+
+
+std::tuple<size_t, size_t> CuMesh::simplify_step(float lambda_edge_length, float lambda_skinny, float threshold, bool timing) {
+    std::chrono::high_resolution_clock::time_point start, end;
+
+    if (timing) start = std::chrono::high_resolution_clock::now();
+    this->get_vertex_face_adjacency();
+    if (timing) {
+        CUDA_CHECK(hipDeviceSynchronize());
+        end = std::chrono::high_resolution_clock::now();
+        std::cout << "get_vertex_face_adjacency: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us" << std::endl;
+    }
+
+    if (timing) start = std::chrono::high_resolution_clock::now();
+    this->get_edges();
+    this->get_boundary_info();
+    if (timing) {
+        CUDA_CHECK(hipDeviceSynchronize());
+        end = std::chrono::high_resolution_clock::now();
+        std::cout << "get_edges: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us" << std::endl;
+    }
+
+    if (timing) start = std::chrono::high_resolution_clock::now();
+    get_qem(*this);
+    if (timing) {
+        CUDA_CHECK(hipDeviceSynchronize());
+        end = std::chrono::high_resolution_clock::now();
+        std::cout << "get_qem: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us" << std::endl;
+    }
+
+    if (timing) start = std::chrono::high_resolution_clock::now();
+    get_edge_collapse_cost(*this, lambda_edge_length, lambda_skinny);
+    if (timing) {
+        CUDA_CHECK(hipDeviceSynchronize());
+        end = std::chrono::high_resolution_clock::now();
+        std::cout << "get_edge_collapse_cost: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us" << std::endl;
+    }
+
+    if (timing) start = std::chrono::high_resolution_clock::now();
+    propagate_cost(*this);
+    if (timing) {
+        CUDA_CHECK(hipDeviceSynchronize());
+        end = std::chrono::high_resolution_clock::now();
+        std::cout << "propagate_cost: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us" << std::endl;
+    }
+
+    if (timing) start = std::chrono::high_resolution_clock::now();
+    collapse_edges(*this, threshold);
+    if (timing) {
+        CUDA_CHECK(hipDeviceSynchronize());
+        end = std::chrono::high_resolution_clock::now();
+        std::cout << "collapse_edges: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us" << std::endl;
+    }
+
+    // Delete all cached info since mesh has changed
+    this->clear_cache();
+
+    return std::make_tuple(this->vertices.size, this->faces.size);
+}
+
+
+} // namespace cumesh
diff --git a/src/utils.h b/src/utils.h
index f15823b..8757bba 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -1,21 +1,25 @@
 #pragma once
 
 #include <torch/extension.h>
+#ifdef __HIP_PLATFORM_AMD__
+#include <hip/hip_runtime.h>
+#else
 #include <cuda.h>
 #include <cuda_runtime.h>
+#endif
 #include <vector>
 
 #define CUDA_CHECK(call) \
 do { \
-    const cudaError_t error_code = call; \
-    if (error_code != cudaSuccess) { \
+    const hipError_t error_code = call; \
+    if (error_code != hipSuccess) { \
         TORCH_CHECK(false, \
-            "[CuMesh] CUDA error:\n", \
+            "[CuMesh] HIP error:\n", \
             " File: ", __FILE__, "\n", \
             " Line: ", __LINE__, "\n", \
             " Error code: ", error_code, "\n", \
             " Error text: ", \
-            cudaGetErrorString(error_code), "\n"); \
+            hipGetErrorString(error_code), "\n"); \
     } \
 } while (0)
@@ -39,11 +43,11 @@ struct Buffer {
 
     void init(size_t capacity) {
         this->capacity = capacity;
-        CUDA_CHECK(cudaMalloc(&ptr, capacity * sizeof(T)));
+        CUDA_CHECK(hipMalloc(&ptr, capacity * sizeof(T)));
     }
 
     void free() {
-        if (ptr != nullptr) CUDA_CHECK(cudaFree(ptr));
+        if (ptr != nullptr) CUDA_CHECK(hipFree(ptr));
         ptr = nullptr;
         size = 0;
         capacity = 0;
@@ -61,9 +65,9 @@ struct Buffer {
         size_t new_size = size + this->size;
         if (new_size > capacity) {
             T* new_ptr;
-            CUDA_CHECK(cudaMalloc(&new_ptr, new_size * sizeof(T)));
-            CUDA_CHECK(cudaMemcpy(new_ptr, ptr, this->size * sizeof(T), cudaMemcpyDeviceToDevice));
-            CUDA_CHECK(cudaFree(ptr));
+            CUDA_CHECK(hipMalloc(&new_ptr, new_size * sizeof(T)));
+            CUDA_CHECK(hipMemcpy(new_ptr, ptr, this->size * sizeof(T), hipMemcpyDeviceToDevice));
+            CUDA_CHECK(hipFree(ptr));
             ptr = new_ptr;
             this->capacity = new_size;
         }
@@ -71,12 +75,12 @@ struct Buffer {
     }
 
     void zero() {
-        CUDA_CHECK(cudaMemset(ptr, 0, size * sizeof(T)));
+        CUDA_CHECK(hipMemset(ptr, 0, size * sizeof(T)));
     }
 
     void fill(T val) {
         std::vector<T> tmp(size, val);
-        CUDA_CHECK(cudaMemcpy(ptr, tmp.data(), size * sizeof(T), cudaMemcpyHostToDevice));
+        CUDA_CHECK(hipMemcpy(ptr, tmp.data(), size * sizeof(T), hipMemcpyHostToDevice));
     }
 };
 
diff --git a/src/utils_hip.h b/src/utils_hip.h
new file mode 100644
index 0000000..e5ed6b8
--- /dev/null
+++ b/src/utils_hip.h
@@ -0,0 +1,110 @@
+// !!! This is a file automatically generated by hipify!!!
+#pragma once
+
+#include <torch/extension.h>
+#ifdef __HIP_PLATFORM_AMD__
+#include <hip/hip_runtime.h>
+#else
+#include <cuda.h>
+#include <cuda_runtime.h>
+#endif
+#include <vector>
+
+#define CUDA_CHECK(call) \
+do { \
+    const hipError_t error_code = call; \
+    if (error_code != hipSuccess) { \
+        TORCH_CHECK(false, \
+            "[CuMesh] HIP error:\n", \
+            " File: ", __FILE__, "\n", \
+            " Line: ", __LINE__, "\n", \
+            " Error code: ", error_code, "\n", \
+            " Error text: ", \
+            hipGetErrorString(error_code), "\n"); \
+    } \
+} while (0)
+
+namespace cumesh {
+
+
+/**
+ * A GPU buffer class that manages device memory.
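+ *
+ * Note (added for clarity): resize() only reallocates when the requested size
+ * exceeds the current capacity, and reallocation does not preserve contents;
+ * extend() is the growth path that copies the old data over.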
+ */
+template <typename T>
+struct Buffer {
+    T* ptr;
+    size_t size;
+    size_t capacity;
+
+    Buffer() : ptr(nullptr), size(0), capacity(0) {}
+
+    bool is_empty() const {
+        return size == 0;
+    }
+
+    void init(size_t capacity) {
+        this->capacity = capacity;
+        CUDA_CHECK(hipMalloc(&ptr, capacity * sizeof(T)));
+    }
+
+    void free() {
+        if (ptr != nullptr) CUDA_CHECK(hipFree(ptr));
+        ptr = nullptr;
+        size = 0;
+        capacity = 0;
+    }
+
+    void resize(size_t size) {
+        if (size > capacity) {
+            free();
+            init(size);
+        }
+        this->size = size;
+    }
+
+    void extend(size_t size) {
+        size_t new_size = size + this->size;
+        if (new_size > capacity) {
+            T* new_ptr;
+            CUDA_CHECK(hipMalloc(&new_ptr, new_size * sizeof(T)));
+            CUDA_CHECK(hipMemcpy(new_ptr, ptr, this->size * sizeof(T), hipMemcpyDeviceToDevice));
+            CUDA_CHECK(hipFree(ptr));
+            ptr = new_ptr;
+            this->capacity = new_size;
+        }
+        this->size = new_size;
+    }
+
+    void zero() {
+        CUDA_CHECK(hipMemset(ptr, 0, size * sizeof(T)));
+    }
+
+    void fill(T val) {
+        std::vector<T> tmp(size, val);
+        CUDA_CHECK(hipMemcpy(ptr, tmp.data(), size * sizeof(T), hipMemcpyHostToDevice));
+    }
+};
+
+
+/**
+ * Swap the contents of two buffers.
+ */
+template <typename T1, typename T2>
+void swap_buffers(Buffer<T1>& b1, Buffer<T2>& b2) {
+    void* b1_ptr = reinterpret_cast<void*>(b1.ptr);
+    void* b2_ptr = reinterpret_cast<void*>(b2.ptr);
+    size_t b1_capacity_bytes = b1.capacity * sizeof(T1);
+    size_t b2_capacity_bytes = b2.capacity * sizeof(T2);
+    size_t b1_size_bytes = b1.size * sizeof(T1);
+    size_t b2_size_bytes = b2.size * sizeof(T2);
+
+    b1.ptr = reinterpret_cast<T1*>(b2_ptr);
+    b2.ptr = reinterpret_cast<T2*>(b1_ptr);
+    b1.capacity = b2_capacity_bytes / sizeof(T1);
+    b2.capacity = b1_capacity_bytes / sizeof(T2);
+    b1.size = b2_size_bytes / sizeof(T1);
+    b2.size = b1_size_bytes / sizeof(T2);
+}
+
+
+} // namespace cumesh